diff --git a/.github/actions/openvino_provider/action.yml b/.github/actions/openvino_provider/action.yml index dd1078bb0d4353..a17986f35d3723 100644 --- a/.github/actions/openvino_provider/action.yml +++ b/.github/actions/openvino_provider/action.yml @@ -177,7 +177,7 @@ runs: else ov_package_url=$(curl -s ${{ inputs.nightly_package_source }} | jq -r '.${{ inputs.platform }}_${{ inputs.arch }}') fi - cd ${{ inputs.install_dir || env.GITHUB_WORKSPACE }} + cd ${{ inputs.install_dir || github.workspace }} package_basename=$(basename $ov_package_url) wget $ov_package_url --progress=bar:force:noscroll -O $package_basename package_folder=${package_basename%.*} @@ -196,7 +196,7 @@ runs: uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874 # v4.4.0 with: name: ${{ steps.openvino_s3_download.outputs.ov_artifact_name }} - path: ${{ steps.openvino_s3_download.outputs.ov_package_path }} + path: ${{ github.workspace }}/${{ steps.openvino_s3_download.outputs.ov_package_path }} if-no-files-found: 'error' - name: Get wheel diff --git a/.github/actions/openvino_provider/get_s3_package.py b/.github/actions/openvino_provider/get_s3_package.py index df253a422421ec..02ea99cb2f3403 100644 --- a/.github/actions/openvino_provider/get_s3_package.py +++ b/.github/actions/openvino_provider/get_s3_package.py @@ -54,6 +54,10 @@ def main(product, version_pattern, platform, arch, folder): matching_files = filter_files_by_criteria(all_files, product, version_pattern, platform, arch, folder) if matching_files: logger.info(f"Matching packages: {sorted(matching_files)}") + if len(matching_files) > 1: + custom_release_build_pattern = fr".*/{version_pattern}/(linux_|windows_|macos_).*/.*" + # Exclude custom release builds, if any, from matches + matching_files = [file for file in matching_files if not re.search(custom_release_build_pattern, file)] package_url = f"https://storage.openvinotoolkit.org{sorted(matching_files)[-1]}" logger.info(f"Returning package URL: {package_url}") action_utils.set_github_output("package_url", package_url) diff --git a/.github/components.yml b/.github/components.yml index 8de51a2ced3343..31952e2b87c114 100644 --- a/.github/components.yml +++ b/.github/components.yml @@ -149,6 +149,7 @@ PyTorch_FE: build: - CPU - Python_API + - TOKENIZERS # PyTorch_FE tests depend on tokenizers build JAX_FE: revalidate: @@ -174,6 +175,7 @@ Python_API: - OVC - tools - TF_FE + - docs_snippets build: - CPU - HETERO @@ -242,6 +244,12 @@ tools: docs: revalidate: [] build: [] + +docs_snippets: + revalidate: + - docs_snippets + build: + - Python_API licensing: revalidate: [] diff --git a/.github/dependabot.yml b/.github/dependabot.yml index 359ff683c9b22a..5eac7709e32703 100644 --- a/.github/dependabot.yml +++ b/.github/dependabot.yml @@ -141,3 +141,18 @@ updates: - "mryzhov" - "ilya-lavrenov" open-pull-requests-limit: 3 + + # Docker images + - package-ecosystem: docker + directory: "/" + schedule: + interval: "daily" + time: "09:00" + timezone: "Asia/Dubai" + allow: + - dependency-type: "direct" + assignees: + - "akashchi" + - "mryzhov" + - "ilya-lavrenov" + open-pull-requests-limit: 3 \ No newline at end of file diff --git a/.github/dockerfiles/docker_tag b/.github/dockerfiles/docker_tag index 3783a7e8d5600a..bcfa07fb5c24b3 100644 --- a/.github/dockerfiles/docker_tag +++ b/.github/dockerfiles/docker_tag @@ -1 +1 @@ -pr-27430 +pr-27597 diff --git a/.github/dockerfiles/ov_test/debian_10_py310/Dockerfile b/.github/dockerfiles/ov_test/debian_10_py310/Dockerfile new file mode 100644 index 
00000000000000..e7dbadf5a414ba --- /dev/null +++ b/.github/dockerfiles/ov_test/debian_10_py310/Dockerfile @@ -0,0 +1,76 @@ +ARG REGISTRY="docker.io" +FROM ${REGISTRY}/library/debian:10.13 + +USER root + +# APT configuration +RUN echo 'Acquire::Retries "10";' > /etc/apt/apt.conf && \ + echo 'APT::Get::Assume-Yes "true";' >> /etc/apt/apt.conf && \ + echo 'APT::Get::Fix-Broken "true";' >> /etc/apt/apt.conf && \ + echo 'APT::Get::no-install-recommends "true";' >> /etc/apt/apt.conf + +ENV DEBIAN_FRONTEND="noninteractive" \ + TZ="Europe/London" + +RUN apt-get update && \ + apt-get install \ + git \ + libc6-dev \ + # parallel gzip + pigz \ + # Python + python3 \ + python3-pip \ + python3-dev \ + python3-venv \ + python3-distutils \ + # To build Python 3.10 from source + build-essential \ + libffi-dev \ + libgdbm-dev \ + libc6-dev \ + libssl-dev \ + zlib1g-dev \ + libbz2-dev \ + libreadline-dev \ + libsqlite3-dev \ + libncurses5-dev \ + libncursesw5-dev \ + xz-utils \ + tk-dev \ + libxml2-dev \ + libxmlsec1-dev \ + liblzma-dev \ + wget \ + curl \ + && \ + rm -rf /var/lib/apt/lists/* + +# Install openvino dependencies +ADD scripts/install_dependencies/install_openvino_dependencies.sh /install_openvino_dependencies.sh +RUN chmod +x /install_openvino_dependencies.sh && \ + /install_openvino_dependencies.sh && \ + rm -rf /var/lib/apt/lists/* + +# Setup Python 3.10 +RUN wget https://www.python.org/ftp/python/3.10.9/Python-3.10.9.tar.xz + +RUN tar -xf Python-3.10.9.tar.xz && \ + cd Python-3.10.9 && \ + ./configure --enable-optimizations && \ + make -j 8 && \ + make altinstall + +# Setup pip +ENV PIP_VERSION="24.0" +RUN curl https://bootstrap.pypa.io/get-pip.py -o get-pip.py && \ + python3.10 get-pip.py --no-cache-dir pip==${PIP_VERSION} && \ + rm -f get-pip.py + +# Use Python 3.10 as default instead of Python 3.7 +# Using venv here because other methods to switch the default Python on Debian 10 break both system and wheels build +RUN python3.10 -m venv venv +ENV PATH="/venv/bin:$PATH" + +ENV PIP_CACHE_DIR=/mount/caches/pip/linux/${PIP_VERSION} +ENV PIP_INSTALL_PATH=/venv/lib/python3.10/site-packages diff --git a/.github/dockerfiles/ov_test/fedora_33/Dockerfile b/.github/dockerfiles/ov_test/fedora_33/Dockerfile index c059c82c7d3cf2..6e0fcc7d35156b 100644 --- a/.github/dockerfiles/ov_test/fedora_33/Dockerfile +++ b/.github/dockerfiles/ov_test/fedora_33/Dockerfile @@ -6,7 +6,13 @@ USER root RUN yum update -y && yum install -y \ git \ curl \ - python3 + python3 \ + findutils \ + ocl-icd \ + ocl-icd-devel \ + # parallel gzip + pigz \ + xz # Install Node ENV NODE_VERSION=21.7.3 diff --git a/.github/dockerfiles/ov_test/ubuntu_20_04_x64_py313/Dockerfile b/.github/dockerfiles/ov_test/ubuntu_20_04_x64_py313/Dockerfile new file mode 100644 index 00000000000000..b6b99f81305dee --- /dev/null +++ b/.github/dockerfiles/ov_test/ubuntu_20_04_x64_py313/Dockerfile @@ -0,0 +1,52 @@ +ARG REGISTRY="docker.io" +FROM ${REGISTRY}/library/ubuntu:20.04 + +USER root + +# APT configuration +RUN echo 'Acquire::Retries "10";' > /etc/apt/apt.conf && \ + echo 'APT::Get::Assume-Yes "true";' >> /etc/apt/apt.conf && \ + echo 'APT::Get::Fix-Broken "true";' >> /etc/apt/apt.conf && \ + echo 'APT::Get::no-install-recommends "true";' >> /etc/apt/apt.conf + +ENV DEBIAN_FRONTEND="noninteractive" \ + TZ="Europe/London" + +RUN apt-get update && \ + apt-get install software-properties-common && \ + add-apt-repository --yes --no-update ppa:git-core/ppa && \ + add-apt-repository --yes --no-update ppa:deadsnakes/ppa && \ + apt-get update && \ + apt-get 
install \ + curl \ + git \ + gpg-agent \ + tzdata \ + # parallel gzip + pigz \ + # Python + python3.13-dev \ + python3.13-venv \ + && \ + rm -rf /var/lib/apt/lists/* + +# Install openvino dependencies +ADD scripts/install_dependencies/install_openvino_dependencies.sh /install_openvino_dependencies.sh +RUN chmod +x /install_openvino_dependencies.sh && \ + /install_openvino_dependencies.sh && \ + rm -rf /var/lib/apt/lists/* + +# Setup pip +ENV PIP_VERSION="24.0" +RUN curl https://bootstrap.pypa.io/get-pip.py -o get-pip.py && \ + python3 get-pip.py --no-cache-dir pip==${PIP_VERSION} && \ + python3.13 get-pip.py --no-cache-dir pip==${PIP_VERSION} && \ + rm -f get-pip.py + +# Use Python 3.13 as default instead of Python 3.8 +# Using venv here 'cause other methods to switch the default Python on Ubuntu 20 break both system and wheels build +RUN python3.13 -m venv venv +ENV PATH="/venv/bin:$PATH" + +ENV PIP_CACHE_DIR=/mount/caches/pip/linux/${PIP_VERSION} +ENV PIP_INSTALL_PATH=/venv/lib/python3.13/site-packages diff --git a/.github/labeler.yml b/.github/labeler.yml index e9b2acb26c9072..cb05d3dea36960 100644 --- a/.github/labeler.yml +++ b/.github/labeler.yml @@ -57,10 +57,12 @@ 'category: docs': - '**/*.md' - any: ['docs/**/*', - '!docs/snippets/**/*'] + '!docs/snippets/**/*', + '!docs/articles_en/assets/snippets/**/*'] 'category: docs_snippets': - 'docs/snippets/**/*' +- 'docs/articles_en/assets/snippets/**/*' 'category: extensions': - 'src/core/include/openvino/core/extension.hpp' diff --git a/.github/scripts/workflow_rerun/errors_to_look_for.json b/.github/scripts/workflow_rerun/errors_to_look_for.json index b1074fa43e7200..ad771e9d51f75d 100644 --- a/.github/scripts/workflow_rerun/errors_to_look_for.json +++ b/.github/scripts/workflow_rerun/errors_to_look_for.json @@ -8,7 +8,7 @@ "ticket": 135715 }, { - "error_text": "GnuTLS recv error", + "error_text": "error: RPC failed", "ticket": 131918 }, { diff --git a/.github/workflows/android_arm64.yml b/.github/workflows/android_arm64.yml index e0954871f4b51e..b760d9746d7842 100644 --- a/.github/workflows/android_arm64.yml +++ b/.github/workflows/android_arm64.yml @@ -25,6 +25,7 @@ jobs: steps: - name: checkout action uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + timeout-minutes: 15 with: sparse-checkout: .github/actions/smart-ci @@ -54,6 +55,7 @@ jobs: steps: - name: Checkout uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + timeout-minutes: 15 - uses: ./.github/actions/handle_docker id: handle_docker @@ -99,6 +101,7 @@ jobs: steps: - name: Clone OpenVINO uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + timeout-minutes: 15 with: path: 'openvino' @@ -117,6 +120,7 @@ jobs: - name: Clone vcpkg uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + timeout-minutes: 15 with: repository: 'microsoft/vcpkg' ref: ${{ env.VCPKG_VERSION }} diff --git a/.github/workflows/android_x64.yml b/.github/workflows/android_x64.yml index b0b46c662abdbb..efd14541010730 100644 --- a/.github/workflows/android_x64.yml +++ b/.github/workflows/android_x64.yml @@ -28,6 +28,7 @@ jobs: steps: - name: checkout action uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + timeout-minutes: 15 with: sparse-checkout: .github/actions/smart-ci @@ -57,6 +58,7 @@ jobs: steps: - name: Checkout uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + timeout-minutes: 15 - uses: ./.github/actions/handle_docker id: handle_docker @@ -98,12 +100,14 @@ jobs: 
steps: - name: Clone OpenVINO uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + timeout-minutes: 15 with: path: 'openvino' submodules: 'true' - name: Clone OpenVINO GenAI uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + timeout-minutes: 15 with: repository: 'openvinotoolkit/openvino.genai' path: ${{ env.OPENVINO_GENAI_REPO }} diff --git a/.github/workflows/build_doc.yml b/.github/workflows/build_doc.yml index 8c78375e61769c..c0dac9816598e1 100644 --- a/.github/workflows/build_doc.yml +++ b/.github/workflows/build_doc.yml @@ -19,6 +19,7 @@ jobs: steps: - name: Clone OpenVINO uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + timeout-minutes: 15 with: submodules: 'true' lfs: 'true' diff --git a/.github/workflows/check_pr_commits.yml b/.github/workflows/check_pr_commits.yml index f7f66be299876c..91d6a2a497a8cd 100644 --- a/.github/workflows/check_pr_commits.yml +++ b/.github/workflows/check_pr_commits.yml @@ -10,6 +10,7 @@ jobs: steps: - name: Clone OpenVINO uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + timeout-minutes: 15 - name: Install dependencies run: python3 -m pip install -r ./.github/github_org_control/requirements.txt diff --git a/.github/workflows/cleanup_caches.yml b/.github/workflows/cleanup_caches.yml index 3fc69b21374093..d6633fd9dab3ee 100644 --- a/.github/workflows/cleanup_caches.yml +++ b/.github/workflows/cleanup_caches.yml @@ -49,6 +49,7 @@ jobs: steps: - name: Checkout cach action uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + timeout-minutes: 15 with: sparse-checkout: .github/actions/cache @@ -71,6 +72,7 @@ jobs: steps: - name: Checkout cach action uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + timeout-minutes: 15 with: sparse-checkout: .github/actions/cache diff --git a/.github/workflows/code_snippets.yml b/.github/workflows/code_snippets.yml index 9337fdff4b2905..5916f91447abc9 100644 --- a/.github/workflows/code_snippets.yml +++ b/.github/workflows/code_snippets.yml @@ -29,6 +29,7 @@ jobs: steps: - name: Clone OpenVINO uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + timeout-minutes: 15 with: submodules: 'true' diff --git a/.github/workflows/code_style.yml b/.github/workflows/code_style.yml index a70d2641cb57f3..3969da2b97c5a1 100644 --- a/.github/workflows/code_style.yml +++ b/.github/workflows/code_style.yml @@ -15,6 +15,7 @@ jobs: pull-requests: write steps: - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + timeout-minutes: 15 with: submodules: 'true' @@ -38,6 +39,36 @@ jobs: level: warning fail_on_error: true + clang-format-aarch64: + runs-on: ubuntu-22.04 + if: ${{ github.repository_owner == 'openvinotoolkit' }} + permissions: + pull-requests: write + steps: + - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + with: + submodules: 'true' + + - name: Install clang-format-15 and cross-compilation dependencies + run: | + sudo apt update + sudo apt --assume-yes install binutils-aarch64-linux-gnu gcc-aarch64-linux-gnu g++-aarch64-linux-gnu scons clang-format-15 + + # Run cmake with -DENABLE_PROFILING_ITT=ON -DSELECTIVE_BUILD=COLLECT in order to enable codestyle check for ITT collector + - name: CMake configure + run: cmake -DENABLE_CLANG_FORMAT=ON -DENABLE_TESTS=ON -DENABLE_PROFILING_ITT=ON -DSELECTIVE_BUILD=COLLECT -DCMAKE_TOOLCHAIN_FILE=cmake/arm64.toolchain.cmake -B build_arm64 + + - name: Create code style diff + run: cmake --build 
build_arm64 --target clang_format_fix_all -j8 + + - name: suggester / clang-format + if: startsWith(github.event_name, 'pull_request') + uses: reviewdog/action-suggester@db4abb16fbaabe386831e5addb7be1485d0d63d3 # v1.18.0 + with: + github_token: ${{ secrets.GITHUB_TOKEN }} + level: warning + fail_on_error: true + ShellCheck: runs-on: ubuntu-22.04 if: ${{ github.repository_owner == 'openvinotoolkit' }} @@ -45,6 +76,7 @@ jobs: pull-requests: write steps: - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + timeout-minutes: 15 with: submodules: 'true' @@ -77,6 +109,7 @@ jobs: if: ${{ github.repository_owner == 'openvinotoolkit' }} steps: - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + timeout-minutes: 15 with: submodules: 'true' diff --git a/.github/workflows/coverage.yml b/.github/workflows/coverage.yml index cde1b9cf67e2fc..fd6a029abfaa67 100644 --- a/.github/workflows/coverage.yml +++ b/.github/workflows/coverage.yml @@ -33,6 +33,7 @@ jobs: - name: Clone OpenVINO uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + timeout-minutes: 15 with: submodules: 'true' diff --git a/.github/workflows/coverity.yml b/.github/workflows/coverity.yml index 5a08ec084dadac..52ac10c9a6882a 100644 --- a/.github/workflows/coverity.yml +++ b/.github/workflows/coverity.yml @@ -35,6 +35,7 @@ jobs: steps: - name: checkout action uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + timeout-minutes: 15 with: sparse-checkout: .github/actions/smart-ci @@ -63,6 +64,7 @@ jobs: steps: - name: Checkout uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + timeout-minutes: 15 - uses: ./.github/actions/handle_docker id: handle_docker @@ -98,6 +100,7 @@ jobs: steps: - name: Clone OpenVINO uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + timeout-minutes: 15 with: path: ${{ env.OPENVINO_REPO }} submodules: 'true' @@ -105,6 +108,7 @@ jobs: - name: Clone OpenVINO Contrib uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + timeout-minutes: 15 with: repository: 'openvinotoolkit/openvino_contrib' path: ${{ env.OPENVINO_CONTRIB_REPO }} diff --git a/.github/workflows/debian_10_arm.yml b/.github/workflows/debian_10_arm.yml index 73426222253adb..20b1daa0c5dc8d 100644 --- a/.github/workflows/debian_10_arm.yml +++ b/.github/workflows/debian_10_arm.yml @@ -25,6 +25,7 @@ jobs: steps: - name: checkout action uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + timeout-minutes: 15 with: sparse-checkout: .github/actions/smart-ci @@ -49,7 +50,7 @@ jobs: Docker: needs: Smart_CI if: "!needs.smart_ci.outputs.skip_workflow" - runs-on: aks-linux-16-cores-arm-docker-build + runs-on: aks-linux-4-cores-8gb-arm-docker-build container: image: openvinogithubactions.azurecr.io/docker_build:0.2 volumes: @@ -59,6 +60,7 @@ jobs: steps: - name: Checkout uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + timeout-minutes: 15 - uses: ./.github/actions/handle_docker id: handle_docker @@ -75,7 +77,7 @@ jobs: if: "!needs.smart_ci.outputs.skip_workflow" uses: ./.github/workflows/job_build_linux.yml with: - runner: 'aks-linux-16-cores-arm' + runner: 'aks-linux-16-cores-32gb-arm' container: '{"image": "${{ fromJSON(needs.docker.outputs.images).ov_build.debian_10_arm }}", "volumes": ["/mount:/mount"], "options": "-e SCCACHE_AZURE_BLOB_CONTAINER -e SCCACHE_AZURE_CONNECTION_STRING"}' affected-components: ${{ needs.smart_ci.outputs.affected_components }} 
event-name: ${{ github.event_name }} @@ -104,7 +106,7 @@ jobs: needs: [ Docker, Build, Smart_CI ] uses: ./.github/workflows/job_cxx_unit_tests.yml with: - runner: 'aks-linux-16-cores-arm' + runner: 'aks-linux-8-cores-16gb-arm' image: ${{ fromJSON(needs.docker.outputs.images).ov_test.debian_10_arm }} affected-components: ${{ needs.smart_ci.outputs.affected_components }} os: 'debian_10' @@ -116,6 +118,8 @@ jobs: needs: [ Docker, Build, Smart_CI ] uses: ./.github/workflows/job_cpu_functional_tests.yml with: + # Additional investigation needed why CPU functional tests are failing on v6 VM size's version, + # so leave it as it is for now runner: 'aks-linux-16-cores-arm' image: ${{ fromJSON(needs.docker.outputs.images).ov_test.debian_10_arm }} python-version: '3.7' diff --git a/.github/workflows/dependency_review.yml b/.github/workflows/dependency_review.yml index 59a1eaa6e1c26f..690c789cb65222 100644 --- a/.github/workflows/dependency_review.yml +++ b/.github/workflows/dependency_review.yml @@ -10,6 +10,7 @@ jobs: steps: - name: Clone OpenVINO uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + timeout-minutes: 15 - name: Dependency Review uses: actions/dependency-review-action@72eb03d02c7872a771aacd928f3123ac62ad6d3a # v4.3.3 diff --git a/.github/workflows/dev_cpu_linux_snippets_libxsmm.yml b/.github/workflows/dev_cpu_linux_snippets_libxsmm.yml index ba458da5d3ec1a..5ed82e8330778c 100644 --- a/.github/workflows/dev_cpu_linux_snippets_libxsmm.yml +++ b/.github/workflows/dev_cpu_linux_snippets_libxsmm.yml @@ -33,6 +33,7 @@ jobs: steps: - name: checkout action uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + timeout-minutes: 15 with: sparse-checkout: .github/actions/smart-ci @@ -66,6 +67,7 @@ jobs: steps: - name: Checkout uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + timeout-minutes: 15 - uses: ./.github/actions/handle_docker id: handle_docker @@ -110,6 +112,7 @@ jobs: steps: - name: Clone OpenVINO uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + timeout-minutes: 15 with: path: ${{ env.OPENVINO_REPO }} submodules: 'true' @@ -296,6 +299,7 @@ jobs: - name: Fetch setup_python action uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + timeout-minutes: 15 with: sparse-checkout: | .github/actions/setup_python/action.yml diff --git a/.github/workflows/send_workflows_to_opentelemetry.yml b/.github/workflows/export_workflow_metrics.yml similarity index 55% rename from .github/workflows/send_workflows_to_opentelemetry.yml rename to .github/workflows/export_workflow_metrics.yml index ba38d6a9f90fed..39bb699b8caa91 100644 --- a/.github/workflows/send_workflows_to_opentelemetry.yml +++ b/.github/workflows/export_workflow_metrics.yml @@ -1,40 +1,38 @@ -name: Export workflow metrics (BETA) +name: Export workflow metrics on: workflow_run: workflows: - - Android ARM64 with vcpkg - - Android x64 - - Documentation - - Cleanup PIP caches - - Code snippets - - Code Style - - Code coverage - - Coverity (Ubuntu 20.04, Python 3.11) - - Debian 10 ARM - - Fedora 29 (RHEL 8.4), Python 3.9 - - Linux (Ubuntu 20.04, Python 3.9) - - Linux (Ubuntu 22.04, Python 3.11) - - Linux (Ubuntu 24.04, Python 3.12) - - Linux ARM64 (Ubuntu 20.04, Python 3.11) - - Linux Static CC (Ubuntu 22.04, Python 3.11, Clang) - - Linux RISC-V with Conan (Ubuntu 22.04, Python 3.10) - - macOS (Python 3.11) - - macOS ARM64 (Python 3.11) - - MO - - Python API Checks - - Webassembly - - Windows (VS 2019, Python 3.11, Release) - - 
Windows (VS 2019, Python 3.11, Debug) - - Windows Conditional Compilation (VS 2022, Python 3.11) - - Rerun Workflow with Known Errors + - "Android ARM64 with vcpkg" + - "Android x64" + - "Cleanup caches" + - "Coverity (Ubuntu 20.04, Python 3.11)" + - "Debian 10 ARM" + - "Fedora 29 (RHEL 8.4), Python 3.9" + - "Linux (Ubuntu 20.04, Python 3.9)" + - "Linux (Ubuntu 22.04, Python 3.11)" + - "Linux (Ubuntu 24.04, Python 3.12)" + - "Linux ARM64 (Ubuntu 20.04, Python 3.11)" + - "Linux Static CC (Ubuntu 22.04, Python 3.11, Clang)" + - "Linux RISC-V with Conan (Ubuntu 22.04, Python 3.10)" + - "Linux (Ubuntu 22.04, Python 3.11, Intel DPC\\+\\+ Compiler)" + - "Linux CPU Plugin Snippets with LIBXSMM (Ubuntu 20.04)" + - "Linux Sanitizers (Ubuntu 20.04, Python 3.9)" + - "macOS (Python 3.11)" + - "macOS ARM64 (Python 3.11)" + - "Manylinux 2014" + - "Webassembly" + - "Windows (VS 2019, Python 3.11, Release)" + - "Windows (VS 2019, Python 3.11, Debug)" + - "Windows Conditional Compilation (VS 2022, Python 3.11)" + - "Rerun Workflow with Known Errors" types: - completed permissions: read-all jobs: - otel-export-trace: + export-workflow-metrics: name: Export finished workflow metrics runs-on: aks-linux-2-cores-8gb if: ${{ github.repository_owner == 'openvinotoolkit' }} @@ -42,6 +40,7 @@ jobs: steps: - name: Checkout uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + timeout-minutes: 15 with: sparse-checkout: '.github' diff --git a/.github/workflows/fedora_29.yml b/.github/workflows/fedora_29.yml index f3b101327f76dc..0dd101225dc533 100644 --- a/.github/workflows/fedora_29.yml +++ b/.github/workflows/fedora_29.yml @@ -25,6 +25,7 @@ jobs: steps: - name: checkout action uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + timeout-minutes: 15 with: sparse-checkout: .github/actions/smart-ci @@ -59,6 +60,7 @@ jobs: steps: - name: Checkout uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + timeout-minutes: 15 - uses: ./.github/actions/handle_docker id: handle_docker diff --git a/.github/workflows/files_size.yml b/.github/workflows/files_size.yml index 2768e731b6578b..c263afed1fe465 100644 --- a/.github/workflows/files_size.yml +++ b/.github/workflows/files_size.yml @@ -13,6 +13,7 @@ jobs: if: ${{ github.repository_owner == 'openvinotoolkit' }} steps: - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + timeout-minutes: 15 - name: git ls-tree run: git ls-tree -r -t -l --full-name HEAD | sort -n -r -k 4 diff --git a/.github/workflows/job_build_linux.yml b/.github/workflows/job_build_linux.yml index 3964f049be2abb..c56de5872cc2df 100644 --- a/.github/workflows/job_build_linux.yml +++ b/.github/workflows/job_build_linux.yml @@ -92,6 +92,7 @@ jobs: steps: - name: Clone OpenVINO uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + timeout-minutes: 15 with: path: ${{ env.OPENVINO_REPO }} submodules: 'true' @@ -107,6 +108,7 @@ jobs: - name: Clone OpenVINO Contrib uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + timeout-minutes: 15 with: repository: 'openvinotoolkit/openvino_contrib' path: ${{ env.OPENVINO_CONTRIB_REPO }} diff --git a/.github/workflows/job_build_windows.yml b/.github/workflows/job_build_windows.yml index 7b682f208c3435..4e3969d978cb83 100644 --- a/.github/workflows/job_build_windows.yml +++ b/.github/workflows/job_build_windows.yml @@ -60,12 +60,14 @@ jobs: steps: - name: Clone OpenVINO uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + 
timeout-minutes: 15 with: path: 'openvino' submodules: 'true' - name: Clone OpenVINO Contrib uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + timeout-minutes: 15 with: repository: 'openvinotoolkit/openvino_contrib' path: 'openvino_contrib' diff --git a/.github/workflows/job_cpu_functional_tests.yml b/.github/workflows/job_cpu_functional_tests.yml index 0366ec47ff437e..568c33d39e307b 100644 --- a/.github/workflows/job_cpu_functional_tests.yml +++ b/.github/workflows/job_cpu_functional_tests.yml @@ -72,6 +72,7 @@ jobs: - name: Fetch setup_python action uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + timeout-minutes: 15 with: sparse-checkout: | .github/actions/setup_python/action.yml diff --git a/.github/workflows/job_jax_models_tests.yml b/.github/workflows/job_jax_models_tests.yml index 43fa8f2a7f1740..07155db1016057 100644 --- a/.github/workflows/job_jax_models_tests.yml +++ b/.github/workflows/job_jax_models_tests.yml @@ -65,6 +65,7 @@ jobs: - name: Fetch setup_python action uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + timeout-minutes: 15 with: sparse-checkout: | .github/actions/setup_python/action.yml diff --git a/.github/workflows/job_onnx_runtime.yml b/.github/workflows/job_onnx_runtime.yml index df50c4f3e2ad3c..92f86511e99e4a 100644 --- a/.github/workflows/job_onnx_runtime.yml +++ b/.github/workflows/job_onnx_runtime.yml @@ -64,6 +64,7 @@ jobs: - name: Fetch ONNX runtime version and skip tests list uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + timeout-minutes: 15 with: sparse-checkout: | src/frontends/onnx/tests/ci_utils/onnxruntime @@ -78,6 +79,7 @@ jobs: - name: Clone ONNX Runtime uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + timeout-minutes: 15 with: repository: 'microsoft/onnxruntime' path: ${{ env.ONNX_RUNTIME_REPO }} diff --git a/.github/workflows/job_openvino_js.yml b/.github/workflows/job_openvino_js.yml index ecb278fdb54ca3..fd04d8842daae7 100644 --- a/.github/workflows/job_openvino_js.yml +++ b/.github/workflows/job_openvino_js.yml @@ -33,6 +33,7 @@ jobs: steps: - name: Fetch OpenVINO JS sources uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + timeout-minutes: 15 with: sparse-checkout: | src/bindings/js diff --git a/.github/workflows/job_python_api_tests.yml b/.github/workflows/job_python_api_tests.yml new file mode 100644 index 00000000000000..e12001cd67afba --- /dev/null +++ b/.github/workflows/job_python_api_tests.yml @@ -0,0 +1,151 @@ +name: Python API tests + +on: + workflow_call: + inputs: + runner: + description: 'Machine on which the tests would run' + type: string + required: true + container: + description: 'JSON to be converted to the value of the "container" configuration for the job' + type: string + required: false + default: '{"image": null}' + affected-components: + description: 'Components that are affected by changes in the commit defined by the Smart CI Action' + type: string + required: true + python-version: + description: 'Python version to setup. 
E.g., "3.11"' + type: string + required: true + +permissions: read-all + +env: + PIP_CACHE_PATH: /mount/caches/pip/linux + +jobs: + Python_Unit_Tests: + name: Python API tests + if: ${{ fromJSON(inputs.affected-components).Python_API.test || fromJSON(inputs.affected-components).docs_snippets.test }} + timeout-minutes: 30 + runs-on: ${{ inputs.runner }} + container: ${{ fromJSON(inputs.container) }} + defaults: + run: + shell: bash + env: + DEBIAN_FRONTEND: noninteractive # to prevent apt-get from waiting user input + OPENVINO_REPO: ${{ github.workspace }}/openvino + INSTALL_DIR: ${{ github.workspace }}/install + INSTALL_TEST_DIR: ${{ github.workspace }}/install/openvino_tests + INSTALL_WHEELS_DIR: ${{ github.workspace }}/install/openvino_wheels + steps: + - name: Download OpenVINO artifacts (tarballs and wheels) + uses: actions/download-artifact@fa0a91b85d4f404e444e00e005971372dc801d16 # v4.1.8 + with: + pattern: openvino_@(wheels|tests) + path: ${{ env.INSTALL_DIR }} + + # Needed as ${{ github.workspace }} is not working correctly when using Docker + - name: Setup Variables + run: | + echo "OPENVINO_REPO=$GITHUB_WORKSPACE/openvino" >> "$GITHUB_ENV" + echo "INSTALL_DIR=$GITHUB_WORKSPACE/install" >> "$GITHUB_ENV" + echo "INSTALL_TEST_DIR=$GITHUB_WORKSPACE/install/openvino_tests" >> "$GITHUB_ENV" + echo "INSTALL_WHEELS_DIR=$GITHUB_WORKSPACE/install/openvino_wheels" >> "$GITHUB_ENV" + + - name: Install OpenVINO dependencies (mac) + if: runner.os == 'macOS' + run: brew install pigz + + - name: Extract OpenVINO packages + run: pigz -dc openvino_tests.tar.gz | tar -xf - -C ${INSTALL_TEST_DIR} + working-directory: ${{ env.INSTALL_TEST_DIR }} + + - name: Fetch setup_python and install wheels actions + uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938 # v4.2.0 + timeout-minutes: 15 + with: + sparse-checkout: | + .github/actions/setup_python/action.yml + .github/actions/install_ov_wheels/action.yml + sparse-checkout-cone-mode: false + path: 'action_root' + + - name: Setup Python ${{ inputs.python-version }} + uses: ./action_root/.github/actions/setup_python + with: + version: ${{ inputs.python-version }} + pip-cache-path: ${{ runner.os == 'Linux' && env.PIP_CACHE_PATH || '' }} + should-setup-pip-paths: ${{ runner.os == 'Linux' }} + self-hosted-runner: ${{ runner.os == 'Linux' }} + + # + # Tests + # + - name: Install OpenVINO Python wheels + uses: ./action_root/.github/actions/install_ov_wheels + with: + wheels-dir-path: ${{ env.INSTALL_WHEELS_DIR }} + wheels-to-install: 'openvino' + + - name: Install Python API tests dependencies + run: python3 -m pip install -r ${INSTALL_TEST_DIR}/tests/bindings/python/requirements_test.txt + + # + # Tests + # + + - name: Python API Tests + if: fromJSON(inputs.affected-components).Python_API.test + run: | + # for 'template' extension + export LD_LIBRARY_PATH=${INSTALL_TEST_DIR}/tests/:$LD_LIBRARY_PATH + python3 -m pytest -sv ${INSTALL_TEST_DIR}/tests/pyopenvino \ + --junitxml=${INSTALL_TEST_DIR}/TEST-Pyngraph.xml \ + --ignore=${INSTALL_TEST_DIR}/tests/pyopenvino/tests/test_utils/test_utils.py + + - name: Python API Tests -- numpy<2.0.0 + if: fromJSON(inputs.affected-components).Python_API.test + run: | + python3 -m pip uninstall -y numpy + python3 -m pip install "numpy~=1.26.0" + python3 -m pip install -r ${INSTALL_TEST_DIR}/tests/bindings/python/requirements_test.txt + # for 'template' extension + export LD_LIBRARY_PATH=${INSTALL_TEST_DIR}/tests/:$LD_LIBRARY_PATH + python3 -m pytest -sv ${INSTALL_TEST_DIR}/tests/pyopenvino \ + 
--junitxml=${INSTALL_TEST_DIR}/TEST-Pyngraph_new_numpy.xml \ + --ignore=${INSTALL_TEST_DIR}/tests/pyopenvino/tests/test_utils/test_utils.py + + - name: Clone API snippets + if: ${{ runner.os != 'macOS' && fromJSON(inputs.affected-components).docs_snippets.test }} + uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938 # v4.2.0 + timeout-minutes: 15 + with: + sparse-checkout: docs/articles_en/assets/snippets + path: ${{ env.OPENVINO_REPO }} + submodules: 'false' + + - name: Docs Python snippets + if: ${{ runner.os != 'macOS' && fromJSON(inputs.affected-components).docs_snippets.test }} + run: | + # torch, onnx + python3 -m pip install -r ${INSTALL_TEST_DIR}/tests/python/preprocess/torchvision/requirements.txt -r ${INSTALL_TEST_DIR}/tests/requirements_onnx + # to find 'snippets' module in docs + export PYTHONPATH=${OPENVINO_REPO}/docs/articles_en/assets + # for 'template' extension + export LD_LIBRARY_PATH=${INSTALL_TEST_DIR}/tests/:$LD_LIBRARY_PATH + python3 ${OPENVINO_REPO}/docs/articles_en/assets/snippets/main.py + + - name: Upload Test Results + uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874 # v4.4.0 + if: ${{ !cancelled() }} + with: + name: test-results-python-api-${{ inputs.python-version }} + path: | + ${{ env.INSTALL_TEST_DIR }}/TEST*.html + ${{ env.INSTALL_TEST_DIR }}/TEST*.xml + if-no-files-found: 'warn' diff --git a/.github/workflows/job_python_unit_tests.yml b/.github/workflows/job_python_unit_tests.yml index 8075f3299fe063..b04f719c8e296f 100644 --- a/.github/workflows/job_python_unit_tests.yml +++ b/.github/workflows/job_python_unit_tests.yml @@ -65,21 +65,23 @@ jobs: echo "INSTALL_DIR=$GITHUB_WORKSPACE/install" >> "$GITHUB_ENV" echo "INSTALL_TEST_DIR=$GITHUB_WORKSPACE/install/tests" >> "$GITHUB_ENV" echo "LAYER_TESTS_INSTALL_DIR=$GITHUB_WORKSPACE/install/tests/layer_tests" >> "$GITHUB_ENV" + echo "INSTALL_WHEELS_DIR=$GITHUB_WORKSPACE/install/wheels" >> "$GITHUB_ENV" - name: Install OpenVINO dependencies (mac) if: runner.os == 'macOS' run: brew install pigz - name: Extract OpenVINO packages - run: | - pigz -dc openvino_tests.tar.gz | tar -xf - -C ${INSTALL_DIR} + run: pigz -dc openvino_tests.tar.gz | tar -xf - -C ${INSTALL_DIR} working-directory: ${{ env.INSTALL_DIR }} - - name: Fetch setup_python action + - name: Fetch setup_python and install wheels actions uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + timeout-minutes: 15 with: sparse-checkout: | .github/actions/setup_python/action.yml + .github/actions/install_ov_wheels/action.yml sparse-checkout-cone-mode: false path: 'action_root' @@ -92,11 +94,10 @@ jobs: self-hosted-runner: ${{ runner.os == 'Linux' }} - name: Install OpenVINO Python wheels - run: | - # Install the core OV wheel - python3 -m pip install ./openvino-*.whl - - working-directory: ${{ env.INSTALL_WHEELS_DIR }} + uses: ./action_root/.github/actions/install_ov_wheels + with: + wheels-dir-path: ${{ env.INSTALL_WHEELS_DIR }} + wheels-to-install: 'openvino' - name: Install Python API tests dependencies run: | @@ -121,15 +122,6 @@ jobs: # Tests # - - name: Python API Tests - if: ${{ fromJSON(inputs.affected-components).Python_API.test }} - run: | - # for 'template' extension - export LD_LIBRARY_PATH=${INSTALL_TEST_DIR}:$LD_LIBRARY_PATH - python3 -m pytest -sv ${INSTALL_TEST_DIR}/pyopenvino \ - --junitxml=${INSTALL_TEST_DIR}/TEST-Pyngraph.xml \ - --ignore=${INSTALL_TEST_DIR}/pyopenvino/tests/test_utils/test_utils.py - - name: Python ONNX operators tests if: 
(fromJSON(inputs.affected-components).Python_API.test || fromJSON(inputs.affected-components).ONNX_FE.test) && @@ -185,35 +177,6 @@ jobs: TEST_DEVICE: CPU TEST_PRECISION: FP16 - - name: Clone API snippets - if: runner.os != 'macOS' - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - with: - sparse-checkout: docs/articles_en/assets/snippets - path: ${{ env.OPENVINO_REPO }} - submodules: 'false' - - - name: Docs Python snippets - if: runner.os != 'macOS' - run: | - # to find 'snippets' module in docs - export PYTHONPATH=${OPENVINO_REPO}/docs/articles_en/assets - # for 'template' extension - export LD_LIBRARY_PATH=${INSTALL_TEST_DIR}:$LD_LIBRARY_PATH - python3 ${OPENVINO_REPO}/docs/articles_en/assets/snippets/main.py - - - name: Python API Tests -- numpy>=2.0.0 - if: ${{ fromJSON(inputs.affected-components).Python_API.test }} - run: | - python3 -m pip uninstall -y numpy - python3 -m pip install "numpy>=2.0.0,<2.2.0" - python3 -m pip install -r ${INSTALL_TEST_DIR}/bindings/python/requirements_test.txt - # for 'template' extension - export LD_LIBRARY_PATH=${INSTALL_TEST_DIR}:$LD_LIBRARY_PATH - python3 -m pytest -sv ${INSTALL_TEST_DIR}/pyopenvino \ - --junitxml=${INSTALL_TEST_DIR}/TEST-Pyngraph.xml \ - --ignore=${INSTALL_TEST_DIR}/pyopenvino/tests/test_utils/test_utils.py - - name: Upload Test Results uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882 # v4.4.3 if: ${{ !cancelled() }} diff --git a/.github/workflows/job_pytorch_layer_tests.yml b/.github/workflows/job_pytorch_layer_tests.yml index 271b7948d435dc..9a9abaf72ade62 100644 --- a/.github/workflows/job_pytorch_layer_tests.yml +++ b/.github/workflows/job_pytorch_layer_tests.yml @@ -86,6 +86,7 @@ jobs: - name: Fetch setup_python and install wheels actions uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + timeout-minutes: 15 with: sparse-checkout: | .github/actions/setup_python/action.yml diff --git a/.github/workflows/job_pytorch_models_tests.yml b/.github/workflows/job_pytorch_models_tests.yml index d52b819981d821..af304b18a5688f 100644 --- a/.github/workflows/job_pytorch_models_tests.yml +++ b/.github/workflows/job_pytorch_models_tests.yml @@ -78,6 +78,7 @@ jobs: - name: Fetch setup_python action uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + timeout-minutes: 15 with: sparse-checkout: | .github/actions/setup_python/action.yml diff --git a/.github/workflows/job_samples_tests.yml b/.github/workflows/job_samples_tests.yml index e144aa0cfb95aa..07fc17b797592e 100644 --- a/.github/workflows/job_samples_tests.yml +++ b/.github/workflows/job_samples_tests.yml @@ -54,6 +54,7 @@ jobs: echo "INSTALL_DIR=$GITHUB_WORKSPACE/install" >> "$GITHUB_ENV" echo "INSTALL_TEST_DIR=$GITHUB_WORKSPACE/install/tests" >> "$GITHUB_ENV" echo "BUILD_DIR=$GITHUB_WORKSPACE/build" >> "$GITHUB_ENV" + echo "INSTALL_WHEELS_DIR=$GITHUB_WORKSPACE/install/wheels" >> "$GITHUB_ENV" - name: Install OpenVINO dependencies (mac) if: runner.os == 'macOS' @@ -65,13 +66,13 @@ jobs: pigz -dc openvino_tests.tar.gz | tar -xf - -C ${INSTALL_DIR} working-directory: ${{ env.INSTALL_DIR }} - - name: Fetch setup_python action - # Python is already installed on Ubuntu within Dockerfile - if: runner.os != 'Linux' + - name: Fetch setup_python and install wheels actions uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + timeout-minutes: 15 with: sparse-checkout: | .github/actions/setup_python/action.yml + .github/actions/install_ov_wheels/action.yml sparse-checkout-cone-mode: 
false path: 'openvino' @@ -113,6 +114,12 @@ jobs: # Tests # + - name: Install OpenVINO Python wheels + uses: ./openvino/.github/actions/install_ov_wheels + with: + wheels-dir-path: ${{ env.INSTALL_WHEELS_DIR }} + wheels-to-install: 'openvino' + - name: Samples tests if: fromJSON(inputs.affected-components).samples.test run: | @@ -122,7 +129,7 @@ jobs: export SHARE=$INSTALL_TEST_DIR/smoke_tests/samples_smoke_tests_data # Install Python benchmark_app by installing openvino-*.whl - python3 -m pip install --ignore-installed PyYAML -r $INSTALL_TEST_DIR/smoke_tests/requirements.txt $INSTALL_WHEELS_DIR/openvino-*.whl + python3 -m pip install --ignore-installed PyYAML -r $INSTALL_TEST_DIR/smoke_tests/requirements.txt export LD_LIBRARY_PATH=${IE_APP_PATH}:$LD_LIBRARY_PATH source ${INSTALL_DIR}/setupvars.sh diff --git a/.github/workflows/job_tensorflow_layer_tests.yml b/.github/workflows/job_tensorflow_layer_tests.yml index 98f385e990f5e6..fb905f8ec4820b 100644 --- a/.github/workflows/job_tensorflow_layer_tests.yml +++ b/.github/workflows/job_tensorflow_layer_tests.yml @@ -86,6 +86,7 @@ jobs: - name: Fetch setup_python and install wheels actions uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + timeout-minutes: 15 with: sparse-checkout: | .github/actions/setup_python/action.yml diff --git a/.github/workflows/job_tensorflow_models_tests.yml b/.github/workflows/job_tensorflow_models_tests.yml index 5321beb8703de1..de5cf95484256a 100644 --- a/.github/workflows/job_tensorflow_models_tests.yml +++ b/.github/workflows/job_tensorflow_models_tests.yml @@ -70,6 +70,7 @@ jobs: - name: Fetch setup_python action uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + timeout-minutes: 15 with: sparse-checkout: | .github/actions/setup_python/action.yml diff --git a/.github/workflows/job_tokenizers.yml b/.github/workflows/job_tokenizers.yml index 1068ec550d1752..89d572885b1abe 100644 --- a/.github/workflows/job_tokenizers.yml +++ b/.github/workflows/job_tokenizers.yml @@ -58,6 +58,7 @@ jobs: - name: checkout actions uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + timeout-minutes: 15 with: sparse-checkout: | .github/actions/setup_python @@ -79,6 +80,7 @@ jobs: - name: Clone OpenVINO Tokenizers uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + timeout-minutes: 15 with: repository: 'openvinotoolkit/openvino_tokenizers' path: ${{ env.OPENVINO_TOKENIZERS_REPO }} diff --git a/.github/workflows/labeler.yml b/.github/workflows/labeler.yml index 00f3a321e0dd1f..063b920eed80e9 100644 --- a/.github/workflows/labeler.yml +++ b/.github/workflows/labeler.yml @@ -27,6 +27,7 @@ jobs: steps: - name: Checkout Labeller Script uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + timeout-minutes: 15 with: sparse-checkout: '.github' diff --git a/.github/workflows/linux_arm64.yml b/.github/workflows/linux_arm64.yml index 10de6867c7d0e2..66e825e5d5e126 100644 --- a/.github/workflows/linux_arm64.yml +++ b/.github/workflows/linux_arm64.yml @@ -29,6 +29,7 @@ jobs: steps: - name: checkout action uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + timeout-minutes: 15 with: sparse-checkout: .github/actions/smart-ci @@ -53,7 +54,7 @@ jobs: Docker: needs: Smart_CI if: "!needs.smart_ci.outputs.skip_workflow" - runs-on: aks-linux-16-cores-arm-docker-build + runs-on: aks-linux-4-cores-8gb-arm-docker-build container: image: openvinogithubactions.azurecr.io/docker_build:0.2 volumes: @@ -63,6 +64,7 @@ jobs: 
steps: - name: Checkout uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + timeout-minutes: 15 - uses: ./.github/actions/handle_docker id: handle_docker @@ -78,7 +80,7 @@ jobs: needs: [ Docker, Smart_CI ] uses: ./.github/workflows/job_build_linux.yml with: - runner: 'aks-linux-16-cores-arm' + runner: 'aks-linux-16-cores-32gb-arm' container: '{"image": "${{ fromJSON(needs.docker.outputs.images).ov_build.ubuntu_20_04_arm64 }}", "volumes": ["/mount:/mount"], "options": "-e SCCACHE_AZURE_BLOB_CONTAINER -e SCCACHE_AZURE_CONNECTION_STRING"}' affected-components: ${{ needs.smart_ci.outputs.affected_components }} event-name: ${{ github.event_name }} @@ -97,6 +99,7 @@ jobs: -DENABLE_SYSTEM_OPENCL=ON \ -DCMAKE_VERBOSE_MAKEFILE=ON \ -DCPACK_GENERATOR=TGZ \ + -DENABLE_WHEEL=ON \ -DCMAKE_COMPILE_WARNING_AS_ERROR=ON Debian_Packages: @@ -105,7 +108,7 @@ jobs: if: ${{ 'false' }} uses: ./.github/workflows/job_debian_packages.yml with: - runner: 'aks-linux-16-cores-arm' + runner: 'aks-linux-4-cores-8gb-arm' image: 'openvinogithubactions.azurecr.io/dockerhub/ubuntu:20.04' Samples: @@ -113,7 +116,7 @@ jobs: if: fromJSON(needs.smart_ci.outputs.affected_components).samples uses: ./.github/workflows/job_samples_tests.yml with: - runner: 'aks-linux-16-cores-arm' + runner: 'aks-linux-8-cores-16gb-arm' container: '{"image": "${{ fromJSON(needs.docker.outputs.images).ov_test.ubuntu_20_04_arm64 }}", "volumes": ["/mount:/mount"]}' affected-components: ${{ needs.smart_ci.outputs.affected_components }} @@ -123,7 +126,7 @@ jobs: if: fromJSON(needs.smart_ci.outputs.affected_components).JS_API uses: ./.github/workflows/job_openvino_js.yml with: - runner: 'aks-linux-16-cores-arm' + runner: 'aks-linux-4-cores-8gb-arm' container: '{"image": "${{ fromJSON(needs.docker.outputs.images).ov_build.ubuntu_20_04_arm64 }}"}' ONNX_Runtime: @@ -133,7 +136,7 @@ jobs: needs: [ Build, Smart_CI, Docker ] uses: ./.github/workflows/job_onnx_runtime.yml with: - runner: 'aks-linux-16-cores-arm' + runner: 'aks-linux-16-cores-32gb-arm' container: '{"image": "${{ fromJSON(needs.docker.outputs.images).ov_build.ubuntu_20_04_arm64 }}", "volumes": ["/mount:/mount"], "options": "-e SCCACHE_AZURE_BLOB_CONTAINER -e SCCACHE_AZURE_CONNECTION_STRING"}' sccache-azure-key-prefix: 'ubuntu20_aarch64_onnxruntime' @@ -142,7 +145,7 @@ jobs: needs: [ Build, Docker, Smart_CI ] uses: ./.github/workflows/job_tokenizers.yml with: - runner: 'aks-linux-16-cores-arm' + runner: 'aks-linux-8-cores-16gb-arm' shell: bash container: '{"image": "${{ fromJSON(needs.docker.outputs.images).ov_build.ubuntu_20_04_arm64 }}", "volumes": ["/mount:/mount"]}' affected-components: ${{ needs.smart_ci.outputs.affected_components }} @@ -154,7 +157,7 @@ jobs: needs: [ Build, Docker, Smart_CI ] uses: ./.github/workflows/job_cxx_unit_tests.yml with: - runner: 'aks-linux-16-cores-arm' + runner: 'aks-linux-8-cores-16gb-arm' image: ${{ fromJSON(needs.docker.outputs.images).ov_test.ubuntu_20_04_arm64 }} affected-components: ${{ needs.smart_ci.outputs.affected_components }} os: 'ubuntu_20_04' @@ -163,6 +166,16 @@ jobs: name: Python unit tests needs: [ Build, Docker, Smart_CI ] uses: ./.github/workflows/job_python_unit_tests.yml + with: + runner: 'aks-linux-8-cores-16gb-arm' + container: '{"image": "${{ fromJSON(needs.docker.outputs.images).ov_test.ubuntu_20_04_arm64 }}", "volumes": ["/mount:/mount"]}' + affected-components: ${{ needs.smart_ci.outputs.affected_components }} + python-version: '3.11' + + Python_API_Tests: + name: Python API tests + needs: [ Docker, Build, 
Smart_CI ] + uses: ./.github/workflows/job_python_api_tests.yml with: runner: 'aks-linux-16-cores-arm' container: '{"image": "${{ fromJSON(needs.docker.outputs.images).ov_test.ubuntu_20_04_arm64 }}", "volumes": ["/mount:/mount"]}' @@ -174,7 +187,7 @@ jobs: needs: [ Build, Docker, Smart_CI, Openvino_tokenizers ] uses: ./.github/workflows/job_tensorflow_layer_tests.yml with: - runner: 'aks-linux-16-cores-arm' + runner: 'aks-linux-16-cores-32gb-arm' container: '{"image": "${{ fromJSON(needs.docker.outputs.images).ov_test.ubuntu_20_04_arm64 }}", "volumes": ["/mount:/mount"]}' affected-components: ${{ needs.smart_ci.outputs.affected_components }} python-version: '3.11' @@ -184,7 +197,7 @@ jobs: needs: [ Build, Docker, Smart_CI ] uses: ./.github/workflows/job_pytorch_layer_tests.yml with: - runner: 'aks-linux-16-cores-arm' + runner: 'aks-linux-16-cores-32gb-arm' container: '{"image": "${{ fromJSON(needs.docker.outputs.images).ov_test.ubuntu_20_04_arm64 }}", "volumes": ["/mount:/mount"]}' affected-components: ${{ needs.smart_ci.outputs.affected_components }} python-version: '3.11' @@ -195,6 +208,8 @@ jobs: needs: [ Build, Docker, Smart_CI ] uses: ./.github/workflows/job_cpu_functional_tests.yml with: + # Additional investigation needed why CPU functional tests are failing on v6 VM size's version, + # so leave it as it is for now runner: 'aks-linux-16-cores-arm' image: ${{ fromJSON(needs.docker.outputs.images).ov_test.ubuntu_20_04_arm64 }} python-version: '3.11' @@ -207,7 +222,7 @@ jobs: needs: [ Build, Docker, Smart_CI, Openvino_tokenizers] uses: ./.github/workflows/job_tensorflow_models_tests.yml with: - runner: 'aks-linux-16-cores-arm' + runner: 'aks-linux-16-cores-32gb-arm' container: '{"image": "${{ fromJSON(needs.docker.outputs.images).ov_test.ubuntu_20_04_arm64 }}"}' model_scope: 'precommit' @@ -218,7 +233,7 @@ jobs: needs: [ Build, Docker, Smart_CI ] uses: ./.github/workflows/job_pytorch_models_tests.yml with: - runner: 'aks-linux-16-cores-arm' + runner: 'aks-linux-16-cores-32gb-arm' container: '{"image": "${{ fromJSON(needs.docker.outputs.images).ov_test.ubuntu_20_04_arm64 }}"}' model_scope: 'precommit' diff --git a/.github/workflows/linux_conditional_compilation.yml b/.github/workflows/linux_conditional_compilation.yml index 27f54da6ecdc60..f198e64f7ad2ed 100644 --- a/.github/workflows/linux_conditional_compilation.yml +++ b/.github/workflows/linux_conditional_compilation.yml @@ -30,6 +30,7 @@ jobs: steps: - name: checkout action uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + timeout-minutes: 15 with: sparse-checkout: .github/actions/smart-ci @@ -64,6 +65,7 @@ jobs: steps: - name: Checkout uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + timeout-minutes: 15 - uses: ./.github/actions/handle_docker id: handle_docker @@ -110,12 +112,14 @@ jobs: steps: - name: Clone OpenVINO uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + timeout-minutes: 15 with: path: ${{ env.OPENVINO_REPO }} submodules: 'true' - name: Clone test models uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + timeout-minutes: 15 with: repository: 'openvinotoolkit/testdata' path: ${{ env.MODELS_PATH }} @@ -212,7 +216,6 @@ jobs: tar -cvf - \ tests/ov_cpu_func_tests \ tests/libopenvino_template_extension.so \ - tests/libze_loader.so* \ tests/libhwloc* \ tests/libtbb* \ tests/functional_test_utils/layer_tests_summary/* \ @@ -283,12 +286,14 @@ jobs: steps: - name: Clone OpenVINO uses: 
actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + timeout-minutes: 15 with: path: ${{ env.OPENVINO_REPO }} submodules: 'true' - name: Clone test models uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + timeout-minutes: 15 with: repository: 'openvinotoolkit/testdata' path: ${{ env.MODELS_PATH }} diff --git a/.github/workflows/linux_riscv.yml b/.github/workflows/linux_riscv.yml index 85b0db8c36294e..8966a63f611d36 100644 --- a/.github/workflows/linux_riscv.yml +++ b/.github/workflows/linux_riscv.yml @@ -29,6 +29,7 @@ jobs: steps: - name: checkout action uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + timeout-minutes: 15 with: sparse-checkout: .github/actions/smart-ci @@ -64,6 +65,7 @@ jobs: steps: - name: Checkout uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + timeout-minutes: 15 - uses: ./.github/actions/handle_docker id: handle_docker @@ -103,6 +105,7 @@ jobs: steps: - name: Clone OpenVINO uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + timeout-minutes: 15 with: path: 'openvino' diff --git a/.github/workflows/linux_sanitizers.yml b/.github/workflows/linux_sanitizers.yml index 4bb597d83fadc8..cf8e1642fa5f51 100644 --- a/.github/workflows/linux_sanitizers.yml +++ b/.github/workflows/linux_sanitizers.yml @@ -25,6 +25,7 @@ jobs: steps: - name: checkout action uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + timeout-minutes: 15 with: sparse-checkout: .github/actions/smart-ci @@ -53,6 +54,7 @@ jobs: steps: - name: Checkout uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + timeout-minutes: 15 - uses: ./.github/actions/handle_docker id: handle_docker @@ -108,12 +110,14 @@ jobs: steps: - name: Clone OpenVINO uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + timeout-minutes: 15 with: path: ${{ env.OPENVINO_REPO }} submodules: 'true' - name: Clone OpenVINO Contrib uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + timeout-minutes: 15 with: repository: 'openvinotoolkit/openvino_contrib' path: ${{ env.OPENVINO_CONTRIB_REPO }} @@ -281,6 +285,7 @@ jobs: - name: Fetch Sanitizer Suppression Lists uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + timeout-minutes: 15 with: sparse-checkout: | tests/sanitizers/lsan/suppressions.txt diff --git a/.github/workflows/mac.yml b/.github/workflows/mac.yml index c587c5ad7323b3..5e4335b8151c02 100644 --- a/.github/workflows/mac.yml +++ b/.github/workflows/mac.yml @@ -43,6 +43,7 @@ jobs: steps: - name: checkout action uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + timeout-minutes: 15 with: sparse-checkout: .github/actions/smart-ci @@ -83,12 +84,14 @@ jobs: steps: - name: Clone OpenVINO uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + timeout-minutes: 15 with: path: 'openvino' submodules: 'true' - name: Clone OpenVINO Contrib uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + timeout-minutes: 15 with: repository: 'openvinotoolkit/openvino_contrib' path: 'openvino_contrib' @@ -151,6 +154,7 @@ jobs: -DENABLE_CPPLINT=OFF \ -DENABLE_NCC_STYLE=OFF \ -DENABLE_TESTS=ON \ + -DENABLE_WHEEL=OFF \ -DCMAKE_COMPILE_WARNING_AS_ERROR=OFF \ -DENABLE_STRICT_DEPENDENCIES=OFF \ -DCMAKE_CXX_COMPILER_LAUNCHER=${{ env.CMAKE_CXX_COMPILER_LAUNCHER }} \ @@ -168,7 +172,6 @@ jobs: run: | cmake -DCMAKE_INSTALL_PREFIX=${{ env.INSTALL_DIR }} -P ${{ env.BUILD_DIR 
}}/cmake_install.cmake cmake -DCMAKE_INSTALL_PREFIX=${{ env.INSTALL_TEST_DIR }} -DCOMPONENT=tests -P ${{ env.BUILD_DIR }}/cmake_install.cmake - cmake -DCMAKE_INSTALL_PREFIX=${{ env.INSTALL_WHEELS_DIR }} -DCOMPONENT=python_wheels -P ${{ env.BUILD_DIR }}/cmake_install.cmake - name: Pack Artifacts run: | @@ -179,6 +182,48 @@ jobs: tar -cvf - * | pigz > ${{ env.BUILD_DIR }}/openvino_tests.tar.gz popd + # Setup additional Python versions for wheels building + - name: Setup Python 3.9 + uses: ./openvino/.github/actions/setup_python + with: + version: "3.9" + should-setup-pip-paths: 'false' + self-hosted-runner: 'false' + + - name: Setup Python 3.10 + uses: ./openvino/.github/actions/setup_python + with: + version: "3.10" + should-setup-pip-paths: 'false' + self-hosted-runner: 'false' + + - name: Setup Python 3.12 + uses: ./openvino/.github/actions/setup_python + with: + version: "3.12" + should-setup-pip-paths: 'false' + self-hosted-runner: 'false' + + - name: Build additional Python wheels + run: | + for py_version in "3.9" "3.10" "3.11" "3.12" + do + python_exec_path=$(python$py_version -c "import sys; print(sys.executable)") + $python_exec_path -m pip install -r ${{ env.OPENVINO_REPO }}/src/bindings/python/wheel/requirements-dev.txt + + cmake -DPython3_EXECUTABLE=$python_exec_path -DENABLE_WHEEL=ON -DOpenVINODeveloperPackage_DIR=${{ env.BUILD_DIR }} -S ${{ env.OPENVINO_REPO }}/src/bindings/python -B ${{ github.workspace }}/py$py_version + cmake --build ${{ github.workspace }}/py$py_version --parallel + cmake --install ${{ github.workspace }}/py$py_version --config ${{ env.CMAKE_BUILD_TYPE }} --prefix ${{ env.INSTALL_WHEELS_DIR }} --component python_wheels + done + + # Setup Python 3.11 as the default one + - name: Setup Python ${{ env.PYTHON_VERSION }} + uses: ./openvino/.github/actions/setup_python + with: + version: ${{ env.PYTHON_VERSION }} + should-setup-pip-paths: 'false' + self-hosted-runner: 'false' + - name: Cmake & Build - OpenVINO Contrib run: | cmake \ @@ -199,6 +244,7 @@ jobs: cmake --build ${{ env.BUILD_DIR }} --parallel $(nproc) cmake -DCMAKE_INSTALL_PREFIX=${{ env.INSTALL_DIR_JS }} -P ${{ env.BUILD_DIR }}/cmake_install.cmake + # # Upload build artifacts # @@ -210,7 +256,7 @@ jobs: name: openvino_package path: ${{ env.BUILD_DIR }}/openvino_package.tar.gz if-no-files-found: 'error' - + - name: Upload openvino wheels uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882 # v4.4.3 with: @@ -270,6 +316,19 @@ jobs: affected-components: ${{ needs.smart_ci.outputs.affected_components }} os: 'mac_13' + Python_API_Tests: + name: Python API tests + needs: [ Build, Smart_CI ] + uses: ./.github/workflows/job_python_api_tests.yml + strategy: + fail-fast: false + matrix: + python-version: [ '3.9', '3.10', '3.11', '3.12' ] + with: + runner: 'macos-13' + affected-components: ${{ needs.smart_ci.outputs.affected_components }} + python-version: ${{ matrix.python-version }} + Python_Unit_Tests: name: Python unit tests needs: [ Build, Smart_CI ] diff --git a/.github/workflows/mac_arm64.yml b/.github/workflows/mac_arm64.yml index 0708a844fe6b8b..855d76973cc2e4 100644 --- a/.github/workflows/mac_arm64.yml +++ b/.github/workflows/mac_arm64.yml @@ -43,6 +43,7 @@ jobs: steps: - name: checkout action uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + timeout-minutes: 15 with: sparse-checkout: .github/actions/smart-ci @@ -83,12 +84,14 @@ jobs: steps: - name: Clone OpenVINO uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + timeout-minutes: 15 
with: path: 'openvino' submodules: 'true' - name: Clone OpenVINO Contrib uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + timeout-minutes: 15 with: repository: 'openvinotoolkit/openvino_contrib' path: 'openvino_contrib' @@ -151,6 +154,7 @@ jobs: -DENABLE_CPPLINT=OFF \ -DENABLE_NCC_STYLE=OFF \ -DENABLE_TESTS=ON \ + -DENABLE_WHEEL=OFF \ -DCMAKE_COMPILE_WARNING_AS_ERROR=OFF \ -DENABLE_STRICT_DEPENDENCIES=OFF \ -DCMAKE_CXX_COMPILER_LAUNCHER=${{ env.CMAKE_CXX_COMPILER_LAUNCHER }} \ @@ -168,7 +172,6 @@ jobs: run: | cmake -DCMAKE_INSTALL_PREFIX=${{ env.INSTALL_DIR }} -P ${{ env.BUILD_DIR }}/cmake_install.cmake cmake -DCMAKE_INSTALL_PREFIX=${{ env.INSTALL_TEST_DIR }} -DCOMPONENT=tests -P ${{ env.BUILD_DIR }}/cmake_install.cmake - cmake -DCMAKE_INSTALL_PREFIX=${{ env.INSTALL_WHEELS_DIR }} -DCOMPONENT=python_wheels -P ${{ env.BUILD_DIR }}/cmake_install.cmake - name: Pack Artifacts run: | @@ -180,6 +183,48 @@ jobs: tar -cvf - * | pigz > ${{ env.BUILD_DIR }}/openvino_tests.tar.gz popd + # Setup additional Python versions for wheels building + - name: Setup Python 3.9 + uses: ./openvino/.github/actions/setup_python + with: + version: "3.9" + should-setup-pip-paths: 'false' + self-hosted-runner: 'false' + + - name: Setup Python 3.10 + uses: ./openvino/.github/actions/setup_python + with: + version: "3.10" + should-setup-pip-paths: 'false' + self-hosted-runner: 'false' + + - name: Setup Python 3.12 + uses: ./openvino/.github/actions/setup_python + with: + version: "3.12" + should-setup-pip-paths: 'false' + self-hosted-runner: 'false' + + - name: Build additional Python wheels + run: | + for py_version in "3.9" "3.10" "3.11" "3.12" + do + python_exec_path=$(python$py_version -c "import sys; print(sys.executable)") + $python_exec_path -m pip install -r ${{ env.OPENVINO_REPO }}/src/bindings/python/wheel/requirements-dev.txt + + cmake -DPython3_EXECUTABLE=$python_exec_path -DENABLE_WHEEL=ON -DOpenVINODeveloperPackage_DIR=${{ env.BUILD_DIR }} -S ${{ env.OPENVINO_REPO }}/src/bindings/python -B ${{ github.workspace }}/py$py_version + cmake --build ${{ github.workspace }}/py$py_version --parallel + cmake --install ${{ github.workspace }}/py$py_version --config ${{ env.CMAKE_BUILD_TYPE }} --prefix ${{ env.INSTALL_WHEELS_DIR }} --component python_wheels + done + + # Setup Python 3.11 as the default one + - name: Setup Python ${{ env.PYTHON_VERSION }} + uses: ./openvino/.github/actions/setup_python + with: + version: ${{ env.PYTHON_VERSION }} + should-setup-pip-paths: 'false' + self-hosted-runner: 'false' + - name: Cmake & Build - OpenVINO Contrib run: | cmake \ @@ -279,6 +324,19 @@ jobs: affected-components: ${{ needs.smart_ci.outputs.affected_components }} python-version: '3.11' + Python_API_Tests: + name: Python API tests + needs: [ Build, Smart_CI ] + uses: ./.github/workflows/job_python_api_tests.yml + strategy: + fail-fast: false + matrix: + python-version: [ '3.9', '3.10', '3.11', '3.12' ] + with: + runner: 'macos-13-xlarge' + affected-components: ${{ needs.smart_ci.outputs.affected_components }} + python-version: ${{ matrix.python-version }} + TensorFlow_Layer_Tests: name: TensorFlow Layer Tests needs: [ Build, Smart_CI, Openvino_tokenizers ] diff --git a/.github/workflows/manylinux_2014.yml b/.github/workflows/manylinux_2014.yml index bd5da965226a50..d6b3daa12abb57 100644 --- a/.github/workflows/manylinux_2014.yml +++ b/.github/workflows/manylinux_2014.yml @@ -28,6 +28,7 @@ jobs: steps: - name: checkout action uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # 
v4.2.2 + timeout-minutes: 15 with: sparse-checkout: .github/actions/smart-ci @@ -62,6 +63,7 @@ jobs: steps: - name: Checkout uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + timeout-minutes: 15 - uses: ./.github/actions/handle_docker id: handle_docker @@ -69,6 +71,11 @@ jobs: images: | ov_build/ubuntu_22_04_x64_docker ov_build/manylinux2014_x86_64 + ov_test/ubuntu_20_04_x64_py313 + ov_test/ubuntu_22_04_x64 + ov_test/ubuntu_24_04_x64 + ov_test/fedora_33 + ov_test/debian_10_py310 registry: 'openvinogithubactions.azurecr.io' dockerfiles_root_dir: '.github/dockerfiles' changed_components: ${{ needs.smart_ci.outputs.changed_components }} @@ -92,6 +99,7 @@ jobs: OPENVINO_REPO: ${{ github.workspace }}/src INSTALL_DIR: ${{ github.workspace }}/install/openvino INSTALL_WHEELS_DIR: ${{ github.workspace }}/install/wheels + INSTALL_TEST_DIR: ${{ github.workspace }}/install/tests BUILD_DIR: ${{ github.workspace }}/build DOCKER_CONFIG: "/mount/.docker" CMAKE_CXX_COMPILER_LAUNCHER: sccache @@ -107,6 +115,7 @@ jobs: steps: - name: Clone OpenVINO uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + timeout-minutes: 15 with: path: ${{ env.OPENVINO_REPO }} submodules: 'true' @@ -135,6 +144,7 @@ jobs: -v ${{ env.OPENVINO_REPO }}:/work/src \ -v ov_build_cache:/work/build \ -v ${{ env.INSTALL_DIR }}:/work/install \ + -v ${{ env.INSTALL_TEST_DIR }}:/work/api_tests \ -e SCCACHE_AZURE_BLOB_CONTAINER \ -e SCCACHE_AZURE_CONNECTION_STRING \ -e SCCACHE_SERVER_PORT \ @@ -148,16 +158,18 @@ jobs: -w /work/src \ ${{ fromJSON(needs.docker.outputs.images).ov_build.manylinux2014_x86_64 }} \ /bin/bash -c " - cmake -DENABLE_CPPLINT=OFF -DENABLE_NCC_STYLE=OFF -DCMAKE_VERBOSE_MAKEFILE=ON -DENABLE_PYTHON=OFF -DENABLE_WHEEL=OFF -S /work/src -B /work/build && + python3.12 -m pip install -r /work/src/src/bindings/python/wheel/requirements-dev.txt + cmake -DPython3_EXECUTABLE=/usr/local/bin/python3.12 -DENABLE_CPPLINT=OFF -DENABLE_NCC_STYLE=OFF -DENABLE_TESTS=ON -DCMAKE_VERBOSE_MAKEFILE=ON -DENABLE_OV_TF_FRONTEND=OFF -DENABLE_OV_TF_LITE_FRONTEND=OFF -DENABLE_OV_PADDLE_FRONTEND=OFF -DENABLE_OV_PYTORCH_FRONTEND=ON -DENABLE_OV_JAX_FRONTEND=OFF -DENABLE_OV_ONNX_FRONTEND=ON -DENABLE_PYTHON=ON -DENABLE_WHEEL=ON -S /work/src -B /work/build && cmake --build /work/build --parallel $(nproc) --config ${{ env.CMAKE_BUILD_TYPE }} && cmake --install /work/build --config ${{ env.CMAKE_BUILD_TYPE }} --prefix /work/install + cmake --install /work/build --config ${{ env.CMAKE_BUILD_TYPE }} --prefix /work/api_tests --component tests " - name: Pack Artifacts run: mkdir -p ${{ env.BUILD_DIR }} && tar -cvf - * | pigz > ${{ env.BUILD_DIR }}/openvino_package.tar.gz working-directory: ${{ env.INSTALL_DIR }} - - name: Build Python API(Python 3.9-3.13) + - name: Build Python API (Python 3.9-3.13) run: | SUPPORTED_PYTHON_VERSIONS=("39" "310" "311" "312" "313") for PY_VER in "${SUPPORTED_PYTHON_VERSIONS[@]}"; do @@ -190,6 +202,10 @@ jobs: " done + - name: Pack openvino_tests + run: tar -cvf - * | pigz > ${{ env.BUILD_DIR }}/openvino_tests.tar.gz + working-directory: ${{ env.INSTALL_TEST_DIR }} + # # Upload build artifacts # @@ -208,7 +224,15 @@ jobs: name: openvino_wheels path: ${{ env.INSTALL_WHEELS_DIR }}/wheels/*.whl if-no-files-found: 'error' - + + - name: Upload openvino tests package + if: ${{ always() }} + uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882 # v4.4.3 + with: + name: openvino_tests + path: ${{ env.BUILD_DIR }}/openvino_tests.tar.gz + if-no-files-found: 'error' + - name: Store 
artifacts to a shared drive id: store_artifacts if: ${{ always() }} @@ -220,10 +244,34 @@ jobs: ${{ env.INSTALL_WHEELS_DIR }}/wheels storage_dir: ${{ env.PRODUCT_TYPE }} storage_root: ${{ env.ARTIFACTS_SHARE }} - + + Python_API_Tests: + name: Python API tests + needs: [ Docker, Build, Smart_CI ] + uses: ./.github/workflows/job_python_api_tests.yml + strategy: + fail-fast: false + matrix: + include: + - python-version: "3.9" + image: ${{ fromJSON(needs.docker.outputs.images).ov_test.fedora_33 }} + - python-version: "3.10" + image: ${{ fromJSON(needs.docker.outputs.images).ov_test.debian_10_py310 }} + - python-version: "3.11" + image: ${{ fromJSON(needs.docker.outputs.images).ov_test.ubuntu_22_04_x64 }} + - python-version: "3.12" + image: ${{ fromJSON(needs.docker.outputs.images).ov_test.ubuntu_24_04_x64 }} + - python-version: "3.13" + image: ${{ fromJSON(needs.docker.outputs.images).ov_test.ubuntu_20_04_x64_py313 }} + with: + runner: 'aks-linux-4-cores-16gb' + container: '{"image": "${{ matrix.image }}", "volumes": ["/mount:/mount"]}' + affected-components: ${{ needs.smart_ci.outputs.affected_components }} + python-version: ${{ matrix.python-version }} + Overall_Status: name: ci/gha_overall_status_manylinux2014 - needs: [Smart_CI, Build] + needs: [Smart_CI, Build, Python_API_Tests] if: ${{ always() }} runs-on: ubuntu-latest steps: diff --git a/.github/workflows/ovc.yml b/.github/workflows/ovc.yml index 4d69563a741d3a..3e7dedf50ad51b 100644 --- a/.github/workflows/ovc.yml +++ b/.github/workflows/ovc.yml @@ -20,6 +20,7 @@ jobs: steps: - name: Clone OpenVINO uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + timeout-minutes: 15 - name: Setup Python uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0 diff --git a/.github/workflows/py_checks.yml b/.github/workflows/py_checks.yml index caed37eee89056..dcf0932df8024e 100644 --- a/.github/workflows/py_checks.yml +++ b/.github/workflows/py_checks.yml @@ -29,6 +29,7 @@ jobs: steps: - name: Clone OpenVINO uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + timeout-minutes: 15 - name: Setup Python uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0 diff --git a/.github/workflows/ubuntu_20.yml b/.github/workflows/ubuntu_20.yml index 63a1fab87d566f..19760ff2551773 100644 --- a/.github/workflows/ubuntu_20.yml +++ b/.github/workflows/ubuntu_20.yml @@ -31,6 +31,7 @@ jobs: steps: - name: checkout action uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + timeout-minutes: 15 with: sparse-checkout: .github/actions/smart-ci @@ -65,6 +66,7 @@ jobs: steps: - name: Checkout uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + timeout-minutes: 15 - uses: ./.github/actions/handle_docker id: handle_docker @@ -98,6 +100,7 @@ jobs: -DENABLE_SYSTEM_OPENCL=ON \ -DCMAKE_VERBOSE_MAKEFILE=ON \ -DCPACK_GENERATOR=TGZ \ + -DENABLE_WHEEL=ON \ -DCMAKE_COMPILE_WARNING_AS_ERROR=ON CXX_Unit_Tests: diff --git a/.github/workflows/ubuntu_22.yml b/.github/workflows/ubuntu_22.yml index f4caec8b2458a0..5aed74bbb242b8 100644 --- a/.github/workflows/ubuntu_22.yml +++ b/.github/workflows/ubuntu_22.yml @@ -33,6 +33,7 @@ jobs: steps: - name: checkout action uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + timeout-minutes: 15 with: sparse-checkout: .github/actions/smart-ci @@ -67,6 +68,7 @@ jobs: steps: - name: Checkout uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + timeout-minutes: 15 - uses: 
./.github/actions/handle_docker id: handle_docker @@ -101,6 +103,7 @@ jobs: -DENABLE_SYSTEM_OPENCL=ON \ -DCMAKE_VERBOSE_MAKEFILE=ON \ -DCPACK_GENERATOR=TGZ \ + -DENABLE_WHEEL=ON \ -DCMAKE_COMPILE_WARNING_AS_ERROR=ON Debian_Packages: @@ -184,6 +187,7 @@ jobs: - name: Fetch setup_python action uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + timeout-minutes: 15 with: sparse-checkout: | .github/actions/setup_python/action.yml @@ -300,6 +304,16 @@ jobs: affected-components: ${{ needs.smart_ci.outputs.affected_components }} python-version: '3.11' + Python_API_Tests: + name: Python API tests + needs: [ Docker, Build, Smart_CI ] + uses: ./.github/workflows/job_python_api_tests.yml + with: + runner: 'aks-linux-4-cores-16gb' + container: '{"image": "${{ fromJSON(needs.docker.outputs.images).ov_test.ubuntu_22_04_x64 }}", "volumes": ["/mount:/mount"]}' + affected-components: ${{ needs.smart_ci.outputs.affected_components }} + python-version: '3.11' + TensorFlow_Layer_Tests: name: TensorFlow Layer Tests needs: [ Docker, Build, Smart_CI, Openvino_tokenizers ] @@ -460,6 +474,7 @@ jobs: - name: Clone OpenVINO Contrib uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + timeout-minutes: 15 with: repository: 'openvinotoolkit/openvino_contrib' path: ${{ env.OPENVINO_CONTRIB_REPO }} diff --git a/.github/workflows/ubuntu_22_dpcpp.yml b/.github/workflows/ubuntu_22_dpcpp.yml index 48230155f7e903..ad11a31f7403bf 100644 --- a/.github/workflows/ubuntu_22_dpcpp.yml +++ b/.github/workflows/ubuntu_22_dpcpp.yml @@ -21,6 +21,7 @@ jobs: steps: - name: checkout action uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + timeout-minutes: 15 with: sparse-checkout: .github/actions/smart-ci @@ -55,6 +56,7 @@ jobs: steps: - name: Checkout uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + timeout-minutes: 15 - uses: ./.github/actions/handle_docker id: handle_docker diff --git a/.github/workflows/ubuntu_24.yml b/.github/workflows/ubuntu_24.yml index d874e06a189232..25be095e692d35 100644 --- a/.github/workflows/ubuntu_24.yml +++ b/.github/workflows/ubuntu_24.yml @@ -28,6 +28,7 @@ jobs: steps: - name: checkout action uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + timeout-minutes: 15 with: sparse-checkout: .github/actions/smart-ci @@ -62,6 +63,7 @@ jobs: steps: - name: Checkout uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + timeout-minutes: 15 - uses: ./.github/actions/handle_docker id: handle_docker @@ -134,6 +136,16 @@ jobs: affected-components: ${{ needs.smart_ci.outputs.affected_components }} python-version: '3.12' + Python_API_Tests: + name: Python API tests + needs: [ Docker, Build, Smart_CI ] + uses: ./.github/workflows/job_python_api_tests.yml + with: + runner: 'aks-linux-4-cores-16gb' + container: '{"image": "${{ fromJSON(needs.docker.outputs.images).ov_test.ubuntu_24_04_x64 }}", "volumes": ["/mount:/mount"]}' + affected-components: ${{ needs.smart_ci.outputs.affected_components }} + python-version: '3.12' + Pytorch_Layer_Tests: name: Pytorch Layer Tests needs: [ Docker, Build, Smart_CI ] diff --git a/.github/workflows/webassembly.yml b/.github/workflows/webassembly.yml index 45d6c9ce98317a..350df3113b0f3a 100644 --- a/.github/workflows/webassembly.yml +++ b/.github/workflows/webassembly.yml @@ -25,6 +25,7 @@ jobs: steps: - name: checkout action uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + timeout-minutes: 15 with: sparse-checkout: 
.github/actions/smart-ci @@ -59,6 +60,7 @@ jobs: steps: - name: Checkout uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + timeout-minutes: 15 - uses: ./.github/actions/handle_docker id: handle_docker @@ -92,6 +94,7 @@ jobs: steps: - name: Clone OpenVINO uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + timeout-minutes: 15 with: path: 'openvino' submodules: 'true' diff --git a/.github/workflows/windows_conditional_compilation.yml b/.github/workflows/windows_conditional_compilation.yml index 6a5846b514dbd7..0f965eabd3c1ad 100644 --- a/.github/workflows/windows_conditional_compilation.yml +++ b/.github/workflows/windows_conditional_compilation.yml @@ -31,6 +31,7 @@ jobs: steps: - name: checkout action uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + timeout-minutes: 15 with: sparse-checkout: .github/actions/smart-ci @@ -74,12 +75,14 @@ jobs: steps: - name: Clone OpenVINO uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + timeout-minutes: 15 with: path: 'openvino' submodules: 'true' - name: Clone test models uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + timeout-minutes: 15 with: repository: 'openvinotoolkit/testdata' path: 'testdata' @@ -237,7 +240,7 @@ jobs: Compress-Archive @compress $compress = @{ - Path = "${{ env.OPENVINO_REPO }}/bin/intel64/${{ env.CMAKE_BUILD_TYPE }}/ov_cpu_func_tests.exe", "${{ env.BUILD_DIR }}/bin/${{ env.CMAKE_BUILD_TYPE }}/ze_loader.dll", "${{ env.OPENVINO_REPO }}/bin/intel64/${{ env.CMAKE_BUILD_TYPE }}/openvino_template_extension.dll", "${{ env.OPENVINO_REPO }}/src/tests/test_utils/functional_test_utils/layer_tests_summary", "${{ env.INSTALL_DIR }}/runtime/3rdparty/tbb" + Path = "${{ env.OPENVINO_REPO }}/bin/intel64/${{ env.CMAKE_BUILD_TYPE }}/ov_cpu_func_tests.exe", "${{ env.OPENVINO_REPO }}/bin/intel64/${{ env.CMAKE_BUILD_TYPE }}/openvino_template_extension.dll", "${{ env.OPENVINO_REPO }}/src/tests/test_utils/functional_test_utils/layer_tests_summary", "${{ env.INSTALL_DIR }}/runtime/3rdparty/tbb" CompressionLevel = "Optimal" DestinationPath = "${{ env.BUILD_DIR }}/openvino_tests.zip" } @@ -283,12 +286,14 @@ jobs: steps: - name: Clone OpenVINO uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + timeout-minutes: 15 with: path: 'openvino' submodules: 'true' - name: Clone test models uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + timeout-minutes: 15 with: repository: 'openvinotoolkit/testdata' path: 'testdata' @@ -370,6 +375,7 @@ jobs: - name: Fetch setup_python action uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + timeout-minutes: 15 with: sparse-checkout: | .github/actions/setup_python/action.yml diff --git a/.github/workflows/windows_vs2019_debug.yml b/.github/workflows/windows_vs2019_debug.yml index 68a99055f5bdb8..4fcdc6b58b79d1 100644 --- a/.github/workflows/windows_vs2019_debug.yml +++ b/.github/workflows/windows_vs2019_debug.yml @@ -27,6 +27,7 @@ jobs: steps: - name: checkout action uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + timeout-minutes: 15 with: sparse-checkout: .github/actions/smart-ci diff --git a/.github/workflows/windows_vs2019_release.yml b/.github/workflows/windows_vs2019_release.yml index c42475fea9cd64..f1fd0be596baa2 100644 --- a/.github/workflows/windows_vs2019_release.yml +++ b/.github/workflows/windows_vs2019_release.yml @@ -29,6 +29,7 @@ jobs: steps: - name: checkout action uses: 
actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + timeout-minutes: 15 with: sparse-checkout: .github/actions/smart-ci @@ -63,6 +64,7 @@ jobs: cmake-options: |- -G "Ninja Multi-Config" ` -DENABLE_PYTHON=ON ` + -DENABLE_WHEEL=ON ` -DENABLE_CPPLINT=OFF ` -DBUILD_SHARED_LIBS=ON ` -DENABLE_TESTS=ON ` @@ -111,6 +113,7 @@ jobs: - name: Fetch setup_python and install wheels actions uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + timeout-minutes: 15 with: sparse-checkout: | .github/actions/setup_python/action.yml @@ -184,6 +187,7 @@ jobs: steps: - name: Fetch OpenVINO JS sources uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + timeout-minutes: 15 with: sparse-checkout: | src/bindings/js @@ -282,6 +286,7 @@ jobs: - name: Fetch setup_python and install wheels actions uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + timeout-minutes: 15 with: sparse-checkout: | .github/actions/setup_python/action.yml @@ -424,6 +429,7 @@ jobs: - name: Fetch setup_python and install wheels actions uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + timeout-minutes: 15 with: sparse-checkout: | .github/actions/setup_python/action.yml @@ -543,6 +549,7 @@ jobs: - name: Fetch setup_python action uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + timeout-minutes: 15 with: sparse-checkout: | .github/actions/setup_python/action.yml diff --git a/.github/workflows/workflow_rerunner.yml b/.github/workflows/workflow_rerunner.yml index 55ecc2500635b1..0d8d6610bea588 100644 --- a/.github/workflows/workflow_rerunner.yml +++ b/.github/workflows/workflow_rerunner.yml @@ -38,6 +38,7 @@ jobs: steps: - name: Checkout uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + timeout-minutes: 15 with: sparse-checkout: '.github/scripts/workflow_rerun' @@ -73,6 +74,7 @@ jobs: steps: - name: Checkout uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + timeout-minutes: 15 with: sparse-checkout: '.github/scripts/workflow_rerun' lfs: true diff --git a/.github/workflows/workflows_scans.yml b/.github/workflows/workflows_scans.yml new file mode 100644 index 00000000000000..1a3d091544e784 --- /dev/null +++ b/.github/workflows/workflows_scans.yml @@ -0,0 +1,39 @@ +name: GitHub Actions Workflows Scans +on: + workflow_dispatch: {} + push: + paths: + - '.github/workflows/**' + branches: + - 'master' + - 'releases/**' + pull_request: + paths: + - '.github/workflows/**' + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +permissions: read-all + +jobs: + semgrep: + name: github_actions_workflows_scan/semgrep + runs-on: ubuntu-latest + if: ${{ github.repository_owner == 'openvinotoolkit' }} + + container: + image: semgrep/semgrep + + steps: + - name: Checkout + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + timeout-minutes: 15 + with: + submodules: 'false' + sparse-checkout: .github/workflows + + - name: Semgrep scan + run: | + semgrep scan --error -j 8 --config "p/github-actions" .github/workflows/* diff --git a/.github/workflows/workflows_to_track.txt b/.github/workflows/workflows_to_track.txt new file mode 100644 index 00000000000000..ef3bb633ed7737 --- /dev/null +++ b/.github/workflows/workflows_to_track.txt @@ -0,0 +1,75 @@ +==> ./stale_prs_and_issues.yml <== +name: 'Close stale issues and PRs' +==> ./build_doc.yml <== +name: Documentation +==> ./ovc.yml <== +name: OVC +==> ./ubuntu_22.yml <== +name: 
Linux (Ubuntu 22.04, Python 3.11) +==> ./windows_conditional_compilation.yml <== +name: Windows Conditional Compilation (VS 2022, Python 3.11) +==> ./send_workflows_to_opentelemetry.yml <== +name: Export workflow metrics (BETA) +==> ./ubuntu_22_dpcpp.yml <== +name: Linux (Ubuntu 22.04, Python 3.11, Intel DPC++ Compiler) +==> ./coverage.yml <== +name: Code coverage +==> ./linux_conditional_compilation.yml <== +name: Linux Static CC (Ubuntu 22.04, Python 3.11, Clang) +==> ./workflows_scans.yml <== +name: GitHub Actions Workflows Scans +==> ./check_pr_commits.yml <== +name: PR Commits +==> ./windows_vs2019_debug.yml <== +name: Windows (VS 2019, Python 3.11, Debug) +==> ./files_size.yml <== +name: Files Size +==> ./cleanup_caches.yml <== +name: Cleanup caches +==> ./mac.yml <== +name: macOS (Python 3.11) +==> ./merge_queue_stub.yml <== +==> ./debian_10_arm.yml <== +name: Debian 10 ARM +==> ./android_arm64.yml <== +name: Android ARM64 with vcpkg +==> ./code_style.yml <== +name: Code Style +==> ./manylinux_2014.yml <== +name: Manylinux 2014 +==> ./linux_arm64.yml <== +name: Linux ARM64 (Ubuntu 20.04, Python 3.11) +==> ./dev_cpu_linux_snippets_libxsmm.yml <== +name: Linux CPU Plugin Snippets with LIBXSMM (Ubuntu 20.04) +==> ./labeler.yml <== +name: "Pull Request Labeler" +==> ./mac_arm64.yml <== +name: macOS ARM64 (Python 3.11) +==> ./dependency_review.yml <== +name: 'Dependency Review' +==> ./fedora_29.yml <== +name: Fedora 29 (RHEL 8.4), Python 3.9 +==> ./code_snippets.yml <== +name: Code snippets +==> ./ubuntu_20.yml <== +name: Linux (Ubuntu 20.04, Python 3.9) +==> ./linux_riscv.yml <== +name: Linux RISC-V with Conan (Ubuntu 22.04, Python 3.10) +==> ./android_x64.yml <== +name: Android x64 +==> ./workflow_rerunner.yml <== +name: Rerun Workflow with Known Errors +==> ./linux_sanitizers.yml <== +name: Linux Sanitizers (Ubuntu 20.04, Python 3.9) +==> ./py_checks.yml <== +name: Python API Checks +==> ./webassembly.yml <== +name: Webassembly +==> ./ubuntu_24.yml <== +name: Linux (Ubuntu 24.04, Python 3.12) +==> ./assign_issue.yml <== +name: Take Issue +==> ./windows_vs2019_release.yml <== +name: Windows (VS 2019, Python 3.11, Release) +==> ./coverity.yml <== +name: Coverity (Ubuntu 20.04, Python 3.11) diff --git a/cmake/developer_package/compile_flags/os_flags.cmake b/cmake/developer_package/compile_flags/os_flags.cmake index fdfd7211c8e815..660fd6160893ae 100644 --- a/cmake/developer_package/compile_flags/os_flags.cmake +++ b/cmake/developer_package/compile_flags/os_flags.cmake @@ -4,6 +4,7 @@ include(ProcessorCount) include(CheckCXXCompilerFlag) +include(CheckCXXSourceCompiles) # # ov_disable_deprecated_warnings() @@ -91,6 +92,50 @@ macro(ov_dev_package_no_errors) set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${ov_c_cxx_dev_no_errors}") endmacro() +# +# ov_check_compiler_supports_sve(flags) +# +# Checks whether CXX compiler for passed language supports SVE code compilation +# +macro(ov_check_compiler_supports_sve flags) + # Code to compile + set(SVE_CODE " + #include + int main() { + svfloat64_t a; + a = svdup_n_f64(0); + return 0; + }") + + # Save the current state of required flags + set(CMAKE_REQUIRED_FLAGS_SAVE ${CMAKE_REQUIRED_FLAGS}) + + # Set the flags necessary for compiling the test code with SVE support + set(CMAKE_REQUIRED_FLAGS "${CMAKE_CXX_FLAGS_INIT} ${flags}") + + # Check if the source code compiles with the given flags for C++ + CHECK_CXX_SOURCE_COMPILES("${SVE_CODE}" CXX_HAS_SVE) + + # If the compilation test is successful, set appropriate variables indicating support + if(CXX_HAS_SVE) + 
set(CXX_SVE_FOUND TRUE CACHE BOOL "SVE available on host") + set(CXX_SVE_FOUND TRUE CACHE BOOL "CXX SVE support") + set(CXX_SVE_FLAGS "${flags}" CACHE STRING "CXX SVE flags") + endif() + + # Restore the original state of required flags + set(CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS_SAVE}) + + # If the compilation test fails, indicate that the support is not found + if(NOT CXX_SVE_FOUND) + set(CXX_SVE_FOUND FALSE CACHE BOOL "CXX SVE support") + set(CXX_SVE_FLAGS "" CACHE STRING "CXX SVE flags") + endif() + + # Mark the variables as advanced to hide them in the default CMake GUI + mark_as_advanced(CXX_SVE_FOUND CXX_SVE_FLAGS) +endmacro() + # # ov_sse42_optimization_flags() # @@ -208,6 +253,49 @@ macro(ov_arm_neon_fp16_optimization_flags flags) endif() endmacro() +# +# ov_arm_sve_optimization_flags() +# +macro(ov_arm_sve_optimization_flags flags) + # Check for compiler SVE support + ov_check_compiler_supports_sve("-march=armv8-a+sve") + + if(OV_COMPILER_IS_INTEL_LLVM) + message(WARNING "Unsupported CXX compiler ${CMAKE_CXX_COMPILER_ID}") + elseif(CMAKE_CXX_COMPILER_ID STREQUAL "MSVC") + # nothing should be required here + elseif(ANDROID) + if(ANDROID_ABI STREQUAL "arm64-v8a") + set(${flags} -Wno-unused-command-line-argument) + if(CXX_SVE_FOUND) + list(APPEND ${flags} -march=armv8-a+sve) + else() + message(WARNING "SVE is not supported on this Android ABI: ${ANDROID_ABI}") + endif() + else() + message(WARNING "SVE is not supported on this Android ABI: ${ANDROID_ABI}") + endif() + else() + if(AARCH64) + set(${flags} -O2) + + # Add flag for SVE if supported + if(CXX_SVE_FOUND) + list(APPEND ${flags} -march=armv8-a+sve) + endif() + if(NOT CMAKE_CL_64) + list(APPEND ${flags} -ftree-vectorize) + endif() + + set(${flags} ${${flags}}) + elseif(ARM) + message(WARNING "SVE is not supported on 32-bit ARM architectures.") + else() + message(WARNING "SVE is not supported by architecture ${CMAKE_SYSTEM_PROCESSOR}") + endif() + endif() +endmacro() + # # ov_disable_all_warnings() # diff --git a/cmake/developer_package/cross_compile/cross_compiled_disp_gen.cmake b/cmake/developer_package/cross_compile/cross_compiled_disp_gen.cmake index c33d64635eb10b..fd534f3e600bfe 100644 --- a/cmake/developer_package/cross_compile/cross_compiled_disp_gen.cmake +++ b/cmake/developer_package/cross_compile/cross_compiled_disp_gen.cmake @@ -18,6 +18,7 @@ set(_CPU_CHECK_ANY "true") set(_CPU_CHECK_SSE42 "with_cpu_x86_sse42()") set(_CPU_CHECK_AVX "with_cpu_x86_avx()") set(_CPU_CHECK_NEON_FP16 "with_cpu_neon_fp16()") +set(_CPU_CHECK_SVE "with_cpu_sve()") set(_CPU_CHECK_AVX2 "with_cpu_x86_avx2()") set(_CPU_CHECK_AVX512F "with_cpu_x86_avx512f()") diff --git a/cmake/developer_package/cross_compile/cross_compiled_func.cmake b/cmake/developer_package/cross_compile/cross_compiled_func.cmake index 1e92fe3bfdaf8c..962aa5d373a4db 100644 --- a/cmake/developer_package/cross_compile/cross_compiled_func.cmake +++ b/cmake/developer_package/cross_compile/cross_compiled_func.cmake @@ -3,7 +3,7 @@ # ## list of available instruction sets -set(_ARCH_LIST ANY SSE42 AVX AVX2 AVX512F NEON_FP16) +set(_ARCH_LIST ANY SSE42 AVX AVX2 AVX512F NEON_FP16 SVE) set(_ACCEPTED_ARCHS_ANY "^(ANY)$") set(_ACCEPTED_ARCHS_SSE42 "^(ANY|SSE42)$") @@ -11,6 +11,7 @@ set(_ACCEPTED_ARCHS_AVX "^(ANY|SSE42|AVX)$") set(_ACCEPTED_ARCHS_AVX2 "^(ANY|SSE42|AVX|AVX2)$") set(_ACCEPTED_ARCHS_AVX512F "^(ANY|SSE42|AVX|AVX2|AVX512F)$") set(_ACCEPTED_ARCHS_NEON_FP16 "^(ANY|NEON_FP16)$") +set(_ACCEPTED_ARCHS_SVE "^(ANY|SVE)$") ## Arch specific definitions set(_DEFINE_ANY "") @@ -19,12 
+20,14 @@ set(_DEFINE_AVX "HAVE_AVX" ${_DEFINE_SSE42}) set(_DEFINE_AVX2 "HAVE_AVX2" ${_DEFINE_AVX}) set(_DEFINE_AVX512F "HAVE_AVX512F" ${_DEFINE_AVX2}) set(_DEFINE_NEON_FP16 "HAVE_NEON_FP16" ${_DEFINE_ANY}) +set(_DEFINE_SVE "HAVE_SVE" ${_DEFINE_SVE}) ## Arch specific compile options ov_avx512_optimization_flags(_FLAGS_AVX512F) ov_avx2_optimization_flags (_FLAGS_AVX2) ov_sse42_optimization_flags (_FLAGS_SSE42) ov_arm_neon_fp16_optimization_flags(_FLAGS_NEON_FP16) +ov_arm_sve_optimization_flags(_FLAGS_SVE) set(_FLAGS_AVX "") ## TBD is not defined for OV project yet set(_FLAGS_ANY "") ## @@ -185,6 +188,8 @@ endfunction() function(_currently_requested_top_arch VAR) if(ENABLE_NEON_FP16) set(RES NEON_FP16) + elseif(ENABLE_SVE) + set(RES SVE) elseif(ENABLE_AVX512F) set(RES AVX512F) elseif(ENABLE_AVX2) diff --git a/cmake/developer_package/features.cmake b/cmake/developer_package/features.cmake index 8d1f3696c6759c..ae5313cea8a8b4 100644 --- a/cmake/developer_package/features.cmake +++ b/cmake/developer_package/features.cmake @@ -51,6 +51,8 @@ ov_dependent_option (ENABLE_AVX512F "Enable AVX512 optimizations" ON "X86_64 OR ov_dependent_option(ENABLE_NEON_FP16 "Enable ARM FP16 optimizations" ON "AARCH64" OFF) +ov_dependent_option(ENABLE_SVE "Enable SVE optimizations" ON "AARCH64" OFF) + # Type of build, we add this as an explicit option to default it to ON get_property(BUILD_SHARED_LIBS_DEFAULT GLOBAL PROPERTY TARGET_SUPPORTS_SHARED_LIBS) ov_option (BUILD_SHARED_LIBS "Build as a shared library" ${BUILD_SHARED_LIBS_DEFAULT}) diff --git a/cmake/packaging/debian.cmake b/cmake/packaging/debian.cmake index 59b312963c180d..c82dca0364b463 100644 --- a/cmake/packaging/debian.cmake +++ b/cmake/packaging/debian.cmake @@ -99,6 +99,7 @@ macro(ov_cpack_settings) 2024.3.0 2024.4.0 2024.5.0 + 2024.6.0 ) ov_check_conflicts_versions(conflicting_versions) diff --git a/cmake/packaging/rpm.cmake b/cmake/packaging/rpm.cmake index a4a63c35858bf9..6e9d535d41cfff 100644 --- a/cmake/packaging/rpm.cmake +++ b/cmake/packaging/rpm.cmake @@ -87,6 +87,7 @@ macro(ov_cpack_settings) 2024.3.0 2024.4.0 2024.5.0 + 2024.6.0 ) ov_check_conflicts_versions(conflicting_versions) diff --git a/docs/RELEASE.MD b/docs/RELEASE.MD new file mode 100644 index 00000000000000..b345431f3f2bcf --- /dev/null +++ b/docs/RELEASE.MD @@ -0,0 +1,29 @@ +# OpenVINO Release Management +The process described below reflects the approach to managing OpenVINO releases. + +## Release Milestones +- Planning +- Execution (development of new features) +- Stabilization (Feature Freeze, Code Freeze milestones) +- Validation +- Distribution + +### Planning +This phase takes 2-4 weeks and involves scoping the backlog, prioritizing it, analyzing, and making commitments by developers for timelines specified by the release manager. + +### Execution (development of new features) +- [OpenVINO Contributing Guide](https://github.com/openvinotoolkit/openvino/blob/master/CONTRIBUTING.md) +- [Code Contribution Guide](https://docs.openvino.ai/2024/about-openvino/contributing/code-contribution-guide.html) +- [OpenVINO First Good Issue](https://github.com/openvinotoolkit/openvino/issues/17502) + +### Stabilization (Feature Freeze, Code Freeze milestones) +- **Feature Freeze**: This milestone ensures that no new features are added to the software after a certain point. This allows the development team to focus on stabilizing and refining the existing features, fixing bugs, and improving performance without the risk of introducing new issues. 
+- **Code Freeze**: This milestone marks the point where no new code changes are allowed except for critical bug fixes. This helps in ensuring that the final product is stable and reliable, as it minimizes the risk of last-minute changes that could introduce new bugs or instability. + +### Release Validation +- This is a continuous process executed on a regular basis with cadence based on testing type: nightly, bi-weekly, weekly. +- After Code Freeze, the testing team can perform final regression testing to ensure that recent changes have not introduced new bugs and that the software meets the required quality standards. + +### Distribution +- OpenVINO has different types of build distribution: Regular releases, Long-Term Support, Pre-release releases, Nightly builds. Read more here: [OpenVINO Release Policy](https://docs.openvino.ai/2024/about-openvino/release-notes-openvino/release-policy.html) +- Different distribution channels are supported. Explore different options here: [OpenVINO Download](https://www.intel.com/content/www/us/en/developer/tools/openvino-toolkit/download.html) diff --git a/docs/articles_en/about-openvino/additional-resources/glossary.rst b/docs/articles_en/about-openvino/additional-resources/glossary.rst index 9aba2b395525c2..6120b0c9018a54 100644 --- a/docs/articles_en/about-openvino/additional-resources/glossary.rst +++ b/docs/articles_en/about-openvino/additional-resources/glossary.rst @@ -38,7 +38,6 @@ Acronyms and Abbreviations LRN Local Response Normalization mAP Mean Average Precision Intel® OneDNN Intel® OneAPI Deep Neural Network Library - `mo` Command-line tool for model conversion, CLI for ``tools.mo.convert_model`` (legacy) MVN Mean Variance Normalization NCDHW Number of images, Channels, Depth, Height, Width NCHW Number of images, Channels, Height, Width diff --git a/docs/articles_en/about-openvino/compatibility-and-support/supported-devices.rst b/docs/articles_en/about-openvino/compatibility-and-support/supported-devices.rst index c80dc388568004..3bb46116ee1748 100644 --- a/docs/articles_en/about-openvino/compatibility-and-support/supported-devices.rst +++ b/docs/articles_en/about-openvino/compatibility-and-support/supported-devices.rst @@ -31,11 +31,6 @@ OpenVINO offers the option of running automated inference with the following inf | :doc:`Automatic Batching <../../openvino-workflow/running-inference/inference-devices-and-modes/automatic-batching>`: | automatically groups inference requests to improve device utilization. -| :doc:`(LEGACY) Multi-device Inference <./../../documentation/legacy-features/multi-device>`: -| executes inference on multiple devices. Currently, this mode is considered a legacy - solution. Using Automatic Device Selection instead is advised. 
- - Feature Support and API Coverage ################################# @@ -52,7 +47,6 @@ Feature Support and API Coverage :doc:`Preprocessing acceleration <../../openvino-workflow/running-inference/optimize-inference/optimize-preprocessing>` Yes Yes No :doc:`Stateful models <../../openvino-workflow/running-inference/stateful-models>` Yes Yes Yes :doc:`Extensibility <../../documentation/openvino-extensibility>` Yes Yes No - :doc:`(LEGACY) Multi-device execution <./../../documentation/legacy-features/multi-device>` Yes Yes Partial ======================================================================================================================================== ======= ========== =========== @@ -83,7 +77,7 @@ For setting up a relevant configuration, refer to the :doc:`Integrate with Customer Application <../../openvino-workflow/running-inference/integrate-openvino-with-your-application>` topic (step 3 "Configure input and output"). -.. dropdown:: Device support across OpenVINO 2024.5 distributions +.. dropdown:: Device support across OpenVINO 2024.6 distributions =============== ========== ====== =============== ======== ============ ========== ========== ========== Device Archives PyPI APT/YUM/ZYPPER Conda Homebrew vcpkg Conan npm diff --git a/docs/articles_en/about-openvino/performance-benchmarks.rst b/docs/articles_en/about-openvino/performance-benchmarks.rst index 8a58dc27df1f83..a408253038e75c 100644 --- a/docs/articles_en/about-openvino/performance-benchmarks.rst +++ b/docs/articles_en/about-openvino/performance-benchmarks.rst @@ -56,7 +56,8 @@ implemented in your solutions. Click the buttons below to see the chosen benchma :material-regular:`table_view;1.4em` LLM performance for AI PC - .. grid-item:: +.. uncomment under + .. .. grid-item:: .. button-link:: # :class: ovms-toolkit-benchmark-llm-result @@ -64,7 +65,7 @@ implemented in your solutions. Click the buttons below to see the chosen benchma :outline: :expand: - :material-regular:`bar_chart;1.4em` OVMS for GenAI (coming soon) + :material-regular:`bar_chart;1.4em` OVMS for GenAI @@ -132,21 +133,21 @@ For a listing of all platforms and configurations used for testing, refer to the .. grid-item:: - .. button-link:: ../_static/benchmarks_files/OV-2024.5-platform_list.pdf + .. button-link:: ../_static/benchmarks_files/OV-2024.6-platform_list.pdf :color: primary :outline: :expand: :material-regular:`download;1.5em` Click for Hardware Platforms [PDF] - .. button-link:: ../_static/benchmarks_files/OV-2024.5-system-info-detailed.xlsx + .. button-link:: ../_static/benchmarks_files/OV-2024.6-system-info-detailed.xlsx :color: primary :outline: :expand: :material-regular:`download;1.5em` Click for Configuration Details [XLSX] - .. button-link:: ../_static/benchmarks_files/OV-2024.5-Performance-Data.xlsx + .. button-link:: ../_static/benchmarks_files/OV-2024.6-Performance-Data.xlsx :color: primary :outline: :expand: @@ -160,10 +161,10 @@ For a listing of all platforms and configurations used for testing, refer to the **Disclaimers** * Intel® Distribution of OpenVINO™ toolkit performance results are based on release - 2024.5, as of November 20, 2024. + 2024.6, as of December 18, 2024. * OpenVINO Model Server performance results are based on release - 2024.4, as of Sept. 30, 2024. + 2024.5, as of November 20, 2024. The results may not reflect all publicly available updates. 
Intel technologies' features and benefits depend on system configuration and may require enabled hardware, software, or service diff --git a/docs/articles_en/about-openvino/performance-benchmarks/generative-ai-performance.rst b/docs/articles_en/about-openvino/performance-benchmarks/generative-ai-performance.rst index 5697fcbf6e4d74..085a1ff8449151 100644 --- a/docs/articles_en/about-openvino/performance-benchmarks/generative-ai-performance.rst +++ b/docs/articles_en/about-openvino/performance-benchmarks/generative-ai-performance.rst @@ -3,12 +3,11 @@ Most Efficient Large Language Models for AI PC This page is regularly updated to help you identify the best-performing LLMs on the Intel® Core™ Ultra processor family and AI PCs. -The current data is as of OpenVINO 2024.4, 24 Oct. 2024 +The current data is as of OpenVINO 2024.4, 20 Nov. 2024. The tables below list the key performance indicators for inference on built-in GPUs. - .. raw:: html @@ -18,27 +17,30 @@ The tables below list the key performance indicators for inference on built-in G .. tab-item:: 9-288V - .. csv-table:: + .. data-table:: :class: modeldata stripe - :name: supportedModelsTableOv + :name: supportedModelsTable_V1 :header-rows: 1 :file: ../../_static/benchmarks_files/llm_models_9-288V.csv + :hidden: [3,4,6] .. tab-item:: 7-268V - .. csv-table:: + .. data-table:: :class: modeldata stripe - :name: supportedModelsTableOv + :name: supportedModelsTable_V2 :header-rows: 1 :file: ../../_static/benchmarks_files/llm_models_7-258V.csv + :hidden: [3,4,6] .. tab-item:: 7-155H - .. csv-table:: + .. data-table:: :class: modeldata stripe - :name: supportedModelsTableOv + :name: supportedModelsTable_V3 :header-rows: 1 :file: ../../_static/benchmarks_files/llm_models_7-155H.csv + :hidden: [3,4,6] .. grid:: 1 1 2 2 diff --git a/docs/articles_en/about-openvino/performance-benchmarks/getting-performance-numbers.rst b/docs/articles_en/about-openvino/performance-benchmarks/getting-performance-numbers.rst index 936f1145a6b3b0..9ba82690b00395 100644 --- a/docs/articles_en/about-openvino/performance-benchmarks/getting-performance-numbers.rst +++ b/docs/articles_en/about-openvino/performance-benchmarks/getting-performance-numbers.rst @@ -103,7 +103,7 @@ General considerations Some image pre-processing can be baked into OpenVINO IR and accelerated accordingly. For more information, refer to - :doc:`Embedding Pre-processing <../../documentation/legacy-features/transition-legacy-conversion-api/legacy-conversion-api/[legacy]-embedding-preprocessing-computation>` + :doc:`Preprocessing API <../../openvino-workflow/running-inference/optimize-inference/optimize-preprocessing/preprocessing-api-details>`. and :doc:`General Runtime Optimizations <../../openvino-workflow/running-inference/optimize-inference/general-optimizations>`. @@ -192,7 +192,7 @@ execution breakdown. For example, the table below is part of performance counters for :doc:`CPU inference <../../openvino-workflow/running-inference/inference-devices-and-modes/cpu-device>`. -of a `TensorFlow implementation of ResNet-50 `__ +of a TensorFlow implementation of ResNet-50. Keep in mind that since the device is CPU, the ``realTime`` wall clock and the ``cpu`` time layers are the same. Information about layer precision is also stored in the performance counters. 
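For illustration only (this code is not part of the change above): per-layer counters like the ones described here can be retrieved through the OpenVINO Python API once profiling is enabled; the model path and zero-filled input below are placeholders.

.. code-block:: python

   import numpy as np
   import openvino as ov

   core = ov.Core()
   # ov::enable_profiling, exposed as the PERF_COUNT property
   compiled = core.compile_model("model.xml", "CPU", {"PERF_COUNT": "YES"})

   request = compiled.create_infer_request()
   data = np.zeros(compiled.input(0).shape, dtype=np.float32)  # assumes one static input
   request.infer([data])

   for pc in request.profiling_info:
       # exec_type reflects the selected kernel and therefore the layer precision
       print(pc.node_name, pc.node_type, pc.exec_type, pc.status, pc.real_time, pc.cpu_time)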
diff --git a/docs/articles_en/about-openvino/performance-benchmarks/model-accuracy-int8-fp32.rst b/docs/articles_en/about-openvino/performance-benchmarks/model-accuracy-int8-fp32.rst index e87733a1445356..78c947fb64cb07 100644 --- a/docs/articles_en/about-openvino/performance-benchmarks/model-accuracy-int8-fp32.rst +++ b/docs/articles_en/about-openvino/performance-benchmarks/model-accuracy-int8-fp32.rst @@ -1,10 +1,12 @@ Model Accuracy ============== -between OV-accuracy and the original framework accuracy for FP32, and the same for INT8, BF16, -and FP16 representations of a model on three platform architectures. The third table presents -the GenAI model accuracies as absolute accuracy values. Refer to notes below the table for more -information. + + +The following two tables present the absolute accuracy drop calculated as the accuracy difference +between OV-accuracy and the original frame work accuracy for FP32, and the same for INT8, BF16 and +FP16 representations of a model on three platform architectures. The third table presents the GenAI model accuracies as absolute accuracy values. Please also refer to notes below +the table for more information. * A - Intel® Core™ i9-9000K (AVX2), INT8 and FP32 * B - Intel® Xeon® 6338, (VNNI), INT8 and FP32 @@ -32,31 +34,31 @@ information. * - efficientdet-d0 - COCO2017_detection_91cl - coco_precision - - -0.84% - - -0.59% + - - - -0.59% + - - -0.55% * - mask_rcnn_resnet50_atrous_coco - COCO2017_detection_91cl_bkgr - coco_orig_precision - -0.10% - -0.04% - - 0.07% + - - -0.01% * - mobilenet-v2 - ImageNet2012 - accuracy @ top1 - - + - - -0.97% - -0.98% - -0.95% * - resnet-50 - ImageNet2012 - accuracy @ top1 - - 0.74% - - 0.76% - - 0.74% - - 0.82% + - + - 0.97% + - 0.94% + - 0.95% * - ssd-resnet34-1200 - COCO2017_detection_80cl_bkgr - map @@ -67,18 +69,17 @@ information. * - ssd-mobilenet-v1-coco - COCO2017_detection_80cl_bkgr - coco-precision - - -2.94% - - -0.28% + - - -0.28% + - - -0.26% * - yolo_v8n - COCO2017_detection_80cl - map - - -0.01% - - -0.04% - - -0.07% - - 0.05% - + - -0.11% + - -0.05% + - + - .. list-table:: Model Accuracy for BF16, FP32 and FP16 (FP16: Flex-170 only. BF16: Xeon(R) 8480+ only) :header-rows: 1 @@ -101,15 +102,15 @@ information. * - efficientdet-d0 - COCO2017_detection_91cl - coco_precision - - 0.01% - - 0.01% + - - 0.01% - 0.00% + - 0.01% - 0.00% * - mask_rcnn_resnet50_atrous_coco - COCO2017_detection_91cl_bkgr - coco_orig_precision - - -0.01% + - - -0.01% - -0.01% - 0.05% @@ -135,8 +136,8 @@ information. - map - 0.02% - 0.02% + - 0.01% - 0.02% - - -0.01% - 0.02% * - ssd-mobilenet-v1-coco - COCO2017_detection_80cl_bkgr @@ -154,10 +155,9 @@ information. - 0.01% - 0.05% - 0.00% - .. list-table:: Model Accuracy for VNNI-FP16, VNNI-INT4, AMX-FP16 and MTL-INT4 (Core Ultra iGPU) :header-rows: 1 - + * - OpenVINO™ Model name - dataset - Metric Name @@ -168,59 +168,59 @@ information. * - chatGLM4 - Wikiset - ppl - - - - - - - - + - + - + - + - * - Gemma-2-9B - Wikitext - ppl - - + - - 1.57 - 1.57 - - + - * - Llama-2-7b-chat - Wikiset - ppl - - - - + - + - 1.59 - 1.59 - - + - * - Llama-3-8b - Wikiset - ppl - 1.45 - 1.48 - 1.45 - - + - * - Llama-3.2-3b-instruct - Wikiset - ppl - 1.60 - 1.62 - - 1.17 - - + - 1.62 + - * - Mistral-7b - Wikitext - ppl - 1.48 - 1.49 - 1.48 - - + - * - Phi3-mini-4k-instruct - Wikitext - ppl - - 1.52 - 1.55 - - 1.52 - - 1.56 + - 1.55 + - 1.55 + - * - Qwen-2-7B - Wikitext - ppl - 1.52 - 1.53 - 1.52 - - 1.56 + - Notes: For all accuracy metrics a "-", (minus sign), indicates an accuracy drop. 
For perplexity (ppl) the values do not indicate a deviation from a reference but are the actual measured diff --git a/docs/articles_en/about-openvino/performance-benchmarks/performance-benchmarks-faq.rst b/docs/articles_en/about-openvino/performance-benchmarks/performance-benchmarks-faq.rst index 0f70c93e9c8b96..5495711bc0054a 100644 --- a/docs/articles_en/about-openvino/performance-benchmarks/performance-benchmarks-faq.rst +++ b/docs/articles_en/about-openvino/performance-benchmarks/performance-benchmarks-faq.rst @@ -15,13 +15,7 @@ Performance Information F.A.Q. .. dropdown:: Where can I find the models used in the performance benchmarks? - All models used are included in the GitHub repository of - :doc:`Open Model Zoo <../../documentation/legacy-features/model-zoo>`. - - .. important:: - - Due to the deprecation of Open Model Zoo, models in the OpenVINO IR format are now - published on `Hugging Face `__. + All models used are published on `Hugging Face `__. .. dropdown:: Will there be any new models added to the list used for benchmarking? @@ -35,7 +29,7 @@ Performance Information F.A.Q. open-source tool within the Intel® Distribution of OpenVINO™ toolkit called :doc:`benchmark_app <../../learn-openvino/openvino-samples/benchmark-tool>`. - For diffusers (Stable-Diffusion) and foundational models (aka LLMs) please use the OpenVINO GenAI + For diffusers (Stable-Diffusion) and foundational models (aka LLMs) please use the OpenVINO GenAI opensource repo `OpenVINO GenAI tools/llm_bench `__ For a simple instruction on testing performance, see the :doc:`Getting Performance Numbers Guide `. @@ -93,30 +87,6 @@ Performance Information F.A.Q. - BERT - question / answer - 128 - * - `efficientdet-d0 `__ - - Efficientdet - - classification - - 512x512 - * - `mask_rcnn_resnet50_atrous_coco `__ - - Mask R-CNN ResNet 50 Atrous - - object instance segmentation - - 800x1365 - * - `mobilenet-v2 `__ - - Mobilenet V2 PyTorch - - classification - - 224x224 - * - `resnet-50 `__ - - ResNet-50_v1_ILSVRC-2012 - - classification - - 224x224 - * - `ssd-mobilenet-v1-coco `__ - - ssd-mobilenet-V1-coco onnx model - - object detection - - 300x300 - * - `ssd-resnet34-1200-onnx `__ - - ssd-resnet34 onnx model - - object detection - - 1200x1200 * - `yolov8n `__ - Yolov8nano - object detection diff --git a/docs/articles_en/about-openvino/release-notes-openvino.rst b/docs/articles_en/about-openvino/release-notes-openvino.rst index 9e7673d7d0910d..de233e6fa7cc9d 100644 --- a/docs/articles_en/about-openvino/release-notes-openvino.rst +++ b/docs/articles_en/about-openvino/release-notes-openvino.rst @@ -16,359 +16,408 @@ OpenVINO Release Notes -2024.5 - 20 November 2024 +2024.6 - 18 December 2024 ############################# :doc:`System Requirements <./release-notes-openvino/system-requirements>` | :doc:`Release policy <./release-notes-openvino/release-policy>` | :doc:`Installation Guides <./../get-started/install-openvino>` - - What's new +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ -* More GenAI coverage and framework integrations to minimize code changes. - - * New models supported: Llama 3.2 (1B & 3B), Gemma 2 (2B & 9B), and YOLO11. - * LLM support on NPU: Llama 3 8B, Llama 2 7B, Mistral-v0.2-7B, Qwen2-7B-Instruct and Phi-3 - Mini-Instruct. - * Noteworthy notebooks added: Sam2, Llama3.2, Llama3.2 - Vision, Wav2Lip, Whisper, and Llava. - * Preview: support for Flax, a high-performance Python neural network library based on JAX. 
- Its modular design allows for easy customization and accelerated inference on GPUs. - -* Broader Large Language Model (LLM) support and more model compression techniques. - - * Optimizations for built-in GPUs on Intel® Core™ Ultra Processors (Series 1) and Intel® Arc™ - Graphics include KV Cache compression for memory reduction along with improved usability, - and model load time optimizations to improve first token latency for LLMs. - * Dynamic quantization was enabled to improve first token latency for LLMs on built-in - Intel® GPUs without impacting accuracy on Intel® Core™ Ultra Processors (Series 1). Second - token latency will also improve for large batch inference. - * A new method to generate synthetic text data is implemented in the Neural Network - Compression Framework (NNCF). This will allow LLMs to be compressed more accurately using - data-aware methods without datasets. Coming soon: This feature will soon be accessible via - Optimum Intel on Hugging Face. - -* More portability and performance to run AI at the edge, in the cloud, or locally. - - * Support for - `Intel® Xeon® 6 Processors with P-cores `__ - (formerly codenamed Granite Rapids) and - `Intel® Core™ Ultra 200V series processors `__ - (formerly codenamed Arrow Lake-S). - * Preview: GenAI API enables multimodal AI deployment with support for multimodal pipelines - for improved contextual awareness, transcription pipelines for easy audio-to-text - conversions, and image generation pipelines for streamlined text-to-visual conversions. - * Speculative decoding feature added to the GenAI API for improved performance and efficient - text generation using a small draft model that is periodically corrected by the full-size - model. - * Preview: LoRA adapters are now supported in the GenAI API for developers to quickly and - efficiently customize image and text generation models for specialized tasks. - * The GenAI API now also supports LLMs on NPU allowing developers to specify NPU as the - target device, specifically for WhisperPipeline (for whisper-base, whisper-medium, and - whisper-small) and LLMPipeline (for Llama 3 8B, Llama 2 7B, Mistral-v0.2-7B, - Qwen2-7B-Instruct and Phi-3 Mini-instruct). Use driver version 32.0.100.3104 or later for - best performance. - -Now deprecated ------------------------------------------------------------------------------------------------ +* OpenVINO 2024.6 release includes updates for enhanced stability and improved LLM performance. +* Introduced support for Intel® Arc™ B-Series Graphics (formerly known as Battlemage). +* Implemented optimizations to improve the inference time and LLM performance on NPUs. +* Improved LLM performance with GenAI API optimizations and bug fixes. -* Python 3.8 is no longer supported: OpenVINO™ Runtime +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ -Common ------------------------------------------------------------------------------------------------ - -* Numpy 2.x has been adopted for all currently supported components, including NNCF. -* A new constant constructor has been added, enabling constants to be created from data pointer - as shared memory. Additionally, it can take ownership of a shared, or other, object, avoiding - a two-step process to wrap memory into ``ov::Tensor``. -* Asynchronous file reading with mmap library has been implemented, reducing loading times for - model files, especially for LLMs. 
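The constant constructor mentioned in the runtime notes above is aimed at wrapping existing buffers without copying; a minimal sketch with a NumPy array is shown below (the new Tensor-based overload added in this release is analogous, its exact signature is not reproduced here).

.. code-block:: python

   import numpy as np
   from openvino.runtime import op

   weights = np.random.rand(1024, 1024).astype(np.float32)

   # shared_memory=True wraps the existing buffer instead of copying it,
   # so the NumPy array must stay alive for as long as the constant is used.
   constant = op.Constant(weights, shared_memory=True)
   print(constant.get_output_element_type(0), constant.get_output_shape(0))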
-* CPU implementation of SliceScatter operator is now available, used for models such as Gemma, - supporting increased LLM performance. - - CPU Device Plugin ----------------------------------------------------------------------------------------------- -* Gold support of the Intel® Xeon® 6 platform with P-cores (formerly code name Granite Rapids) - has been reached. -* Support of Intel® Core™ Ultra 200V series processors (formerly codenamed Arrow Lake-S) has - been implemented. -* LLM performance has been further improved with Rotary Position Embedding optimization; Query, - Key, and Value; and multi-layer perceptron fusion optimization. -* FP16 support has been extended with SDPA and PagedAttention, improving performance of LLM via - both native APIs and the vLLM integration. -* Models with LoRA adapters are now supported. - +* KV cache now uses asymmetric 8-bit unsigned integer (U8) as the default precision, reducing + memory stress for LLMs and increasing their performance. This option can be controlled by + model meta data. +* Quality and accuracy has been improved for selected models with several bug fixes. GPU Device Plugin ----------------------------------------------------------------------------------------------- -* The KV cache INT8 compression mechanism is now available for all supported GPUs. It enables a - significant reduction in memory consumption, increasing performance with a minimal impact to - accuracy (it affects systolic devices slightly more than non-systolic ones). The feature is - activated by default for non-systolic devices. -* LoRA adapters are now functionally supported on GPU. -* A new feature of GPU weightless blob caching enables caching model structure only and reusing - the weights from the original model file. Use the new OPTIMIZE_SIZE property to activate. -* Dynamic quantization with INT4 and INT8 precisions has been implemented and enabled by - default on Intel® Core™ Ultra platforms, improving LLM first token latency. - +* Device memory copy optimizations have been introduced for inference with **Intel® Arc™ B-Series + Graphics** (formerly known as Battlemage). Since it does not utilize L2 cache for copying memory + between the device and host, a dedicated `copy` operation is used, if inputs or results are + not expected in the device memory. +* ChatGLM4 inference on GPU has been optimized. NPU Device Plugin ----------------------------------------------------------------------------------------------- -* Models retrieved from the OpenVINO cache have a smaller memory footprint now. The plugin - releases the cached model (blob) after weights are loaded in NPU regions. Model export is not - available in this scenario. Memory consumption is reduced during inference execution with one - blob size. This optimization requires the latest NPU driver: 32.0.100.3104. -* A driver bug for ``ov::intel_npu::device_total_mem_size`` has been fixed. The plugin will now - report 2GB as the maximum allocatable memory for any driver that does not support graph - extension 1.8. Even if older drivers report a larger amount of memory to be available, memory - allocation would fail when 2GB are exceeded. Plugin reports the number that driver exposes - for any driver that supports graph extension 1.8 (or newer). -* A new API is used to initialize the model (available in graph extension 1.8). -* Inference request set_tensors is now supported. -* ``ov::device::LUID`` is now exposed on Windows. 
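The GPU notes above mention weightless blob caching activated through the new OPTIMIZE_SIZE setting; a hedged sketch of how model caching is typically switched on from Python follows (the CACHE_MODE property name and value casing are assumptions based on the note, and the cache directory and model path are placeholders).

.. code-block:: python

   import openvino as ov

   core = ov.Core()
   core.set_property({"CACHE_DIR": "ov_cache"})  # enable compiled-blob caching

   # Assumption: weightless caching is requested via the cache-mode property,
   # so only the model structure is cached and weights are reused from the file.
   compiled = core.compile_model("model.xml", "GPU", {"CACHE_MODE": "OPTIMIZE_SIZE"})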
-* LLM-related improvements have been implemented in terms of both memory usage and performance. -* AvgPool and MaxPool operator support has been extended, adding support for more PyTorch models. - -* NOTE: for systems based on Intel® Core™ Ultra Processors Series 2, more than 16GB of RAM may - be required to use larger models, such as Llama-2-7B, Mistral-0.2-7B, and Qwen-2-7B - (exceeding 4B parameters) with prompt sizes over 1024 tokens. - - -OpenVINO Python API ------------------------------------------------------------------------------------------------ +* LLM performance and inference time has been improved with memory optimizations. -* Constant now can be created from openvino.Tensor. -* The “release_memory” method has been added for a compiled model, improving control over - memory consumption. -OpenVINO Node.js API ------------------------------------------------------------------------------------------------ -* Querying the best device to perform inference of a model with specific operations - is now available in JavaScript API. -* Contribution guidelines have been improved to make it easier for developers to contribute. -* Testing scope has been extended by inference in end-to-end tests. -* JavaScript API samples have been improved for readability and ease of running. +OpenVINO.GenAI ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ +* The encrypted_model_causal_lm sample is now available, showing how to decrypt a model. -TensorFlow Framework Support ------------------------------------------------------------------------------------------------ -* TensorFlow 2.18.0, Keras 3.6.0, NumPy 2.0.2 in Python 3.12, and NumPy 1.26.4 in other Python - versions have been added to validation. -* Out-of-the-box conversion with static ranks has been improved by devising a new shape for - Switch-Merge condition sub-graphs. -* Complex type for the following operations is now supported: ExpandDims, Pack, Prod, Rsqrt, - ScatterNd, Sub. -* The following issues have been fixed: - * the corner case with one element in LinSpace to avoid division by zero, - * support FP16 and FP64 input types for LeakyRelu, - * support non-i32/i64 output index type for ArgMin/Max operations. +Other Changes and Known Issues ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ +Jupyter Notebooks +----------------------------- +* `Visual-language assistant with GLM-Edge-V and OpenVINO `__ +* `Local AI and OpenVINO `__ +* `Multimodal understanding and generation with Janus and OpenVINO `__ -PyTorch Framework Support ------------------------------------------------------------------------------------------------ -* PyTorch version 2.5 is now supported. -* OpenVINO Model Converter (OVC) now supports TorchScript and ExportedProgram saved on a drive. -* The issue of aten.index.Tensor conversion for indices with “None” values has been fixed, - helping to support the HF Stable Diffusion model in ExportedProgram format. -ONNX Framework Support ------------------------------------------------------------------------------------------------ -* ONNX version 1.17.0 is now used. -* Customers' models with DequantizeLinear-21, com.microsoft.MatMulNBits, and - com.microsoft.QuickGelu operations are now supported. -JAX/Flax Framework Support ------------------------------------------------------------------------------------------------ -* JAX 0.4.35 and Flax 0.10.0 has been added to validation. 
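The Python API notes above introduce ``release_memory`` on a compiled model; a minimal sketch of how it might be used between bursts of inference is given below (model path and dummy input are placeholders).

.. code-block:: python

   import numpy as np
   import openvino as ov

   core = ov.Core()
   compiled = core.compile_model("model.xml", "CPU")

   data = np.zeros(compiled.input(0).shape, dtype=np.float32)  # assumes one static input
   print(compiled([data]))

   # Ask the runtime to drop internal buffers it can recreate later,
   # keeping the compiled model usable for subsequent requests.
   compiled.release_memory()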
-* jax._src.core.ClosedJaxpr object conversion is now supported. -* Vision Transformer from google-research/vision_transformer is now supported - (with support for 37 new operations). -OpenVINO Model Server -+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ -* The OpenAI API text embedding endpoint has been added, enabling OVMS to be used as a building - block for AI applications like RAG. - `(read more) `__ -* The rerank endpoint has been added based on Cohere API, enabling easy similarity detection - between a query and a set of documents. It is one of the building blocks for AI applications - like RAG and makes integration with frameworks such as langchain easy. - `(read more) `__ -* The following improvements have been done to LLM text generation: - - * The ``echo`` sampling parameter together with ``logprobs`` in the ``completions`` endpoint - is now supported. - * Performance has been increased on both CPU and GPU. - * Throughput in high-concurrency scenarios has been increased with dynamic_split_fuse for GPU. - * Testing coverage and stability has been improved. - * The procedure for service deployment and model repository preparation has been simplified. - -* An experimental version of a Windows binary package - native model server for Windows OS - is - available. This release includes a set of limitations and has limited tests coverage. It is - intended for testing, while the production-ready release is expected with 2025.0. All feedback - is welcome. - - -Neural Network Compression Framework -+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ -* A new nncf.data.generate_text_data() method has been added for generating a synthetic dataset - for LLM compression. This approach helps to compress LLMs more accurately in situations when - the dataset is not available or not sufficient. - `See our example `__ - for more information about the usage. -* Support of data-free and data-aware weight compression methods - nncf.compress_weights() - - has been extended with NF4 per-channel quantization, making compressed LLMs more accurate and - faster on NPU. -* Caching of computed statistics in nncf.compress_weights() is now available, significantly - reducing compression time when performing compression of the same LLM multiple times, with - different compression parameters. To enable it, set the advanced ``statistics_path`` parameter - of nncf.compress_weights() to the desired file path location. -* The ``backup_mode`` optional parameter has been added to nncf.compress_weights(), for - specifying the data type for embeddings, convolutions, and last linear layers during 4-bit - weight compression. Available options are INT8_ASYM (default), INT8_SYM, and NONE (retains - the original floating-point precision of the model weights). In certain situations, - non-default value might give better accuracy of compressed LLMs. -* Preview support is now available for optimizing models in Torch - `FX format `__, nncf.quantize(), and - nncf.compress_weights() methods. After optimization such models can be directly executed - via torch.compile(compressed_model, backend="openvino"). For more details, see - `INT8 quantization example `__. -* Memory consumption of data-aware weight compression methods - nncf.compress_weights() – has - been reduced significantly, with some variation depending on the model and method. 
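A sketch of 4-bit weight compression using the ``backup_mode`` parameter described above; the model path and the particular compression settings are illustrative, not recommendations.

.. code-block:: python

   import nncf
   import openvino as ov

   core = ov.Core()
   model = core.read_model("llm/openvino_model.xml")

   # Data-free weight compression; backup_mode selects the precision for embeddings,
   # convolutions and the last linear layers (INT8_ASYM is the default).
   compressed = nncf.compress_weights(
       model,
       mode=nncf.CompressWeightsMode.INT4_ASYM,
       ratio=0.8,
       group_size=128,
       backup_mode=nncf.BackupMode.INT8_SYM,
   )
   ov.save_model(compressed, "llm/compressed_model.xml", compress_to_fp16=False)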
-* Support for the following has changed: - - * NumPy 2 added - * PyTorch upgraded to 2.5.1 - * ONNX upgraded to 1.17 - * Python 3.8 discontinued - - - -OpenVINO Tokenizers +Previous 2024 releases +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ -* Several operations have been introduced and optimized. -* Conversion parameters and environment info have been added to ``rt_info``, improving - reproducibility and debugging. +.. ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ +.. ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ +.. dropdown:: 2024.5 - 20 November 2024 + :animate: fade-in-slide-down + :color: secondary -OpenVINO.GenAI -+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ + **What's new** -* The following has been added: + * More GenAI coverage and framework integrations to minimize code changes. - * LoRA adapter for the LLMPipeline. - * Text2ImagePipeline with LoRA adapter and text2image samples. - * VLMPipeline and visual_language_chat sample for text generation models with text and image - inputs. - * WhisperPipeline and whisper_speech_recognition sample. + * New models supported: Llama 3.2 (1B & 3B), Gemma 2 (2B & 9B), and YOLO11. + * LLM support on NPU: Llama 3 8B, Llama 2 7B, Mistral-v0.2-7B, Qwen2-7B-Instruct and Phi-3 + Mini-Instruct. + * Noteworthy notebooks added: Sam2, Llama3.2, Llama3.2 - Vision, Wav2Lip, Whisper, and Llava. + * Preview: support for Flax, a high-performance Python neural network library based on JAX. + Its modular design allows for easy customization and accelerated inference on GPUs. -* speculative_decoding_lm has been moved to LLMPipeline based implementation and is now - installed as part of the package. -* On NPU, a set of pipelines has been enabled: WhisperPipeline (for whisper-base, - whisper-medium, and whisper-small), LLMPipeline (for Llama 3 8B, Llama 2 7B, Mistral-v0.2-7B, - Qwen2-7B-Instruct, and Phi-3 Mini-instruct). Use driver version 32.0.100.3104 or later for - best performance. + * Broader Large Language Model (LLM) support and more model compression techniques. + * Optimizations for built-in GPUs on Intel® Core™ Ultra Processors (Series 1) and Intel® Arc™ + Graphics include KV Cache compression for memory reduction along with improved usability, + and model load time optimizations to improve first token latency for LLMs. + * Dynamic quantization was enabled to improve first token latency for LLMs on built-in + Intel® GPUs without impacting accuracy on Intel® Core™ Ultra Processors (Series 1). Second + token latency will also improve for large batch inference. + * A new method to generate synthetic text data is implemented in the Neural Network + Compression Framework (NNCF). This will allow LLMs to be compressed more accurately using + data-aware methods without datasets. Coming soon: This feature will soon be accessible via + Optimum Intel on Hugging Face. + * More portability and performance to run AI at the edge, in the cloud, or locally. + * Support for + `Intel® Xeon® 6 Processors with P-cores `__ + (formerly codenamed Granite Rapids) and + `Intel® Core™ Ultra 200V series processors `__ + (formerly codenamed Arrow Lake-S). 
+ * Preview: GenAI API enables multimodal AI deployment with support for multimodal pipelines + for improved contextual awareness, transcription pipelines for easy audio-to-text + conversions, and image generation pipelines for streamlined text-to-visual conversions. + * Speculative decoding feature added to the GenAI API for improved performance and efficient + text generation using a small draft model that is periodically corrected by the full-size + model. + * Preview: LoRA adapters are now supported in the GenAI API for developers to quickly and + efficiently customize image and text generation models for specialized tasks. + * The GenAI API now also supports LLMs on NPU allowing developers to specify NPU as the + target device, specifically for WhisperPipeline (for whisper-base, whisper-medium, and + whisper-small) and LLMPipeline (for Llama 3 8B, Llama 2 7B, Mistral-v0.2-7B, + Qwen2-7B-Instruct and Phi-3 Mini-instruct). Use driver version 32.0.100.3104 or later for + best performance. + *Now deprecated* -Other Changes and Known Issues -+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ + * Python 3.8 is no longer supported: -Jupyter Notebooks ------------------------------ -* `Text-to-Image generation using OpenVINO GenAI `__ -* `Multi LoRA Image Generation `__ -* `Virtual Try-on using OpenVINO and CatVTON `__ -* `Visual Language Assistant using OpenVINO GenAI `__ -* `Speech recognition using OpenVINO GenAI `__ -* `YoloV11 `__ -* `Llama-3.2-vision `__ -* `Pixtral `__ -* `Segment Anything 2 `__ -* `Video Lips-sync using Wav2Lip `__ -* `Convert JAX to OpenVINO tutorial `__ + **OpenVINO™ Runtime** + *Common* -Known Issues ------------------------------ + * Numpy 2.x has been adopted for all currently supported components, including NNCF. + * A new constant constructor has been added, enabling constants to be created from data pointer + as shared memory. Additionally, it can take ownership of a shared, or other, object, avoiding + a two-step process to wrap memory into ``ov::Tensor``. + * Asynchronous file reading with mmap library has been implemented, reducing loading times for + model files, especially for LLMs. + * CPU implementation of SliceScatter operator is now available, used for models such as Gemma, + supporting increased LLM performance. -| **Component: CPU Plugin** -| ID: 155898 -| Description: -| Description: When using new version of Transformer version to convert some of LLMs - (GPT-J/GPT-NeoX or falcon-7b), the inference accuracy may be impacted on 4th or 5th - generation of Intel® Xeon® processors, due to model structure update triggering inference - precision difference in part of the model. The workaround is to use transformer version of - 4.44.2 or lower. -| **Component: GPU Plugin** -| ID: 154583 -| Description: -| LLM accuracy can be low especially on non-systolic platforms like Intel® Core™ Ultra. When - facing the low accuracy issue, user needs to manually set a config ACTIVATION_SCALING_FACOTR - with a value of 8.0 in the compile_model() function. From the next release, scaling factor - value will be automatically applied through updated IR. 
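To make the new constant-construction path described in the *Common* items above concrete, here is a minimal Python sketch. It assumes the 2024.5-level ``openvino`` package; the array shape and variable names are illustrative only, and the exact import location of ``Constant`` may differ between package versions.

.. code-block:: py

   import numpy as np
   import openvino as ov
   from openvino.runtime.op import Constant

   weights = np.ones((2, 2), dtype=np.float32)      # placeholder data
   tensor = ov.Tensor(weights, shared_memory=True)  # wraps the NumPy buffer without copying
   constant = Constant(tensor)                      # constant built directly from the tensor

The point of the sketch is the single step: the constant is created straight from a tensor that wraps existing memory, rather than copying the data first and wrapping it afterwards.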
+ *CPU Device Plugin* -| **Component: GenAI** -| ID: 156437, 148933 -| Description: -| When using Python GenAI APIs, if ONNX 17.0 and later is installed, it may encounter the - error “DLL load failed while importing onnx_cpp2py_export: A dynamic link library (DLL) - initialization routine failed.” It is due to the ONNX dependency issue - `onnx/onnx#6267 `__, - Install - `Microsoft Visual C++ Redistributable `__ - latest supported downloads to fix the issue. + * Gold support of the Intel® Xeon® 6 platform with P-cores (formerly code name Granite Rapids) + has been reached. + * Support of Intel® Core™ Ultra 200V series processors (formerly codenamed Arrow Lake-S) has + been implemented. + * LLM performance has been further improved with Rotary Position Embedding optimization; Query, + Key, and Value; and multi-layer perceptron fusion optimization. + * FP16 support has been extended with SDPA and PagedAttention, improving performance of LLM via + both native APIs and the vLLM integration. + * Models with LoRA adapters are now supported. -| **Component: GenAI** -| ID: 156944 -| Description: -| There were backward incompatible changes resulting in different text generated by LLMs like - Mistralai/Mistral-7B-Instruct-v0.2 and TinyLlama/TinyLlama-1.1B-Chat-v1.0 when using a - tokenizer converted by older openvino_tolenizers. A way to resolve the issue is to convert - tokenizer and detokenizer models using the latest openvino_tokenizers. + *GPU Device Plugin* + * The KV cache INT8 compression mechanism is now available for all supported GPUs. It enables a + significant reduction in memory consumption, increasing performance with a minimal impact to + accuracy (it affects systolic devices slightly more than non-systolic ones). The feature is + activated by default for non-systolic devices. + * LoRA adapters are now functionally supported on GPU. + * A new feature of GPU weightless blob caching enables caching model structure only and reusing + the weights from the original model file. Use the new OPTIMIZE_SIZE property to activate. + * Dynamic quantization with INT4 and INT8 precisions has been implemented and enabled by + default on Intel® Core™ Ultra platforms, improving LLM first token latency. + *NPU Device Plugin* + + * Models retrieved from the OpenVINO cache have a smaller memory footprint now. The plugin + releases the cached model (blob) after weights are loaded in NPU regions. Model export is not + available in this scenario. Memory consumption is reduced during inference execution with one + blob size. This optimization requires the latest NPU driver: 32.0.100.3104. + * A driver bug for ``ov::intel_npu::device_total_mem_size`` has been fixed. The plugin will now + report 2GB as the maximum allocatable memory for any driver that does not support graph + extension 1.8. Even if older drivers report a larger amount of memory to be available, memory + allocation would fail when 2GB are exceeded. Plugin reports the number that driver exposes + for any driver that supports graph extension 1.8 (or newer). + * A new API is used to initialize the model (available in graph extension 1.8). + * Inference request set_tensors is now supported. + * ``ov::device::LUID`` is now exposed on Windows. + * LLM-related improvements have been implemented in terms of both memory usage and performance. + * AvgPool and MaxPool operator support has been extended, adding support for more PyTorch models. 
+ + * NOTE: for systems based on Intel® Core™ Ultra Processors Series 2, more than 16GB of RAM may + be required to use larger models, such as Llama-2-7B, Mistral-0.2-7B, and Qwen-2-7B + (exceeding 4B parameters) with prompt sizes over 1024 tokens. + + + *OpenVINO Python API* + + * Constant now can be created from openvino.Tensor. + * The “release_memory” method has been added for a compiled model, improving control over + memory consumption. + + + + *OpenVINO Node.js API* + + * Querying the best device to perform inference of a model with specific operations + is now available in JavaScript API. + * Contribution guidelines have been improved to make it easier for developers to contribute. + * Testing scope has been extended by inference in end-to-end tests. + * JavaScript API samples have been improved for readability and ease of running. + + + + *TensorFlow Framework Support* + + * TensorFlow 2.18.0, Keras 3.6.0, NumPy 2.0.2 in Python 3.12, and NumPy 1.26.4 in other Python + versions have been added to validation. + * Out-of-the-box conversion with static ranks has been improved by devising a new shape for + Switch-Merge condition sub-graphs. + * Complex type for the following operations is now supported: ExpandDims, Pack, Prod, Rsqrt, + ScatterNd, Sub. + * The following issues have been fixed: + + * the corner case with one element in LinSpace to avoid division by zero, + * support FP16 and FP64 input types for LeakyRelu, + * support non-i32/i64 output index type for ArgMin/Max operations. + + + + *PyTorch Framework Support* + + * PyTorch version 2.5 is now supported. + * OpenVINO Model Converter (OVC) now supports TorchScript and ExportedProgram saved on a drive. + * The issue of aten.index.Tensor conversion for indices with “None” values has been fixed, + helping to support the HF Stable Diffusion model in ExportedProgram format. + + + + *ONNX Framework Support* + + * ONNX version 1.17.0 is now used. + * Customers' models with DequantizeLinear-21, com.microsoft.MatMulNBits, and + com.microsoft.QuickGelu operations are now supported. + + *JAX/Flax Framework Support* + + * JAX 0.4.35 and Flax 0.10.0 has been added to validation. + * jax._src.core.ClosedJaxpr object conversion is now supported. + * Vision Transformer from google-research/vision_transformer is now supported + (with support for 37 new operations). + + + **OpenVINO Model Server** + + * The OpenAI API text embedding endpoint has been added, enabling OVMS to be used as a building + block for AI applications like RAG. + `(read more) `__ + * The rerank endpoint has been added based on Cohere API, enabling easy similarity detection + between a query and a set of documents. It is one of the building blocks for AI applications + like RAG and makes integration with frameworks such as langchain easy. + `(read more) `__ + * The following improvements have been done to LLM text generation: + + * The ``echo`` sampling parameter together with ``logprobs`` in the ``completions`` endpoint + is now supported. + * Performance has been increased on both CPU and GPU. + * Throughput in high-concurrency scenarios has been increased with dynamic_split_fuse for GPU. + * Testing coverage and stability has been improved. + * The procedure for service deployment and model repository preparation has been simplified. + + * An experimental version of a Windows binary package - native model server for Windows OS - is + available. This release includes a set of limitations and has limited tests coverage. 
It is + intended for testing, while the production-ready release is expected with 2025.0. All feedback + is welcome. + + + **Neural Network Compression Framework** + + * A new nncf.data.generate_text_data() method has been added for generating a synthetic dataset + for LLM compression. This approach helps to compress LLMs more accurately in situations when + the dataset is not available or not sufficient. + `See our example `__ + for more information about the usage. + * Support of data-free and data-aware weight compression methods - nncf.compress_weights() - + has been extended with NF4 per-channel quantization, making compressed LLMs more accurate and + faster on NPU. + * Caching of computed statistics in nncf.compress_weights() is now available, significantly + reducing compression time when performing compression of the same LLM multiple times, with + different compression parameters. To enable it, set the advanced ``statistics_path`` parameter + of nncf.compress_weights() to the desired file path location. + * The ``backup_mode`` optional parameter has been added to nncf.compress_weights(), for + specifying the data type for embeddings, convolutions, and last linear layers during 4-bit + weight compression. Available options are INT8_ASYM (default), INT8_SYM, and NONE (retains + the original floating-point precision of the model weights). In certain situations, + non-default value might give better accuracy of compressed LLMs. + * Preview support is now available for optimizing models in Torch + `FX format `__, nncf.quantize(), and + nncf.compress_weights() methods. After optimization such models can be directly executed + via torch.compile(compressed_model, backend="openvino"). For more details, see + `INT8 quantization example `__. + * Memory consumption of data-aware weight compression methods - nncf.compress_weights() – has + been reduced significantly, with some variation depending on the model and method. + * Support for the following has changed: + + * NumPy 2 added + * PyTorch upgraded to 2.5.1 + * ONNX upgraded to 1.17 + * Python 3.8 discontinued + + + + **OpenVINO Tokenizers** + + * Several operations have been introduced and optimized. + * Conversion parameters and environment info have been added to ``rt_info``, improving + reproducibility and debugging. + + + + **OpenVINO.GenAI** + + * The following has been added: + + * LoRA adapter for the LLMPipeline. + * Text2ImagePipeline with LoRA adapter and text2image samples. + * VLMPipeline and visual_language_chat sample for text generation models with text and image + inputs. + * WhisperPipeline and whisper_speech_recognition sample. + + * speculative_decoding_lm has been moved to LLMPipeline based implementation and is now + installed as part of the package. + * On NPU, a set of pipelines has been enabled: WhisperPipeline (for whisper-base, + whisper-medium, and whisper-small), LLMPipeline (for Llama 3 8B, Llama 2 7B, Mistral-v0.2-7B, + Qwen2-7B-Instruct, and Phi-3 Mini-instruct). Use driver version 32.0.100.3104 or later for + best performance. 
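As a rough sketch of how the weight-compression options listed above fit together (not taken from the release itself): it assumes an OpenVINO IR LLM on disk and an NNCF version at the 2024.5-era API level. The import path of ``AdvancedCompressionParameters`` (and the top-level ``nncf.BackupMode``) as well as the file names are assumptions, and a ``dataset`` argument would be added for data-aware compression.

.. code-block:: py

   import nncf
   import openvino as ov
   # Import path is an assumption; the class may live elsewhere in your NNCF version.
   from nncf.quantization.advanced_parameters import AdvancedCompressionParameters

   core = ov.Core()
   model = core.read_model("llm/openvino_model.xml")   # hypothetical IR path

   compressed = nncf.compress_weights(
       model,
       mode=nncf.CompressWeightsMode.INT4_SYM,
       backup_mode=nncf.BackupMode.INT8_ASYM,           # precision for embeddings, convolutions, last linear layers
       advanced_parameters=AdvancedCompressionParameters(
           statistics_path="statistics_cache"           # reuse cached statistics across repeated compression runs
       ),
   )
   ov.save_model(compressed, "llm/openvino_model_int4.xml")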
+
+
+
+
+
+ **Other Changes and Known Issues**
+
+ *Jupyter Notebooks*
+
+ * `Text-to-Image generation using OpenVINO GenAI `__
+ * `Multi LoRA Image Generation `__
+ * `Virtual Try-on using OpenVINO and CatVTON `__
+ * `Visual Language Assistant using OpenVINO GenAI `__
+ * `Speech recognition using OpenVINO GenAI `__
+ * `YoloV11 `__
+ * `Llama-3.2-vision `__
+ * `Pixtral `__
+ * `Segment Anything 2 `__
+ * `Video Lips-sync using Wav2Lip `__
+ * `Convert JAX to OpenVINO tutorial `__
+
+
+ *Known Issues*
+
+ | **Component: CPU Plugin**
+ | ID: 155898
+ | Description:
+ | When using a new version of the Transformers library to convert some LLMs
+ (GPT-J/GPT-NeoX or falcon-7b), the inference accuracy may be impacted on 4th or 5th
+ generation of Intel® Xeon® processors, due to a model structure update triggering an
+ inference precision difference in part of the model. The workaround is to use Transformers
+ version 4.44.2 or lower.
+
+ | **Component: GPU Plugin**
+ | ID: 154583
+ | Description:
+ | LLM accuracy can be low, especially on non-systolic platforms like Intel® Core™ Ultra. When
+ facing the low accuracy issue, users need to manually set the ACTIVATION_SCALING_FACTOR
+ configuration option to a value of 8.0 in the compile_model() function. From the next
+ release, the scaling factor will be applied automatically through an updated IR.
+
+ | **Component: GenAI**
+ | ID: 156437, 148933
+ | Description:
+ | When using Python GenAI APIs, if ONNX 1.17.0 or later is installed, you may encounter the
+ error “DLL load failed while importing onnx_cpp2py_export: A dynamic link library (DLL)
+ initialization routine failed.” It is caused by the ONNX dependency issue
+ `onnx/onnx#6267 `__.
+ Install the latest supported
+ `Microsoft Visual C++ Redistributable `__
+ to fix the issue.
+
+ | **Component: GenAI**
+ | ID: 156944
+ | Description:
+ | There were backward incompatible changes resulting in different text generated by LLMs like
+ Mistralai/Mistral-7B-Instruct-v0.2 and TinyLlama/TinyLlama-1.1B-Chat-v1.0 when using a
+ tokenizer converted by an older version of openvino_tokenizers. To resolve the issue, convert
+ the tokenizer and detokenizer models using the latest openvino_tokenizers.
-Previous 2024 releases
-+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
-.. ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
-.. ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
@@ -472,20 +521,20 @@ Previous 2024 releases

 *NPU Device Plugin*

-* `Remote Tensor API `__
-  is now supported.
-* You can now query the available number of tiles (ov::intel_npu::max_tiles) and force a
-  specific number of tiles to be used by the model, per inference request
-  (ov::intel_npu::tiles). **Note:** ov::intel_npu::tiles overrides the default number of tiles
-  selected by the compiler based on performance hints (ov::hint::performance_mode). Any tile
-  number other than 1 may be a problem for cross platform compatibility, if not tested
-  explicitly versus the max_tiles value.
-* You can now bypass the model caching mechanism in the driver
-  (ov::intel_npu::bypass_umd_caching). Read more about driver and OpenVINO caching.
-* Memory footprint at model execution has been reduced by one blob (compiled model) size.
-  For execution, the plugin no longer retrieves the compiled model from the driver, it uses the
-  level zero graph handle directly, instead.
The compiled model is now retrieved from the driver - only during the export method. + * `Remote Tensor API `__ + is now supported. + * You can now query the available number of tiles (ov::intel_npu::max_tiles) and force a + specific number of tiles to be used by the model, per inference request + (ov::intel_npu::tiles). **Note:** ov::intel_npu::tiles overrides the default number of tiles + selected by the compiler based on performance hints (ov::hint::performance_mode). Any tile + number other than 1 may be a problem for cross platform compatibility, if not tested + explicitly versus the max_tiles value. + * You can now bypass the model caching mechanism in the driver + (ov::intel_npu::bypass_umd_caching). Read more about driver and OpenVINO caching. + * Memory footprint at model execution has been reduced by one blob (compiled model) size. + For execution, the plugin no longer retrieves the compiled model from the driver, it uses the + level zero graph handle directly, instead. The compiled model is now retrieved from the driver + only during the export method. *OpenVINO Python API* @@ -1620,7 +1669,7 @@ Deprecation And Support Using deprecated features and components is not advised. They are available to enable a smooth transition to new solutions and will be discontinued in the future. To keep using discontinued features, you will have to revert to the last LTS OpenVINO version supporting them. -For more details, refer to the :doc:`OpenVINO Legacy Features and Components <../documentation/legacy-features>` +For more details, refer to the `OpenVINO Legacy Features and Components __` page. Discontinued in 2024 @@ -1678,7 +1727,7 @@ Deprecated and to be removed in the future * Model Optimizer will be discontinued with OpenVINO 2025.0. Consider using the :doc:`new conversion methods <../openvino-workflow/model-preparation/convert-model-to-ir>` instead. For more details, see the - :doc:`model conversion transition guide <../documentation/legacy-features/transition-legacy-conversion-api>`. + `model conversion transition guide `__. * OpenVINO property Affinity API will be discontinued with OpenVINO 2025.0. It will be replaced with CPU binding configurations (``ov::hint::enable_cpu_pinning``). * OpenVINO Model Server components: @@ -1707,10 +1756,6 @@ Deprecated and to be removed in the future * See alternative: `Machine Translation Python* Demo `__ - * `Open Model Zoo Tools Tutorial `__ - - * No alternatives, demonstrates deprecated tools. - * `Super Resolution with OpenVINO™ `__ * See alternative: `Super Resolution with PaddleGAN and OpenVINO `__ @@ -1811,6 +1856,4 @@ Copyright © 2024, Intel Corporation. All rights reserved. For more complete information about compiler optimizations, see our Optimization Notice. -Performance varies by use, configuration and other factors. - - +Performance varies by use, configuration and other factors. 
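Because the deprecation notes above direct users away from Model Optimizer and toward the newer conversion flow, a minimal sketch of the replacement API is shown here; the model file name is a placeholder.

.. code-block:: py

   import openvino as ov

   # CLI equivalent: ovc model.onnx --output_model model.xml
   ov_model = ov.convert_model("model.onnx")   # direct conversion, no Model Optimizer step
   ov.save_model(ov_model, "model.xml")        # saves IR; weights are compressed to FP16 by default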
\ No newline at end of file diff --git a/docs/articles_en/about-openvino/release-notes-openvino/release-policy.rst b/docs/articles_en/about-openvino/release-notes-openvino/release-policy.rst index 44ca052ee8e7b9..34107c60b73139 100644 --- a/docs/articles_en/about-openvino/release-notes-openvino/release-policy.rst +++ b/docs/articles_en/about-openvino/release-notes-openvino/release-policy.rst @@ -179,7 +179,7 @@ Additional Information * Binary distribution: * Download from `OpenVINO storage `__ - * `pypi.org `__ + * `pypi.org `__ * `DockerHub* `__ diff --git a/docs/articles_en/assets/images/MO_connection_example_1.svg b/docs/articles_en/assets/images/MO_connection_example_1.svg deleted file mode 100644 index 9e975041032891..00000000000000 --- a/docs/articles_en/assets/images/MO_connection_example_1.svg +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:fd1e2d8f82ce07f5d463d6480293935443785979fe16b555cd8e60fb2f253928 -size 55232 diff --git a/docs/articles_en/assets/images/MO_conversion_pipeline.svg b/docs/articles_en/assets/images/MO_conversion_pipeline.svg deleted file mode 100644 index e0448b06dda139..00000000000000 --- a/docs/articles_en/assets/images/MO_conversion_pipeline.svg +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:db6f798882e0301f0cf83f1eba90560b5151266612fef2bc5f16a12cf192f0a0 -size 128446 diff --git a/docs/articles_en/assets/images/MO_graph_after_extractors.svg b/docs/articles_en/assets/images/MO_graph_after_extractors.svg deleted file mode 100644 index 7ee1ebe7c1761a..00000000000000 --- a/docs/articles_en/assets/images/MO_graph_after_extractors.svg +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:e9d5ee3d23d232fc10072189c0bf18d76f5d5d7217091d81a1ac465d129c034e -size 88648 diff --git a/docs/articles_en/assets/images/MO_graph_after_loader.svg b/docs/articles_en/assets/images/MO_graph_after_loader.svg deleted file mode 100644 index 380db77679be7f..00000000000000 --- a/docs/articles_en/assets/images/MO_graph_after_loader.svg +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:e882e25b5117e4d17a3b94944f58470c0337fafa5afc2ec6aa01f498c442c5f3 -size 73933 diff --git a/docs/articles_en/assets/images/MO_graph_before_partial_inference.svg b/docs/articles_en/assets/images/MO_graph_before_partial_inference.svg deleted file mode 100644 index b312a0314b0b55..00000000000000 --- a/docs/articles_en/assets/images/MO_graph_before_partial_inference.svg +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:7799a6c30352fa74d7d98f993d9ad7b148d975d96778762df410d69133abf8a8 -size 158171 diff --git a/docs/articles_en/assets/images/MO_ports_example_1.svg b/docs/articles_en/assets/images/MO_ports_example_1.svg deleted file mode 100644 index 778ee6fd3ecb7a..00000000000000 --- a/docs/articles_en/assets/images/MO_ports_example_1.svg +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:8340d5ca434fe74d19f397c1acd0c92b4ad3b16a563975dc1603a6bf8ef03eb6 -size 55262 diff --git a/docs/articles_en/assets/images/MO_ports_example_2.svg b/docs/articles_en/assets/images/MO_ports_example_2.svg deleted file mode 100644 index 288ce970b3664f..00000000000000 --- a/docs/articles_en/assets/images/MO_ports_example_2.svg +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:aed3820019aa5b9d4741c146bd4596e6850ea714e6e44fefe6cccf4707e5f152 -size 55270 diff --git 
a/docs/articles_en/assets/images/MO_transformations_graph.svg b/docs/articles_en/assets/images/MO_transformations_graph.svg deleted file mode 100644 index 093365f92a8e8d..00000000000000 --- a/docs/articles_en/assets/images/MO_transformations_graph.svg +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:edbc2911e5aa5a672d8ebaf82b3d06f6915e44b8760ac18f88fba1d2e99fddd6 -size 349693 diff --git a/docs/articles_en/assets/images/deploy_encrypted_model.svg b/docs/articles_en/assets/images/deploy_encrypted_model.svg index 61d0dbe710994e..fa897731b54fef 100644 --- a/docs/articles_en/assets/images/deploy_encrypted_model.svg +++ b/docs/articles_en/assets/images/deploy_encrypted_model.svg @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6f802b1396fafdc8a80c03c4931d4b6290cc10451961ddba5edcef1c8227833b -size 44097 +oid sha256:454a531a9b2d2883ac9a6beb01ce7ecdd7ec69ea2c68d63b39b65f3780c957fe +size 54772 diff --git a/docs/articles_en/assets/images/genai_main_diagram.svg b/docs/articles_en/assets/images/genai_main_diagram.svg new file mode 100644 index 00000000000000..b01cbd827acb3c --- /dev/null +++ b/docs/articles_en/assets/images/genai_main_diagram.svg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:07ce964e115f1e3942cdf381f44b4dc6d466df62c70396306a4f241fb07ea3ed +size 392244 diff --git a/docs/articles_en/assets/images/training_extensions_framework.png b/docs/articles_en/assets/images/training_extensions_framework.png index 3cbbac7fdbfba8..b518aa584a96fc 100644 --- a/docs/articles_en/assets/images/training_extensions_framework.png +++ b/docs/articles_en/assets/images/training_extensions_framework.png @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2b3932d0cf0071c629e1013f3e17a9f8abda800eb01c50b3e826a42127e42da7 -size 48770 +oid sha256:4c8069733dbd51ff2bd47b47e7d2a7083dac55d9faf66dfb61b897d65eb0a545 +size 47828 diff --git a/docs/articles_en/assets/snippets/lpt_intel_cpu_plugin.cpp b/docs/articles_en/assets/snippets/lpt_intel_cpu_plugin.cpp index d9e41bc77eec17..76e6d60b8e3e90 100644 --- a/docs/articles_en/assets/snippets/lpt_intel_cpu_plugin.cpp +++ b/docs/articles_en/assets/snippets/lpt_intel_cpu_plugin.cpp @@ -38,7 +38,7 @@ auto defaultPrecisions = useLpt ? ov::pass::low_precision::precision_set::get_int8_support() : std::vector{}; if (useLpt) { // disable constant folding on dequantization subgraphs so they can be processed by LPT - manager.register_pass(defaultPrecisions); + manager.register_pass(defaultPrecisions); } // OpenVINO common transformations happen here diff --git a/docs/articles_en/documentation.rst b/docs/articles_en/documentation.rst index 5be7bb9dbc30fb..c1dd34f5373429 100644 --- a/docs/articles_en/documentation.rst +++ b/docs/articles_en/documentation.rst @@ -13,7 +13,6 @@ Documentation API Reference OpenVINO IR format and Operation Sets - Legacy Features Tool Ecosystem OpenVINO Extensibility OpenVINO™ Security diff --git a/docs/articles_en/documentation/legacy-features.rst b/docs/articles_en/documentation/legacy-features.rst deleted file mode 100644 index 2457d28cf24c15..00000000000000 --- a/docs/articles_en/documentation/legacy-features.rst +++ /dev/null @@ -1,130 +0,0 @@ -Legacy Features and Components -============================== - -.. meta:: - :description: A list of deprecated OpenVINO™ components. - -.. 
toctree:: - :maxdepth: 1 - :hidden: - - OpenVINO Development Tools package - Model Optimizer / Conversion API - Open Model ZOO - legacy-features/multi-device - - -Since OpenVINO has grown very rapidly in recent years, a number of its features -and components have been replaced by other solutions. Some of them are still -supported to assure OpenVINO users are given enough time to adjust their projects, -before the features are fully discontinued. - -This section will give you an overview of these major changes and tell you how -you can proceed to get the best experience and results with the current OpenVINO -offering. - - -| **OpenVINO Development Tools Package** -| *New solution:* OpenVINO Runtime includes all supported components -| *Old solution:* discontinuation planned for OpenVINO 2025.0 -| -| OpenVINO Development Tools used to be the OpenVINO package with tools for - advanced operations on models, such as Model conversion API, Benchmark Tool, - Accuracy Checker, Annotation Converter, Post-Training Optimization Tool, - and Open Model Zoo tools. Most of these tools have been either removed, - replaced by other solutions, or moved to the OpenVINO Runtime package. -| :doc:`See how to install Development Tools ` - - -| **Model Optimizer / Conversion API** -| *New solution:* Direct model support and OpenVINO Converter (OVC) -| *Old solution:* Legacy Conversion API discontinuation planned for OpenVINO 2025.0 -| -| The role of Model Optimizer and later the Conversion API was largely reduced - when all major model frameworks became supported directly. For converting model - files explicitly, it has been replaced with a more light-weight and efficient - solution, the OpenVINO Converter (launched with OpenVINO 2023.1). -| :doc:`See how to use OVC <../openvino-workflow/model-preparation>` -| :doc:`See how to transition from the legacy solution ` - - -| **Open Model ZOO** -| *New solution:* users are encouraged to use public model repositories -| *Old solution:* discontinuation planned for OpenVINO 2025.0 -| -| Open Model ZOO provided a collection of models prepared for use with OpenVINO, - and a small set of tools enabling a level of automation for the process. - Since the tools have been mostly replaced by other solutions and several - other model repositories have recently grown in size and popularity, - Open Model ZOO will no longer be maintained. You may still use its resources - until they are fully removed. -| :doc:`See the Open Model ZOO documentation ` -| `Check the OMZ GitHub project `__ -| As for public model databases, `Hugging Face `__ has - become the recommended model source for OpenVINO. - - -| **Multi-Device Execution** -| *New solution:* Automatic Device Selection -| *Old solution:* Legacy Multi-Device Execution discontinuation planned for OpenVINO 2025.0 -| -| The behavior and results of the Multi-Device Execution mode are covered by the ``CUMULATIVE_THROUGHPUT`` - option of the Automatic Device Selection. The only difference is that ``CUMULATIVE_THROUGHPUT`` uses - the devices specified by AUTO, which means that adding devices manually is not mandatory, - while with MULTI, the devices had to be specified before the inference. -| :doc:`Check the Automatic Device Selection <../openvino-workflow/running-inference/inference-devices-and-modes/auto-device-selection>` -| :doc:`Check the legacy solution ` - -Discontinued: -############# - -.. 
dropdown:: Caffe, and Kaldi model formats - - | *New solution:* conversion to ONNX via external tools - | *Old solution:* model support discontinued with OpenVINO 2024.0 - | `The last version supporting Apache MXNet, Caffe, and Kaldi model formats `__ - | :doc:`See the currently supported frameworks <../openvino-workflow/model-preparation>` - -.. dropdown:: Post-training Optimization Tool (POT) - - | *New solution:* Neural Network Compression Framework (NNCF) now offers the same functionality - | *Old solution:* POT discontinued with OpenVINO 2024.0 - | :doc:`See how to use NNCF for model optimization <../openvino-workflow/model-optimization>` - | `Check the NNCF GitHub project, including documentation `__ - -.. dropdown:: Inference API 1.0 - - | *New solution:* API 2.0 launched in OpenVINO 2022.1 - | *Old solution:* discontinued with OpenVINO 2024.0 - | `2023.2 is the last version supporting API 1.0 `__ - -.. dropdown:: Compile tool - - | *New solution:* the tool is no longer needed - | *Old solution:* discontinued with OpenVINO 2023.0 - | If you need to compile a model for inference on a specific device, use the following script: - - .. tab-set:: - - .. tab-item:: Python - :sync: py - - .. doxygensnippet:: docs/articles_en/assets/snippets/export_compiled_model.py - :language: python - :fragment: [export_compiled_model] - - .. tab-item:: C++ - :sync: cpp - - .. doxygensnippet:: docs/articles_en/assets/snippets/export_compiled_model.cpp - :language: cpp - :fragment: [export_compiled_model] - -.. dropdown:: TensorFlow integration (OVTF) - - | *New solution:* Direct model support and OpenVINO Converter (OVC) - | *Old solution:* discontinued in OpenVINO 2023.0 - | - | OpenVINO now features a native TensorFlow support, with no need for explicit model - conversion. - diff --git a/docs/articles_en/documentation/legacy-features/install-dev-tools.rst b/docs/articles_en/documentation/legacy-features/install-dev-tools.rst deleted file mode 100644 index 4b0160e11c9082..00000000000000 --- a/docs/articles_en/documentation/legacy-features/install-dev-tools.rst +++ /dev/null @@ -1,259 +0,0 @@ -Install OpenVINO™ Development Tools -===================================== - - -.. meta:: - :description: Learn how to install OpenVINO™ Development Tools on Windows, - Linux, and macOS operating systems, using a PyPi package. - -OpenVINO Development Tools is a set of utilities that make it easy to develop and -optimize models and applications for OpenVINO. It provides the following tools: - -* Model conversion API -* Benchmark Tool -* Accuracy Checker and Annotation Converter -* Model Downloader and other Open Model Zoo tools - -The instructions on this page show how to install OpenVINO Development Tools. If you are a -Python developer, it only takes a few simple steps to install the tools with PyPI. If you -are developing in C/C++, OpenVINO Runtime must be installed separately before installing -OpenVINO Development Tools. - -In both cases, Python 3.9 - 3.12 needs to be installed on your system before starting. - -.. note:: - - From the 2022.1 release, the OpenVINO™ Development Tools can only be installed via PyPI. - -.. _python_developers: - -For Python Developers -##################### - -If you are a Python developer, follow the steps in the -:ref:`Installing OpenVINO Development Tools ` section on this page to -install it. Installing OpenVINO Development Tools will also install OpenVINO Runtime as -a dependency, so you don’t need to install OpenVINO Runtime separately. 
This option is -recommended for new users. - -.. _cpp_developers: - -For C/C++ Developers -####################### - -If you are a C/C++ developer, you must first install OpenVINO Runtime separately to set -up the C/C++ libraries, sample code, and dependencies for building applications with -OpenVINO. These files are not included with the PyPI distribution. See the -:doc:`Selector Tool <../../get-started/install-openvino>` page to install OpenVINO Runtime -from an archive file for your operating system. - -Once OpenVINO Runtime is installed, you may install OpenVINO Development Tools for access -to tools like ``mo``, Model Downloader, Benchmark Tool, and other utilities that will help -you optimize your model and develop your application. Follow the steps in the -:ref:`Installing OpenVINO Development Tools ` section on this page -to install it. - -.. _install_dev_tools: - -Installing OpenVINO™ Development Tools -###################################### - -Follow these step-by-step instructions to install OpenVINO Development Tools on your computer. -There are two options to install OpenVINO Development Tools: installation into an existing -environment with a deep learning framework that was used for model training or creation; -or installation into a new environment. - -Installation into an Existing Environment with the Source Deep Learning Framework -+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ - -To install OpenVINO Development Tools (see the :ref:`Install the Package ` -section of this article) into an existing environment with the deep learning framework used -for the model training or creation, run the following command: - -.. code-block:: sh - - pip install openvino-dev - - -Installation in a New Environment -+++++++++++++++++++++++++++++++++ - -If you do not have an environment with a deep learning framework for the input model or you -encounter any compatibility issues between OpenVINO and your version of deep learning -framework, you may install OpenVINO Development Tools with validated versions of -frameworks into a new environment. - -Step 1. Set Up Python Virtual Environment ------------------------------------------ - -Create a virtual Python environment to avoid dependency conflicts. To create a virtual -environment, use the following command: - -.. tab-set:: - - .. tab-item:: Windows - :sync: windows - - .. code-block:: sh - - python -m venv openvino_env - - .. tab-item:: Linux and macOS - :sync: linux-and-macos - - .. code-block:: sh - - python3 -m venv openvino_env - - - -Step 2. Activate Virtual Environment ------------------------------------- - -Activate the newly created Python virtual environment by issuing this command: - -.. tab-set:: - - .. tab-item:: Windows - :sync: windows - - .. code-block:: sh - - openvino_env\Scripts\activate - - .. tab-item:: Linux and macOS - :sync: linux-and-macos - - .. code-block:: sh - - source openvino_env/bin/activate - -.. important:: - - The above command must be re-run every time a new command terminal window is opened. - - -Step 3. Set Up and Update PIP to the Highest Version ----------------------------------------------------- - -Make sure `pip` is installed in your environment and upgrade it to the latest version by -issuing the following command: - -.. code-block:: sh - - python -m pip install --upgrade pip - - -.. _install_the_package: - -Step 4. 
Install the Package ---------------------------- - -To install and configure the components of the development package together with validated -versions of specific frameworks, use the commands below. - -.. code-block:: sh - - pip install openvino-dev[extras] - - -where the ``extras`` parameter specifies the source deep learning framework for the input model -and is one or more of the following values separated with "," : ``onnx``, ``pytorch``, -``tensorflow``, ``tensorflow2``. - -For example, to install and configure dependencies required for working with TensorFlow 2.x -and ONNX models, use the following command: - -.. code-block:: sh - - pip install openvino-dev[tensorflow2,onnx] - - -.. note:: - - Model conversion API support for TensorFlow 1.x environment has been deprecated. Use the - ``tensorflow2`` parameter to install a TensorFlow 2.x environment that can convert both - TensorFlow 1.x and 2.x models. If your model isn't compatible with the TensorFlow 2.x - environment, use the `tensorflow` parameter to install the TensorFlow 1.x environment. - The TF 1.x environment is provided only for legacy compatibility reasons. - -For more details on the openvino-dev PyPI package, see -`pypi.org `__ . - -Step 5. Test the Installation ------------------------------- - -To verify the package is properly installed, run the command below (this may take a few seconds): - -.. code-block:: sh - - mo -h - -You will see the help message for ``mo`` if installation finished successfully. If you get an -error, refer to the :doc:`Troubleshooting Guide <../../get-started/troubleshooting-install-config>` -for possible solutions. - -Congratulations! You finished installing OpenVINO Development Tools with C/C++ capability. -Now you can start exploring OpenVINO's functionality through example C/C++ applications. -See the "What's Next?" section to learn more! - -What's Next? -############ - -Learn more about OpenVINO and use it in your own application by trying out some of these examples! - -Get started with Python -+++++++++++++++++++++++ - -.. image:: ../../assets/images/get_started_with_python.gif - :width: 400 - -Try the `Python Quick Start Example <../../notebooks/vision-monodepth-with-output.html>`__ -to estimate depth in a scene using an OpenVINO monodepth model in a Jupyter Notebook -inside your web browser. - -Visit the :doc:`Tutorials <../../learn-openvino/interactive-tutorials-python>` page for more -Jupyter Notebooks to get you started with OpenVINO, such as: - -* `OpenVINO Python API Tutorial <../../notebooks/openvino-api-with-output.html>`__ -* `Basic image classification program with Hello Image Classification <../../notebooks/hello-world-with-output.html>`__ -* `Convert a PyTorch model and use it for image background removal <../../notebooks/vision-background-removal-with-output.html>`__ - -Get started with C++ -++++++++++++++++++++ - -.. image:: ../../assets/images/get_started_with_cpp.jpg - :width: 400 - - -Try the :doc:`C++ Quick Start Example <../../learn-openvino/openvino-samples/get-started-demos>` -for step-by-step instructions on building and running a basic image classification C++ application. 
- -Visit the :doc:`Samples <../../learn-openvino/openvino-samples>` page for other C++ -example applications to get you started with OpenVINO, such as: - -* :doc:`Basic object detection with the Hello Reshape SSD C++ sample <../../learn-openvino/openvino-samples/hello-reshape-ssd>` -* :doc:`Object classification sample <../../learn-openvino/openvino-samples/hello-classification>` - -Learn OpenVINO Development Tools -++++++++++++++++++++++++++++++++ - -* Explore a variety of pre-trained deep learning models in the - :doc:`Open Model Zoo ` and deploy them in demo applications to see how they work. - - .. important:: - - Due to the deprecation of Open Model Zoo, models in the OpenVINO IR format are now - published on `Hugging Face `__. - -* Want to import a model from another framework and optimize its performance with OpenVINO? - Visit the :doc:`Convert a Model ` page. -* Accelerate your model's speed even further with quantization and other compression techniques - using :doc:`Neural Network Compression Framework (NNCF) <../../openvino-workflow/model-optimization-guide/quantizing-models-post-training>`. -* Benchmark your model's inference speed with one simple command using the - :doc:`Benchmark Tool <../../learn-openvino/openvino-samples/benchmark-tool>`. - -Additional Resources -#################### - -- `Intel® Distribution of OpenVINO™ toolkit home page `__ diff --git a/docs/articles_en/documentation/legacy-features/model-zoo.rst b/docs/articles_en/documentation/legacy-features/model-zoo.rst deleted file mode 100644 index 4b761e6c7df831..00000000000000 --- a/docs/articles_en/documentation/legacy-features/model-zoo.rst +++ /dev/null @@ -1,31 +0,0 @@ -Model Zoo -========= - -.. _model zoo: - -.. note:: - - Since the deprecation of Open Model Zoo, OpenVINO has significantly extended its presence on the - `Hugging Face `__ model repository. It is currently - the recommended source of optimized OpenVINO IR models. - -Open Model Zoo for OpenVINO™ toolkit delivers a wide variety of free, pre-trained deep learning -models and demo applications that provide full application templates to help you implement deep -learning in Python, C++, or OpenCV Graph API (G-API). - -Models, demos and full documentation are available in the -`Open Model Zoo GitHub repo `__ -and licensed under Apache License Version 2.0. - -Browse through over 200 neural network models, both -`public `__ and from -`Intel `__, and pick the right one for your solution. -Types include object detection, classification, image segmentation, handwriting recognition, -text to speech, pose estimation, and others. The Intel models have already been converted -to work with OpenVINO™ toolkit, while public models can easily be converted using the -:doc:`OpenVINO Model Conversion API <../../openvino-workflow/model-preparation>` utility. - -Open Model Zoo offers a -`comprehensive set of demos `__ that you can adapt for implementing specific deep -learning scenarios in your applications. - diff --git a/docs/articles_en/documentation/legacy-features/multi-device.rst b/docs/articles_en/documentation/legacy-features/multi-device.rst deleted file mode 100644 index 594f496287d714..00000000000000 --- a/docs/articles_en/documentation/legacy-features/multi-device.rst +++ /dev/null @@ -1,155 +0,0 @@ -Multi-device execution -====================== - - -.. meta:: - :description: The Multi-Device execution mode in OpenVINO Runtime assigns - multiple available computing devices to particular inference - requests to execute in parallel. - -.. 
danger:: - - The Multi-device execution mode described here has been **deprecated**. - - It's functionality is now fully covered by the :ref:`CUMULATIVE_THROUGHPUT ` - option of the :doc:`Automatic Device Selection <../../openvino-workflow/running-inference/inference-devices-and-modes/auto-device-selection>` mode. - This way, all available devices in the system can be used without the need to specify them. - -How MULTI Works -#################### - -The Multi-Device execution mode, or MULTI for short, acts as a "virtual" or a "proxy" device, which does not bind to a specific type of hardware. Instead, it assigns available computing devices to particular inference requests, which are then executed in parallel. - -The potential gains from using Multi-Device execution are: - -* improved throughput from using multiple devices at once, -* increase in performance stability due to multiple devices sharing inference workload. - -Importantly, the Multi-Device mode does not change the application logic, so it does not require you to explicitly compile the model on every device or create and balance inference requests. It appears to use a typical device but internally handles the actual hardware. - -Note that the performance increase in this mode comes from utilizing multiple devices at once. This means that you need to provide the devices with enough inference requests to keep them busy, otherwise you will not benefit much from using MULTI. - - -Using the Multi-Device Mode -########################### - -Following the OpenVINO™ naming convention, the Multi-Device mode is assigned the label of “MULTI.” The only configuration option available for it is a prioritized list of devices to use: - - -+----------------------------+---------------------------------+------------------------------------------------------------+ -| Property | Property values | Description | -+============================+=================================+============================================================+ -| | | MULTI: | | Specifies the devices available for selection. | -| | | comma-separated, no spaces | | The device sequence will be taken as priority | -+----------------------------+---------------------------------+ | from high to low. | -| ``ov::device::priorities`` | | device names | | Priorities can be set directly as a string. | -| | | comma-separated, no spaces | | -+----------------------------+---------------------------------+------------------------------------------------------------+ - - -Specifying the device list explicitly is required by MULTI, as it defines the devices available for inference and sets their priorities. - -Note that OpenVINO™ Runtime enables you to use “GPU” as an alias for “GPU.0” in function calls. -More details on enumerating devices can be found in :doc:`Inference Devices and Modes <../../openvino-workflow/running-inference/inference-devices-and-modes>`. - -The following commands are accepted by the API: - -.. tab-set:: - - .. tab-item:: Python - :sync: py - - .. doxygensnippet:: docs/articles_en/assets/snippets/ov_multi.py - :language: python - :fragment: [MULTI_0] - - .. tab-item:: C++ - :sync: cpp - - .. doxygensnippet:: docs/articles_en/assets/snippets/MULTI0.cpp - :language: cpp - :fragment: [part0] - - -To check what devices are present in the system, you can use the Device API. For information on how to do it, check :doc:`Query device properties and configuration <../../openvino-workflow/running-inference/inference-devices-and-modes/query-device-properties>`. 
- - -Configuring Individual Devices and Creating the Multi-Device On Top -+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ - -As mentioned previously, executing inference with MULTI may be set up by configuring individual devices before creating the "MULTI" device on top. It may be considered for performance reasons. - -.. tab-set:: - - .. tab-item:: Python - :sync: py - - .. doxygensnippet:: docs/articles_en/assets/snippets/ov_multi.py - :language: python - :fragment: [MULTI_4] - - .. tab-item:: C++ - :sync: cpp - - .. doxygensnippet:: docs/articles_en/assets/snippets/MULTI4.cpp - :language: cpp - :fragment: [part4] - - -Alternatively, you can combine all the individual device settings into a single config file and load it for MULTI to parse. See the code example in the next section. - -Querying the Optimal Number of Inference Requests -+++++++++++++++++++++++++++++++++++++++++++++++++ - -When using MULTI, you don't need to sum over included devices yourself, you can query the optimal number of requests directly, -using the :doc:`configure devices <../../openvino-workflow/running-inference/inference-devices-and-modes/query-device-properties>` property: - -.. tab-set:: - - .. tab-item:: C++ - - .. doxygensnippet:: docs/articles_en/assets/snippets/MULTI5.cpp - :language: cpp - :fragment: [part5] - - -Using the Multi-Device with OpenVINO Samples and Benchmarking Performance -######################################################################### - -To see how the Multi-Device execution is used in practice and test its performance, take a look at OpenVINO's Benchmark Application which presents the optimal performance of the plugin without the need for additional settings, like the number of requests or CPU threads. -Here is an example command to evaluate performance of CPU + GPU: - -.. code-block:: sh - - ./benchmark_app –d MULTI:CPU,GPU –m -i -niter 1000 - - -For more information, refer to the :doc:`Benchmark Tool <../../../learn-openvino/openvino-samples/benchmark-tool>` article. - - -.. note:: - - You can keep using the FP16 IR without converting it to FP32, even if some of the listed devices do not support it. The conversion will be done automatically for you. - - No demos are yet fully optimized for MULTI, by means of supporting the ``ov::optimal_number_of_infer_requests`` property, using the GPU streams/throttling, and so on. - - -Performance Considerations for the Multi-Device Execution -######################################################### - -For best performance when using the MULTI execution mode you should consider a few recommendations: - -- MULTI usually performs best when the fastest device is specified first in the device candidate list. This is particularly important when the request-level parallelism is not sufficient (e.g. the number of requests is not enough to saturate all devices). -- Just like with any throughput-oriented execution mode, it is highly recommended to query the optimal number of inference requests directly from the instance of the ``ov:compiled_model``. Refer to the code of the previously mentioned ``benchmark_app`` for more details. -- Execution on certain device combinations, for example CPU+GPU, performs better with certain knobs. Refer to the ``benchmark_app`` code for details. One specific example is disabling GPU driver polling, which in turn requires multiple GPU streams to balance out slower communication of inference completion from the device to the host. 
-- The MULTI logic always attempts to save on copying data between device-agnostic and user-facing inference requests, and device-specific 'worker' requests that are being actually scheduled behind the scene. To facilitate the copy savings, it is recommended to run the requests in the order in which they were created. -- While performance of accelerators combines well with MULTI, the CPU+GPU execution may introduce certain performance issues. It is due to the devices sharing some resources, like power or bandwidth. Enabling the GPU throttling hint, which saves a CPU thread for CPU inference, is an example of a recommended solution addressing this issue. - - -Additional Resources -#################### - -- :doc:`Inference Devices and Modes <../../openvino-workflow/running-inference/inference-devices-and-modes>` -- :doc:`Automatic Device Selection <../../openvino-workflow/running-inference/inference-devices-and-modes/auto-device-selection>` - - diff --git a/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api.rst b/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api.rst deleted file mode 100644 index e031c10e7e4e08..00000000000000 --- a/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api.rst +++ /dev/null @@ -1,863 +0,0 @@ -Transition from Legacy Conversion API -===================================== - - -.. meta:: - :description: Transition guide from MO / mo.convert_model() to OVC / ov.convert_model(). - -.. toctree:: - :maxdepth: 1 - :hidden: - - transition-legacy-conversion-api/legacy-conversion-api - transition-legacy-conversion-api/legacy-model-optimizer-extensibility - -In the 2023.1 OpenVINO release OpenVINO Model Converter was introduced with the corresponding -Python API: ``openvino.convert_model`` method. ``ovc`` and ``openvino.convert_model`` represent -a lightweight alternative of ``mo`` and ``openvino.tools.mo.convert_model`` which are considered -legacy API now. In this article, all the differences between ``mo`` and ``ovc`` are summarized -and the transition guide from the legacy API to the new API is provided. - -Parameters Comparison -##################### - -The comparison of parameters between ov.convert_model() / OVC and mo.convert_model() / MO. - -.. list-table:: - :widths: 20 25 55 - :header-rows: 1 - - * - mo.convert_model() / MO - - ov.convert_model() / OVC - - Differences description - * - input_model - - input_model - - Along with model object or path to input model ov.convert_model() accepts list of model parts, for example, the path to TensorFlow weights plus the path to TensorFlow checkpoint. OVC tool accepts an unnamed input model. - * - output_dir - - output_model - - output_model in OVC tool sets both output model name and output directory. - * - model_name - - output_model - - output_model in OVC tool sets both output model name and output directory. - * - input - - input - - ov.convert_model() accepts tuples for setting multiple parameters. OVC tool 'input' does not have type setting and freezing functionality. ov.convert_model() does not allow input cut. - * - output - - output - - ov.convert_model() does not allow output cut. - * - input_shape - - N/A - - Not available in ov.convert_model() / OVC. Can be replaced by ``input`` parameter. - * - example_input - - example_input - - No differences. - * - batch - - N/A - - Not available in ov.convert_model() / OVC. Can be replaced by model reshape functionality. See details below. 
- * - mean_values - - N/A - - Not available in ov.convert_model() / OVC. Can be replaced by functionality from ``PrePostProcessor``. See details below. - * - scale_values - - N/A - - Not available in ov.convert_model() / OVC. Can be replaced by functionality from ``PrePostProcessor``. See details below. - * - scale - - N/A - - Not available in ov.convert_model() / OVC. Can be replaced by functionality from ``PrePostProcessor``. See details below. - * - reverse_input_channels - - N/A - - Not available in ov.convert_model() / OVC. Can be replaced by functionality from ``PrePostProcessor``. See details below. - * - source_layout - - N/A - - Not available in ov.convert_model() / OVC. Can be replaced by functionality from ``PrePostProcessor``. See details below. - * - target_layout - - N/A - - Not available in ov.convert_model() / OVC. Can be replaced by functionality from ``PrePostProcessor``. See details below. - * - layout - - N/A - - Not available in ov.convert_model() / OVC. Can be replaced by functionality from ``PrePostProcessor``. See details below. - * - compress_to_fp16 - - compress_to_fp16 - - OVC provides 'compress_to_fp16' for command line tool only, as compression is performed during saving a model to IR (Intermediate Representation). - * - extensions - - extension - - No differences. - * - transform - - N/A - - Not available in ov.convert_model() / OVC. Can be replaced by functionality from ``PrePostProcessor``. See details below. - * - transformations_config - - N/A - - Not available in ov.convert_model() / OVC. - * - static_shape - - N/A - - Not available in ov.convert_model() / OVC. - * - freeze_placeholder_with_value - - N/A - - Not available in ov.convert_model() / OVC. - * - use_legacy_frontend - - N/A - - Not available in ov.convert_model() / OVC. - * - use_legacy_frontend - - N/A - - Not available in ov.convert_model() / OVC. - * - silent - - verbose - - OVC / ov.convert_model provides 'verbose' parameter instead of 'silent' for printing of detailed conversion information if 'verbose' is set to True. - * - log_level - - N/A - - Not available in ov.convert_model() / OVC. - * - version - - version - - N/A - * - progress - - N/A - - Not available in ov.convert_model() / OVC. - * - stream_output - - N/A - - Not available in ov.convert_model() / OVC. - * - share_weights - - share_weights - - No differences. - * - framework - - N/A - - Not available in ov.convert_model() / OVC. - * - help / -h - - help / -h - - OVC provides help parameter only in command line tool. - * - example_output - - output - - OVC / ov.convert_model 'output' parameter includes capabilities of MO 'example_output' parameter. - * - input_model_is_text - - N/A - - Not available in ov.convert_model() / OVC. - * - input_checkpoint - - input_model - - All supported model formats can be passed to 'input_model'. - * - input_meta_graph - - input_model - - All supported model formats can be passed to 'input_model'. - * - saved_model_dir - - input_model - - All supported model formats can be passed to 'input_model'. - * - saved_model_tags - - N/A - - Not available in ov.convert_model() / OVC. - * - tensorflow_custom_operations_config_update - - N/A - - Not available in ov.convert_model() / OVC. - * - tensorflow_object_detection_api_pipeline_config - - N/A - - Not available in ov.convert_model() / OVC. - * - tensorboard_logdir - - N/A - - Not available in ov.convert_model() / OVC. - * - tensorflow_custom_layer_libraries - - N/A - - Not available in ov.convert_model() / OVC. 
- * - input_symbol - - N/A - - Not available in ov.convert_model() / OVC. - * - nd_prefix_name - - N/A - - Not available in ov.convert_model() / OVC. - * - pretrained_model_name - - N/A - - Not available in ov.convert_model() / OVC. - * - save_params_from_nd - - N/A - - Not available in ov.convert_model() / OVC. - * - legacy_mxnet_model - - N/A - - Not available in ov.convert_model() / OVC. - * - enable_ssd_gluoncv - - N/A - - Not available in ov.convert_model() / OVC. - * - input_proto - - N/A - - Not available in ov.convert_model() / OVC. - * - caffe_parser_path - - N/A - - Not available in ov.convert_model() / OVC. - * - k - - N/A - - Not available in ov.convert_model() / OVC. - * - disable_omitting_optional - - N/A - - Not available in ov.convert_model() / OVC. - * - enable_flattening_nested_params - - N/A - - Not available in ov.convert_model() / OVC. - * - counts - - N/A - - Not available in ov.convert_model() / OVC. - * - remove_output_softmax - - N/A - - Not available in ov.convert_model() / OVC. - * - remove_memory - - N/A - - Not available in ov.convert_model() / OVC. - -Transition from Legacy API to New API -############################################################################ - -mo.convert_model() provides a wide range of preprocessing parameters. Most of these parameters have analogs in OVC or can be replaced with functionality from ``ov.PrePostProcessor`` class. -Here is the guide to transition from legacy model preprocessing to new API preprocessing. - - -``input_shape`` -################ - -.. tab-set:: - - .. tab-item:: Python - :sync: py - - .. list-table:: - :header-rows: 1 - - * - Legacy API - - New API - * - .. code-block:: py - :force: - - from openvino.tools import mo - - ov_model = mo.convert_model(model, input_shape=[[1, 3, 100, 100],[1]]) - - - .. code-block:: py - :force: - - import openvino as ov - - ov_model = ov.convert_model(model, input=[[1, 3, 100, 100],[1]]) - - .. tab-item:: CLI - :sync: cli - - .. list-table:: - :header-rows: 1 - - * - Legacy API - - New API - * - .. code-block:: sh - :force: - - mo --input_model MODEL_NAME --input_shape [1,3,100,100],[1] --output_dir OUTPUT_DIR - - - .. code-block:: sh - :force: - - ovc MODEL_NAME --input [1,3,100,100],[1] --output_model OUTPUT_MODEL - -``batch`` -########## - -.. tab-set:: - - .. tab-item:: Python - :sync: py - - .. list-table:: - :header-rows: 1 - - * - Legacy API - - New API - * - .. code-block:: py - :force: - - from openvino.tools import mo - - ov_model = mo.convert_model(model, batch=2) - - - .. code-block:: py - :force: - - import openvino as ov - - ov_model = ov.convert_model(model) - input_shape = ov_model.inputs[0].partial_shape - input_shape[0] = 2 # batch size - ov_model.reshape(input_shape) - - .. tab-item:: CLI - :sync: cli - - .. list-table:: - :header-rows: 1 - - * - Legacy API - - New API - * - .. code-block:: sh - :force: - - mo --input_model MODEL_NAME --batch 2 --output_dir OUTPUT_DIR - - - Not available in OVC tool. Switch to the **Python** tab. - -``mean_values`` -################ - -.. tab-set:: - - .. tab-item:: Python - :sync: py - - .. list-table:: - :header-rows: 1 - - * - Legacy API - - New API - * - .. code-block:: py - :force: - - from openvino.tools import mo - - ov_model = mo.convert_model(model, mean_values=[0.5, 0.5, 0.5]) - - - .. 
code-block:: py - :force: - - import openvino as ov - - ov_model = ov.convert_model(model) - - prep = ov.preprocess.PrePostProcessor(ov_model) - prep.input(input_name).tensor().set_layout(ov.Layout("NHWC")) - prep.input(input_name).preprocess().mean([0.5, 0.5, 0.5]) - ov_model = prep.build() - - There is currently no heuristic for automatic detection of the channel to which mean, scale or reverse channels should be applied. ``Layout`` needs to be explicitly specified with "C" channel. For example "NHWC", "NCHW", "?C??". See also :doc:`Layout API overview <../../openvino-workflow/running-inference/optimize-inference/optimize-preprocessing/layout-api-overview>`. - - .. tab-item:: CLI - :sync: cli - - .. list-table:: - :header-rows: 1 - - * - Legacy API - - New API - * - .. code-block:: sh - :force: - - mo --input_model MODEL_NAME --mean_values [0.5,0.5,0.5] --output_dir OUTPUT_DIR - - - Not available in OVC tool. Switch to the **Python** tab. - -``scale_values`` -################# - -.. tab-set:: - - .. tab-item:: Python - :sync: py - - .. list-table:: - :header-rows: 1 - - * - Legacy API - - New API - * - .. code-block:: py - :force: - - from openvino.tools import mo - - ov_model = mo.convert_model(model, scale_values=[255., 255., 255.]) - - - .. code-block:: py - :force: - - import openvino as ov - - ov_model = ov.convert_model(model) - - prep = ov.preprocess.PrePostProcessor(ov_model) - prep.input(input_name).tensor().set_layout(ov.Layout("NHWC")) - prep.input(input_name).preprocess().scale([255., 255., 255.]) - ov_model = prep.build() - - There is currently no heuristic for automatic detection of the channel to which mean, scale or reverse channels should be applied. ``Layout`` needs to be explicitly specified with "C" channel. For example "NHWC", "NCHW", "?C??". See also :doc:`Layout API overview <../../openvino-workflow/running-inference/optimize-inference/optimize-preprocessing/layout-api-overview>`. - - .. tab-item:: CLI - :sync: cli - - .. list-table:: - :header-rows: 1 - - * - Legacy API - - New API - * - .. code-block:: sh - :force: - - mo --input_model MODEL_NAME --scale_values [255,255,255] --output_dir OUTPUT_DIR - - - Not available in OVC tool. Switch to the **Python** tab. - -``reverse_input_channels`` -########################### - -.. tab-set:: - - .. tab-item:: Python - :sync: py - - .. list-table:: - :header-rows: 1 - - * - Legacy API - - New API - * - .. code-block:: py - :force: - - from openvino.tools import mo - - ov_model = mo.convert_model(model, reverse_input_channels=True) - - - .. code-block:: py - :force: - - import openvino as ov - - ov_model = ov.convert_model(model) - - prep = ov.preprocess.PrePostProcessor(ov_model) - prep.input(input_name).tensor().set_layout(ov.Layout("NHWC")) - prep.input(input_name).preprocess().reverse_channels() - ov_model = prep.build() - - There is currently no heuristic for automatic detection of the channel to which mean, scale or reverse channels should be applied. ``Layout`` needs to be explicitly specified with "C" channel. For example "NHWC", "NCHW", "?C??". See also :doc:`Layout API overview <../../openvino-workflow/running-inference/optimize-inference/optimize-preprocessing/layout-api-overview>`. - - .. tab-item:: CLI - :sync: cli - - .. list-table:: - :header-rows: 1 - - * - Legacy API - - New API - * - .. code-block:: sh - :force: - - mo --input_model MODEL_NAME --reverse_input_channels --output_dir OUTPUT_DIR - - - Not available in OVC tool. Switch to the **Python** tab. - -``source_layout`` -################## - -.. 
tab-set:: - - .. tab-item:: Python - :sync: py - - .. list-table:: - :header-rows: 1 - - * - Legacy API - - New API - * - .. code-block:: py - :force: - - import openvino as ov - from openvino.tools import mo - - ov_model = mo.convert_model(model, source_layout={input_name: ov.Layout("NHWC")}) - - - .. code-block:: py - :force: - - import openvino as ov - - ov_model = ov.convert_model(model) - - prep = ov.preprocess.PrePostProcessor(ov_model) - prep.input(input_name).model().set_layout(ov.Layout("NHWC")) - ov_model = prep.build() - - .. tab-item:: CLI - :sync: cli - - .. list-table:: - :header-rows: 1 - - * - Legacy API - - New API - * - .. code-block:: sh - :force: - - mo --input_model MODEL_NAME --source_layout input_name(NHWC) --output_dir OUTPUT_DIR - - - Not available in OVC tool. Switch to the **Python** tab. - -``target_layout`` -################## - -.. tab-set:: - - .. tab-item:: Python - :sync: py - - .. list-table:: - :header-rows: 1 - - * - Legacy API - - New API - * - .. code-block:: py - :force: - - import openvino as ov - from openvino.tools import mo - - ov_model = mo.convert_model(model, target_layout={input_name: ov.Layout("NHWC")}) - - - .. code-block:: py - :force: - - import openvino as ov - - ov_model = ov.convert_model(model) - - prep = ov.preprocess.PrePostProcessor(ov_model) - prep.input(input_name).tensor().set_layout(ov.Layout("NHWC")) - ov_model = prep.build() - - .. tab-item:: CLI - :sync: cli - - .. list-table:: - :header-rows: 1 - - * - Legacy API - - New API - * - .. code-block:: sh - :force: - - mo --input_model MODEL_NAME --target_layout input_name(NHWC) --output_dir OUTPUT_DIR - - - Not available in OVC tool. Switch to the **Python** tab. - -``layout`` -########### - -.. tab-set:: - - .. tab-item:: Python - :sync: py - - .. list-table:: - :header-rows: 1 - - * - Legacy API - - New API - * - .. code-block:: py - :force: - - from openvino.tools import mo - - ov_model = mo.convert_model(model, layout={input_name: mo.LayoutMap("NCHW", "NHWC")}) - - - .. code-block:: py - :force: - - import openvino as ov - - ov_model = ov.convert_model(model) - - prep = ov.preprocess.PrePostProcessor(ov_model) - prep.input(input_name).model().set_layout(ov.Layout("NCHW")) - prep.input(input_name).tensor().set_layout(ov.Layout("NHWC")) - ov_model = prep.build() - - .. tab-item:: CLI - :sync: cli - - .. list-table:: - :header-rows: 1 - - * - Legacy API - - New API - * - .. code-block:: sh - :force: - - mo --input_model MODEL_NAME --layout "input_name(NCHW->NHWC)" --output_dir OUTPUT_DIR - - - Not available in OVC tool. Switch to the **Python** tab. - -``transform`` -############## - -.. tab-set:: - - .. tab-item:: Python - :sync: py - - .. list-table:: - :header-rows: 1 - - * - Legacy API - - New API - * - .. code-block:: py - :force: - - from openvino.tools import mo - - ov_model = mo.convert_model(model, transform=[('LowLatency2', {'use_const_initializer': False}), 'Pruning', ('MakeStateful', {'param_res_names': {'input_name': 'output_name'}})]) - - - .. code-block:: py - :force: - - import openvino as ov - from openvino._offline_transformations import apply_low_latency_transformation, apply_pruning_transformation, apply_make_stateful_transformation - - ov_model = ov.convert_model(model) - apply_low_latency_transformation(model, use_const_initializer=False) - apply_pruning_transformation(model) - apply_make_stateful_transformation(model, param_res_names={'input_name': 'output_name'}) - - .. tab-item:: CLI - :sync: cli - - .. 
list-table:: - :header-rows: 1 - - * - Legacy API - - New API - * - .. code-block:: sh - :force: - - mo --input_model MODEL_NAME --transform LowLatency2[use_const_initializer=False],Pruning,MakeStateful[param_res_names={'input_name':'output_name'}] --output_dir OUTPUT_DIR - - - Not available in OVC tool. Switch to the **Python** tab. - -Cutting Off Parts of a Model -############################ - -Performing surgery by cutting model inputs and outputs from a model is no longer available in the new conversion API. Instead, we recommend performing the cut in the original framework. -Below are examples of model cutting of TensorFlow protobuf, TensorFlow SavedModel, and ONNX formats with the legacy conversion API, compared to achieving the same cut with tools provided by the Tensorflow and ONNX frameworks. -For PyTorch, TensorFlow 2 Keras, and PaddlePaddle, we recommend changing the original model code to perform the model cut. - -Note: This guide does not cover the cutting a model by input port of an operation that MO tool provides using `input` and `output` options, for example, `--input 1:name_op`. - -``PyTorch`` -########### - -Model cut for PyTorch is not available in legacy API. - -When it is needed to remove a whole module from the model it is possible to replace such modules with `Identity`. Below is the example of removing `conv1` and `bn1` modules at the input and `fc` module at the output of the resnet50 model. - -.. code-block:: py - :force: - - import openvino as ov - import torch - import torchvision - from torch.nn import Identity - - # Load pretrained model - model = torchvision.models.resnet50(weights='DEFAULT') - - # input cut - model.conv1 = Identity() - model.bn1 = Identity() - - # output cut - model.fc = Identity() - - # convert and compile the model - ov_model = ov.convert_model(model, input=([-1,64,-1,-1], torch.float32)) - compiled_model = ov.compile_model(ov_model) - -When it is needed to remove one or more outputs from the model it is possible to create a wrapper for the model and only output the needed output. Below is the example of removing second output from the model. - -.. code-block:: py - :force: - - import openvino as ov - import torch - - # Example of model with multiple outputs - class Model(torch.nn.Module): - def __init__(self): - super(Model, self).__init__() - self.linear1 = torch.nn.Linear(100, 200) - self.activation1 = torch.nn.ReLU() - self.linear2 = torch.nn.Linear(200, 10) - self.activation2 = torch.nn.Sigmoid() - - def forward(self, x): - x = self.linear1(x) - x = self.activation1(x) - y = self.linear2(x) - y = self.activation2(y) - return x, y - - # New model, where some outputs are cut - class CutModel(torch.nn.Module): - def __init__(self): - super(CutModel, self).__init__() - self.model = Model() - - def forward(self, x): - - # get first output - x, _ = self.model(x) - - return x - - # Model with output cut - cut_model = CutModel() - - # convert and compile the model - ov_model = ov.convert_model(cut_model, input=([-1,-1,-1], torch.float32)) - compiled_model = ov.compile_model(ov_model) - - -``TensorFlow protobuf format / tf.Graph / tf.GraphDef`` -####################################################### - -Legacy API. - -.. 
code-block:: py - :force: - - import openvino as ov - import openvino.tools.mo as mo - - import tensorflow as tf - - def load_graph(model_path): - graph_def = tf.compat.v1.GraphDef() - with open(model_path, "rb") as f: - graph_def.ParseFromString(f.read()) - with tf.compat.v1.Graph().as_default() as graph: - tf.graph_util.import_graph_def(graph_def, name="") - return graph - - # Load TF model - graph = load_graph("/path_to_model/HugeCTR.pb") - - # Convert the model with input and output cut - input_name = "concat" - output_name = "MatVec_3/Squeeze" - ov_model = mo.convert_model(graph, input=(input_name, [-1, -1]), output=output_name) - - # Compile the model - compiled_model = ov.compile_model(ov_model) - -Model cut in original FW. - -.. code-block:: py - :force: - - import openvino as ov - import tensorflow as tf - - from tensorflow.python.tools.strip_unused_lib import strip_unused - - def load_graph(model_path): - graph_def = tf.compat.v1.GraphDef() - with open(model_path, "rb") as f: - graph_def.ParseFromString(f.read()) - with tf.compat.v1.Graph().as_default() as graph: - tf.graph_util.import_graph_def(graph_def, name="") - return graph - - # Load TF model - graph = load_graph("/path_to_model/HugeCTR.pb") - - # Cut the model - input_name = "concat" - output_name = "MatVec_3/Squeeze" - graph_def = graph.as_graph_def() - new_graph_def = strip_unused(graph_def, [input_name], [output_name], tf.float32.as_datatype_enum) - - # Convert and compile model - ov_model = ov.convert_model(new_graph_def, input=[-1, -1]) - cmp_model = ov.compile_model(ov_model) - - -``TensorFlow SavedModel format`` -################################ - -Model cut for SavedModel format is not available in legacy API. - -Example of model cut in original FW. - -.. code-block:: py - :force: - - import openvino as ov - import tensorflow_hub as hub - - import tensorflow as tf - from tensorflow.python.framework.convert_to_constants import convert_variables_to_constants_v2 - from tensorflow.python.tools.strip_unused_lib import strip_unused - - # Load TF model - model = hub.load("https://tfhub.dev/svampeatlas/vision/embedder/fungi_V2/1?tf-hub-format=compressed") - - # Convert model to GraphDef - model_func = model.signatures["default"] - frozen_func = convert_variables_to_constants_v2(model_func) - graph_def = frozen_func.graph.as_graph_def() - - # Cut the model - input_name = 'InceptionV4/InceptionV4/Conv2d_2b_3x3/Relu' - output_name = 'InceptionV4/InceptionV4/Mixed_7c/concat' - new_graph_def = strip_unused(graph_def, [input_name], [output_name], tf.float32.as_datatype_enum) - - # Convert and compile the model - ov_model = ov.convert_model(new_graph_def) - compiled_model = ov.compile_model(ov_model) - - -``ONNX`` -######## - - -Legacy API. - -.. code-block:: py - :force: - - import openvino as ov - import openvino.tools.mo as mo - - input_path = "/path_to_model/yolov8x.onnx" - - # Convert model and perform input and output cut - input_name = "/model.2/Concat_output_0" - output_name = "/model.22/Concat_3_output_0" - ov_model = mo.convert_model(input_path, input=input_name, output=output_name) - - # Compile model - ov.compile_model(ov_model) - -Model cut in original FW. - -.. 
code-block:: py - :force: - - import onnx - import openvino as ov - - input_path = "/path_to_model/yolov8x.onnx" - - # Cut the model - input_name = "/model.2/Concat_output_0" - output_name = "/model.22/Concat_3_output_0" - cut_model_path = "/path_to_model/yolov8x_cut.onnx" - onnx.utils.extract_model(input_path, cut_model_path, [input_name], [output_name]) - - # Convert model - ov_model = ov.convert_model(cut_model_path) - - # Compile model - ov.compile_model(ov_model) - - -Supported Frameworks in MO vs OVC -################################# - -ov.convert_model() and OVC tool support conversion from PyTorch, TF, TF Lite, ONNX, PaddlePaddle. - - - diff --git a/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-conversion-api.rst b/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-conversion-api.rst deleted file mode 100644 index 5302c7912995f6..00000000000000 --- a/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-conversion-api.rst +++ /dev/null @@ -1,188 +0,0 @@ -Legacy Conversion API -===================== - - -.. toctree:: - :maxdepth: 1 - :hidden: - - Setting Input Shapes - Troubleshooting Reshape Errors - Cutting Off Parts of a Model - Embedding Preprocessing Computation - Compressing a Model to FP16 - Convert Models Represented as Python Objects - Model Optimizer Frequently Asked Questions - Supported Model Formats - -.. meta:: - :description: Model conversion (MO) furthers the transition between training and - deployment environments, it adjusts deep learning models for - optimal execution on target devices. - -.. note:: - This part of the documentation describes a legacy approach to model conversion. Starting with OpenVINO 2023.1, a simpler alternative API for model conversion is available: ``openvino.convert_model`` and OpenVINO Model Converter ``ovc`` CLI tool. Refer to :doc:`Model preparation <../../../openvino-workflow/model-preparation>` for more details. If you are still using `openvino.tools.mo.convert_model` or `mo` CLI tool, you can still refer to this documentation. However, consider checking the :doc:`transition guide <../transition-legacy-conversion-api>` to learn how to migrate from the legacy conversion API to the new one. Depending on the model topology, the new API can be a better option for you. - -To convert a model to OpenVINO model format (``ov.Model``), you can use the following command: - -.. tab-set:: - - .. tab-item:: Python - :sync: py - - .. code-block:: py - :force: - - from openvino.tools.mo import convert_model - ov_model = convert_model(INPUT_MODEL) - - .. tab-item:: CLI - :sync: cli - - .. code-block:: sh - - mo --input_model INPUT_MODEL - - -If the out-of-the-box conversion (only the ``input_model`` parameter is specified) is not successful, use the parameters mentioned below to override input shapes and cut the model: - -- ``input`` and ``input_shape`` - the model conversion API parameters used to override original input shapes for model conversion, - - For more information about the parameters, refer to the :doc:`Setting Input Shapes ` guide. - -- ``input`` and ``output`` - the model conversion API parameters used to define new inputs and outputs of the converted model to cut off unwanted parts (such as unsupported operations and training sub-graphs), - - For a more detailed description, refer to the :doc:`Cutting Off Parts of a Model ` guide. 
- -- ``mean_values``, ``scales_values``, ``layout`` - the parameters used to insert additional input pre-processing sub-graphs into the converted model, - - For more details, see the :doc:`Embedding Preprocessing Computation ` article. - -- ``compress_to_fp16`` - a compression parameter in ``mo`` command-line tool, which allows generating IR with constants (for example, weights for convolutions and matrix multiplications) compressed to ``FP16`` data type. - - For more details, refer to the :doc:`Compression of a Model to FP16 ` guide. - -To get the full list of conversion parameters, run the following command: - -.. tab-set:: - - .. tab-item:: Python - :sync: py - - .. code-block:: py - :force: - - from openvino.tools.mo import convert_model - ov_model = convert_model(help=True) - - .. tab-item:: CLI - :sync: cli - - .. code-block:: sh - - mo --help - - -Examples of model conversion parameters -####################################### - -Below is a list of separate examples for different frameworks and model conversion parameters: - -1. Launch model conversion for a TensorFlow MobileNet model in the binary protobuf format: - - .. tab-set:: - - .. tab-item:: Python - :sync: py - - .. code-block:: py - :force: - - from openvino.tools.mo import convert_model - ov_model = convert_model("MobileNet.pb") - - .. tab-item:: CLI - :sync: cli - - .. code-block:: sh - - mo --input_model MobileNet.pb - - - Launch model conversion for a TensorFlow BERT model in the SavedModel format with three inputs. Specify input shapes explicitly where the batch size and the sequence length equal 2 and 30 respectively: - - .. tab-set:: - - .. tab-item:: Python - :sync: py - - .. code-block:: py - :force: - - from openvino.tools.mo import convert_model - ov_model = convert_model("BERT", input_shape=[[2,30],[2,30],[2,30]]) - - .. tab-item:: CLI - :sync: cli - - .. code-block:: sh - - mo --saved_model_dir BERT --input_shape [2,30],[2,30],[2,30] - - - For more information, refer to the :doc:`Converting a TensorFlow Model ` guide. - -2. Launch model conversion for an ONNX OCR model and specify new output explicitly: - - .. tab-set:: - - .. tab-item:: Python - :sync: py - - .. code-block:: py - :force: - - from openvino.tools.mo import convert_model - ov_model = convert_model("ocr.onnx", output="probabilities") - - .. tab-item:: CLI - :sync: cli - - .. code-block:: sh - - mo --input_model ocr.onnx --output probabilities - - - For more information, refer to the :doc:`Converting an ONNX Model ` guide. - - .. note:: - - PyTorch models must be exported to the ONNX format before conversion into IR. More information can be found in :doc:`Converting a PyTorch Model `. - -3. Launch model conversion for a PaddlePaddle UNet model and apply mean-scale normalization to the input: - - .. tab-set:: - - .. tab-item:: Python - :sync: py - - .. code-block:: py - :force: - - from openvino.tools.mo import convert_model - ov_model = convert_model("unet.pdmodel", mean_values=[123,117,104], scale=255) - - .. tab-item:: CLI - :sync: cli - - .. code-block:: sh - - mo --input_model unet.pdmodel --mean_values [123,117,104] --scale 255 - - - For more information, refer to the :doc:`Converting a PaddlePaddle Model ` guide. - -- To get conversion recipes for specific TensorFlow, ONNX, and PyTorch models, refer to the :doc:`Model Conversion Tutorials `. -- For more information about IR, see :doc:`Deep Learning Network Intermediate Representation and Operation Sets in OpenVINO™ <../../openvino-ir-format/operation-sets>`. 
-- For more information about support of neural network models trained with various frameworks, see :doc:`OpenVINO Extensibility Mechanism <../../openvino-extensibility>` - diff --git a/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-conversion-api/[legacy]-compressing-model-to-fp16.rst b/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-conversion-api/[legacy]-compressing-model-to-fp16.rst deleted file mode 100644 index c9e93036a3a7c2..00000000000000 --- a/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-conversion-api/[legacy]-compressing-model-to-fp16.rst +++ /dev/null @@ -1,53 +0,0 @@ -[LEGACY] Compressing a Model to FP16 -============================================= - -.. danger:: - - The code described here has been **deprecated!** Do not use it to avoid working with a legacy solution. It will be kept for some time to ensure backwards compatibility, but **you should not use** it in contemporary applications. - - This guide describes a deprecated conversion method. The guide on the new and recommended method can be found in the :doc:`Conversion Parameters <../../../../openvino-workflow/model-preparation/conversion-parameters>` article. - -By default, when IR is saved all relevant floating-point weights are compressed to ``FP16`` data type during model conversion. -It results in creating a "compressed ``FP16`` model", which occupies about half of -the original space in the file system. The compression may introduce a minor drop in accuracy, -but it is negligible for most models. -In case if accuracy drop is significant user can disable compression explicitly. - -To disable compression, use the ``compress_to_fp16=False`` option: - -.. tab-set:: - - .. tab-item:: Python - :sync: py - - .. code-block:: py - :force: - - from openvino.runtime import save_model - ov_model = save_model(INPUT_MODEL, compress_to_fp16=False) - - .. tab-item:: CLI - :sync: cli - - .. code-block:: sh - - mo --input_model INPUT_MODEL --compress_to_fp16=False - - -For details on how plugins handle compressed ``FP16`` models, see -:doc:`Inference Devices and Modes <../../../../openvino-workflow/running-inference/inference-devices-and-modes>`. - -.. note:: - - ``FP16`` compression is sometimes used as the initial step for ``INT8`` quantization. - Refer to the :doc:`Post-training optimization <../../../../openvino-workflow/model-optimization-guide/quantizing-models-post-training>` guide for more - information about that. - - -.. note:: - - Some large models (larger than a few GB) when compressed to ``FP16`` may consume an overly large amount of RAM on the loading - phase of the inference. 
If that is the case for your model, try to convert it without compression: - ``convert_model(INPUT_MODEL, compress_to_fp16=False)`` or ``convert_model(INPUT_MODEL)`` - - diff --git a/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-conversion-api/[legacy]-convert-models-as-python-objects.rst b/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-conversion-api/[legacy]-convert-models-as-python-objects.rst deleted file mode 100644 index 4921dc6bfa221f..00000000000000 --- a/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-conversion-api/[legacy]-convert-models-as-python-objects.rst +++ /dev/null @@ -1,150 +0,0 @@ -[LEGACY] Convert Models Represented as Python Objects -============================================================= - -.. danger:: - - The code described here has been **deprecated!** Do not use it to avoid working with a legacy solution. It will be kept for some time to ensure backwards compatibility, but **you should not use** it in contemporary applications. - - This guide describes a deprecated conversion method. The guide on the new and recommended method can be found in the :doc:`Model Preparation <../../../../openvino-workflow/model-preparation>` article. - -Model conversion API is represented by ``convert_model()`` method in openvino.tools.mo namespace. ``convert_model()`` is compatible with types from openvino.runtime, like PartialShape, Layout, Type, etc. - -``convert_model()`` has the ability available from the command-line tool, plus the ability to pass Python model objects, such as a PyTorch model or TensorFlow Keras model directly, without saving them into files and without leaving the training environment (Jupyter Notebook or training scripts). In addition to input models consumed directly from Python, ``convert_model`` can take OpenVINO extension objects constructed directly in Python for easier conversion of operations that are not supported in OpenVINO. - -.. note:: - - Model conversion can be performed only when you install - :doc:`the development tools <../../../legacy-features/install-dev-tools>`, which provide - both the ``convert_model()`` method and ``mo`` command-line tool. - The functionality from this article is applicable for ``convert_model()`` only and it is - not present in command-line tool. - - -``convert_model()`` returns an openvino.runtime.Model object which can be compiled and inferred or serialized to IR. - -Example of converting a PyTorch model directly from memory: - -.. code-block:: py - :force: - - import torchvision - from openvino.tools.mo import convert_model - - model = torchvision.models.resnet50(weights='DEFAULT') - ov_model = convert_model(model) - -The following types are supported as an input model for ``convert_model()``: - -* PyTorch - ``torch.nn.Module``, ``torch.jit.ScriptModule``, ``torch.jit.ScriptFunction``. Refer to the :doc:`Converting a PyTorch Model <[legacy]-supported-model-formats/[legacy]-convert-pytorch>` article for more details. -* TensorFlow / TensorFlow 2 / Keras - ``tf.keras.Model``, ``tf.keras.layers.Layer``, ``tf.compat.v1.Graph``, ``tf.compat.v1.GraphDef``, ``tf.Module``, ``tf.function``, ``tf.compat.v1.session``, ``tf.train.checkpoint``. Refer to the :doc:`Converting a TensorFlow Model <[legacy]-supported-model-formats/[legacy]-convert-tensorflow>` article for more details. - -``convert_model()`` accepts all parameters available in the MO command-line tool. 
Parameters can be specified by Python classes or string analogs, similar to the command-line tool. - -Example of using native Python classes to set ``input_shape``, ``mean_values`` and ``layout``: - -.. code-block:: py - :force: - - from openvino.runtime import PartialShape, Layout - from openvino.tools.mo import convert_model - - ov_model = convert_model(model, input_shape=PartialShape([1,3,100,100]), mean_values=[127, 127, 127], layout=Layout("NCHW")) - -Example of using strings for setting ``input_shape``, ``mean_values`` and ``layout``: - -.. code-block:: py - :force: - - from openvino.runtime import Layout - from openvino.tools.mo import convert_model - - ov_model = convert_model(model, input_shape="[1,3,100,100]", mean_values="[127,127,127]", layout="NCHW") - - -The ``input`` parameter can be set by a ``tuple`` with a name, shape, and type. The input name of the type string is required in the tuple. The shape and type are optional. -The shape can be a ``list`` or ``tuple`` of dimensions (``int`` or ``openvino.runtime.Dimension``), or ``openvino.runtime.PartialShape``, or ``openvino.runtime.Shape``. The type can be of numpy type or ``openvino.runtime.Type``. - -Example of using a tuple in the ``input`` parameter to cut a model: - -.. code-block:: py - :force: - - from openvino.tools.mo import convert_model - - ov_model = convert_model(model, input=("input_name", [3], np.float32)) - -For complex cases, when a value needs to be set in the ``input`` parameter, the ``InputCutInfo`` class can be used. ``InputCutInfo`` accepts four parameters: ``name``, ``shape``, ``type``, and ``value``. - -``InputCutInfo("input_name", [3], np.float32, [0.5, 2.1, 3.4])`` is equivalent of ``InputCutInfo(name="input_name", shape=[3], type=np.float32, value=[0.5, 2.1, 3.4])``. - -Supported types for ``InputCutInfo``: - -* name: ``string``. -* shape: ``list`` or ``tuple`` of dimensions (``int`` or ``openvino.runtime.Dimension``), ``openvino.runtime.PartialShape``, ``openvino.runtime.Shape``. -* type: ``numpy type``, ``openvino.runtime.Type``. -* value: ``numpy.ndarray``, ``list`` of numeric values, ``bool``. - -Example of using ``InputCutInfo`` to freeze an input with value: - -.. code-block:: py - :force: - - from openvino.tools.mo import convert_model, InputCutInfo - - ov_model = convert_model(model, input=InputCutInfo("input_name", [3], np.float32, [0.5, 2.1, 3.4])) - -To set parameters for models with multiple inputs, use ``list`` of parameters. -Parameters supporting ``list``: - -* input -* input_shape -* layout -* source_layout -* dest_layout -* mean_values -* scale_values - -Example of using lists to set shapes, types and layout for multiple inputs: - -.. code-block:: py - :force: - - from openvino.runtime import Layout - from openvino.tools.mo import convert_model, LayoutMap - - ov_model = convert_model(model, input=[("input1", [1,3,100,100], np.float32), ("input2", [1,3,100,100], np.float32)], layout=[Layout("NCHW"), LayoutMap("NCHW", "NHWC")]) - -``layout``, ``source_layout`` and ``dest_layout`` accept an ``openvino.runtime.Layout`` object or ``string``. - -Example of using the ``Layout`` class to set the layout of a model input: - -.. code-block:: py - :force: - - from openvino.runtime import Layout - from openvino.tools.mo import convert_model - - ov_model = convert_model(model, source_layout=Layout("NCHW")) - -To set both source and destination layouts in the ``layout`` parameter, use the ``LayoutMap`` class. ``LayoutMap`` accepts two parameters: ``source_layout`` and ``target_layout``. 
- -``LayoutMap("NCHW", "NHWC")`` is equivalent to ``LayoutMap(source_layout="NCHW", target_layout="NHWC")``. - -Example of using the ``LayoutMap`` class to change the layout of a model input: - -.. code-block:: py - :force: - - from openvino.tools.mo import convert_model, LayoutMap - - ov_model = convert_model(model, layout=LayoutMap("NCHW", "NHWC")) - -Example of using the ``serialize`` method to save the converted model to OpenVINO IR: - -.. code-block:: py - :force: - - from openvino.runtime import serialize - - serialize(ov_model, "model.xml") - diff --git a/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-conversion-api/[legacy]-cutting-parts-of-a-model.rst b/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-conversion-api/[legacy]-cutting-parts-of-a-model.rst deleted file mode 100644 index 0406602a6e51fa..00000000000000 --- a/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-conversion-api/[legacy]-cutting-parts-of-a-model.rst +++ /dev/null @@ -1,585 +0,0 @@ -[LEGACY] Cutting Off Parts of a Model -================================================ - -.. danger:: - - The code described here has been **deprecated!** Do not use it to avoid working with a legacy solution. It will be kept for some time to ensure backwards compatibility, but **you should not use** it in contemporary applications. - -Sometimes, it is necessary to remove parts of a model when converting it to OpenVINO IR. This chapter describes how to do it, using model conversion API parameters. Model cutting applies mostly to TensorFlow models, which is why TensorFlow will be used in this chapter's examples, but it may also be useful for other frameworks. - -Purpose of Model Cutting -######################## - -The following are situations in which model cutting is useful or even required: - -* A model has pre- or post-processing parts that cannot be translated to existing OpenVINO operations. -* A model has a training part that is convenient to keep in the model but not used during inference. -* A model is too complex to be converted at once, because it contains a lot of unsupported operations that cannot be easily implemented as custom layers. -* A problem occurs with model conversion or inference in OpenVINO™ Runtime. To identify the issue, limit the conversion scope by iteratively searching for problematic areas in the model. -* A single custom layer or a combination of custom layers is isolated for debugging purposes. - -.. note:: - - Internally, when you run model conversion API, it loads the model, goes through the topology, and tries to find each layer type in a list of known layers. Custom layers are layers that are not included in the list. If your topology contains such layers, model conversion API classifies them as custom. - -Model conversion API parameters -############################### - -Model conversion API provides the ``input`` and ``output`` command-line options to specify new entry and exit nodes, while ignoring the rest of the model (a minimal combined sketch follows the list below): - -* ``input`` option accepts a list of layer names of the input model that should be treated as new entry points to the model. See the full list of accepted types for input on the :doc:`Model Conversion Python API <[legacy]-convert-models-as-python-objects>` page. -* ``output`` option accepts a list of layer names of the input model that should be treated as new exit points from the model.
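As a quick illustration of how the two options combine, below is a minimal sketch of cutting a model at both ends in a single ``convert_model()`` call. The node names and the model path are hypothetical placeholders; the Inception V1 walkthrough later in this chapter uses real node names.

.. code-block:: py
   :force:

   from openvino.tools.mo import convert_model

   # Keep only the sub-graph between the two named nodes:
   # everything before "new_input_node" and after "new_output_node" is dropped.
   ov_model = convert_model(
       "model.pb",                  # hypothetical path to a frozen TensorFlow graph
       input="new_input_node",      # new entry point of the converted model
       output="new_output_node",    # new exit point of the converted model
   )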
- -The ``input`` option is required for cases unrelated to model cutting. For example, when the model contains several inputs and ``input_shape`` or ``mean_values`` options are used, the ``input`` option specifies the order of input nodes for correct mapping between multiple items provided in ``input_shape`` and ``mean_values`` and the inputs in the model. - -Model cutting is illustrated with the Inception V1 model, found in the ``models/research/slim`` repository. To proceed with this chapter, make sure you do the necessary steps to :doc:`prepare the model for model conversion <[legacy]-setting-input-shapes>`. - -Default Behavior without input and output -######################################### - -The input model is converted as a whole if neither ``input`` nor ``output`` command line options are used. All ``Placeholder`` operations in a TensorFlow graph are automatically identified as entry points. The ``Input`` layer type is generated for each of them. All nodes that have no consumers are automatically identified as exit points. - -For Inception_V1, there is one ``Placeholder``: input. If the model is viewed in TensorBoard, the input operation is easy to find: - -.. image:: ../../../../assets/images/inception_v1_std_input.svg - :alt: Placeholder in Inception V1 - -``Reshape`` is the only output operation, which is enclosed in a nested name scope of ``InceptionV1/Logits/Predictions``, under the full name of ``InceptionV1/Logits/Predictions/Reshape_1``. - -In TensorBoard, along with some of its predecessors, it looks as follows: - -.. image:: ../../../../assets/images/inception_v1_std_output.svg - :alt: TensorBoard with predecessors - -Convert this model to ``ov.Model``: - -.. tab-set:: - - .. tab-item:: Python - :sync: py - - .. code-block:: py - :force: - - from openvino.tools.mo import convert_model - ov_model = convert_model("inception_v1.pb", batch=1) - - .. tab-item:: CLI - :sync: cli - - .. code-block:: sh - - mo --input_model inception_v1.pb -b 1 --output_dir - - -``ov.Model`` can be serialized with the ``ov.serialize()`` method to Intermediate Representation which can be used for model structure exploring. -In IR, the structure of a model has the following layers: - -.. code-block:: xml - :force: - - - - - 1 - 3 - 224 - 224 - - - - - -The ``input`` layer is converted from the TensorFlow graph ``Placeholder`` operation ``input`` and has the same name. - -The ``-b`` option is used here for conversion to override a possible undefined batch size (coded as -1 in TensorFlow models). If a model was frozen with a defined batch size, you may omit this option in all the examples. - -The last layer in the model is ``InceptionV1/Logits/Predictions/Reshape_1``, which matches an output operation in the TensorFlow graph: - -.. code-block:: xml - :force: - - - - - - 1 - 1001 - - - - - 1 - 1001 - - - - - -Due to automatic identification of inputs and outputs, providing the ``input`` and ``output`` options to convert the whole model is not required. The following commands are equivalent for the Inception V1 model: - -.. tab-set:: - - .. tab-item:: Python - :sync: py - - .. code-block:: py - :force: - - from openvino.tools.mo import convert_model - ov_model = convert_model("inception_v1.pb", batch=1) - - ov_model = convert_model("inception_v1.pb", batch=1, input="input", output="InceptionV1/Logits/Predictions/Reshape_1") - - .. tab-item:: CLI - :sync: cli - - .. 
code-block:: sh - - mo --input_model inception_v1.pb -b 1 --output_dir - - mo --input_model inception_v1.pb -b 1 --input input --output InceptionV1/Logits/Predictions/Reshape_1 --output_dir - - -The Intermediate Representations are identical for both conversions. The same is true if the model has multiple inputs and/or outputs. - -Model Cutting -#################### - -Now, consider how to cut some parts of the model off. This chapter describes the first convolution block ``InceptionV1/InceptionV1/Conv2d_1a_7x7`` of the Inception V1 model to illustrate cutting: - -.. image:: ../../../../assets/images/inception_v1_first_block.svg - :alt: Inception V1 first convolution block - -Cutting at the End -++++++++++++++++++++ - -If you want to cut your model at the end, you have the following options: - -1. The following command cuts off the rest of the model after the ``InceptionV1/InceptionV1/Conv2d_1a_7x7/Relu``, making this node the last in the model: - - .. tab-set:: - - .. tab-item:: Python - :sync: py - - .. code-block:: py - :force: - - from openvino.tools.mo import convert_model - ov_model = convert_model("inception_v1.pb", batch=1, output="InceptionV1/InceptionV1/Conv2d_1a_7x7/Relu") - - .. tab-item:: CLI - :sync: cli - - .. code-block:: sh - - mo --input_model inception_v1.pb -b 1 --output=InceptionV1/InceptionV1/Conv2d_1a_7x7/Relu --output_dir - - - The resulting Intermediate Representation has three layers: - - .. code-block:: xml - :force: - - - - - - - ... - - - - - - ... - - - ... - - - - - - - - - ... - - - ... - - - - - - - - - - - As shown in the TensorBoard picture, the original model has more nodes than its Intermediate Representation. Model conversion, using ``convert_model()``, consists of a set of model transformations, including fusing of batch normalization ``InceptionV1/InceptionV1/Conv2d_1a_7x7/BatchNorm`` with convolution ``InceptionV1/InceptionV1/Conv2d_1a_7x7/convolution``, which is why it is not present in the final model. This is not an effect of the ``output`` option, it is the typical behavior of model conversion API for batch normalizations and convolutions. The effect of the ``output`` is that the ``ReLU`` layer becomes the last one in the converted model. - -2. The following command cuts the edge that comes from 0 output port of the ``InceptionV1/InceptionV1/Conv2d_1a_7x7/Relu`` and the rest of the model, making this node the last one in the model: - - .. tab-set:: - - .. tab-item:: Python - :sync: py - - .. code-block:: py - :force: - - from openvino.tools.mo import convert_model - ov_model = convert_model("inception_v1.pb", batch=1, output="InceptionV1/InceptionV1/Conv2d_1a_7x7/Relu:0") - - .. tab-item:: CLI - :sync: cli - - .. code-block:: sh - - mo --input_model inception_v1.pb -b 1 --output InceptionV1/InceptionV1/Conv2d_1a_7x7/Relu:0 --output_dir - - - The resulting Intermediate Representation has three layers, which are the same as in the previous case: - - .. code-block:: xml - :force: - - - - - - - ... - - - - - - ... - - - ... - - - - - - - - - ... - - - ... - - - - - - - - - - - This type of cutting is useful for cutting multiple output edges. - -3. The following command cuts the edge that comes to 0 input port of the ``InceptionV1/InceptionV1/Conv2d_1a_7x7/Relu`` and the rest of the model including ``InceptionV1/InceptionV1/Conv2d_1a_7x7/Relu``, deleting this node and making the previous node ``InceptionV1/InceptionV1/Conv2d_1a_7x7/Conv2D`` the last in the model: - - .. tab-set:: - - .. tab-item:: Python - :sync: py - - .. 
code-block:: py - :force: - - from openvino.tools.mo import convert_model - ov_model = convert_model("inception_v1.pb", batch=1, output="0:InceptionV1/InceptionV1/Conv2d_1a_7x7/Relu") - - .. tab-item:: CLI - :sync: cli - - .. code-block:: sh - - mo --input_model inception_v1.pb -b 1 --output=0:InceptionV1/InceptionV1/Conv2d_1a_7x7/Relu --output_dir - - - The resulting Intermediate Representation has two layers, which are the same as the first two layers in the previous case: - - .. code-block:: xml - :force: - - - - - - - ... - - - - - - ... - - - ... - - - - - - - - - - - - - -Cutting from the Beginning -++++++++++++++++++++++++++ - -If you want to go further and cut the beginning of the model, leaving only the ``ReLU`` layer, you have the following options: - -1. Use the following parameters, where ``input`` and ``output`` specify the same node in the graph: - - .. tab-set:: - - .. tab-item:: Python - :sync: py - - .. code-block:: py - :force: - - from openvino.tools.mo import convert_model - ov_model = convert_model("inception_v1.pb", batch=1, output="InceptionV1/InceptionV1/Conv2d_1a_7x7/Relu", input="InceptionV1/InceptionV1/Conv2d_1a_7x7/Relu") - - .. tab-item:: CLI - :sync: cli - - .. code-block:: sh - - mo --input_model=inception_v1.pb -b 1 --output InceptionV1/InceptionV1/Conv2d_1a_7x7/Relu --input InceptionV1/InceptionV1/Conv2d_1a_7x7/Relu --output_dir - - - The resulting Intermediate Representation looks as follows: - - .. code-block:: xml - :force: - - - - - - - ... - - - - - ... - - - ... - - - - - - - - - - ``Input`` layer is automatically created to feed the layer that is converted from the node specified in ``input``, which is ``InceptionV1/InceptionV1/Conv2d_1a_7x7/Relu`` in this case. ``convert_model()`` does not replace the ``ReLU`` node by the ``Input`` layer. It produces such ``ov.Model`` to make the node the first executable node in the final Intermediate Representation. Therefore, model conversion creates enough ``Inputs`` to feed all input ports of the node that is passed in ``input``. - - Even though ``input_shape`` is not specified in the command line, the shapes for layers are inferred from the beginning of the original TensorFlow model to the point, at which the new input is defined. It has the same shape ``[1,64,112,112]`` as the model converted as a whole or without cutting off the beginning. - -2. Cut the edge incoming to layer by port number. To specify the incoming port, use the following notation ``input=port:input_node``. To cut everything before ``ReLU`` layer, cut the edge incoming to port 0 of ``InceptionV1/InceptionV1/Conv2d_1a_7x7/Relu`` node: - - .. tab-set:: - - .. tab-item:: Python - :sync: py - - .. code-block:: py - :force: - - from openvino.tools.mo import convert_model - ov_model = convert_model("inception_v1.pb", batch=1, input="0:InceptionV1/InceptionV1/Conv2d_1a_7x7/Relu", output="InceptionV1/InceptionV1/Conv2d_1a_7x7/Relu") - - .. tab-item:: CLI - :sync: cli - - .. code-block:: sh - - mo --input_model inception_v1.pb -b 1 --input 0:InceptionV1/InceptionV1/Conv2d_1a_7x7/Relu --output InceptionV1/InceptionV1/Conv2d_1a_7x7/Relu --output_dir - - - The resulting Intermediate Representation looks as follows: - - .. code-block:: xml - :force: - - - - - - - ... - - - - - ... - - - ... - - - - - - - - - - ``Input`` layer is automatically created to feed the layer that is converted from the node specified in ``input``, which is ``InceptionV1/InceptionV1/Conv2d_1a_7x7/Relu`` in this case. 
``convert_model()`` does not replace the ``ReLU`` node by the ``Input`` layer, it produces such ``ov.Model`` to make the node be the first executable node in the final Intermediate Representation. Therefore, ``convert_model()`` creates enough ``Inputs`` to feed all input ports of the node that is passed in ``input``. - - Even though ``input_shape`` is not specified in the command line, the shapes for layers are inferred from the beginning of the original TensorFlow model to the point, at which the new input is defined. It has the same shape ``[1,64,112,112]`` as the model converted as a whole or without cutting off the beginning. - -3. Cut edge outcoming from layer by port number. To specify the outcoming port, use the following notation ``input=input_node:port``. To cut everything before ``ReLU`` layer, cut edge from ``InceptionV1/InceptionV1/Conv2d_1a_7x7/BatchNorm/batchnorm/add_1`` node to ``ReLU``: - - .. tab-set:: - - .. tab-item:: Python - :sync: py - - .. code-block:: py - :force: - - from openvino.tools.mo import convert_model - ov_model = convert_model("inception_v1.pb", batch=1, input="InceptionV1/InceptionV1/Conv2d_1a_7x7/BatchNorm/batchnorm/add_1:0", output="InceptionV1/InceptionV1/Conv2d_1a_7x7/Relu") - - .. tab-item:: CLI - :sync: cli - - .. code-block:: sh - - mo --input_model inception_v1.pb -b 1 --input InceptionV1/InceptionV1/Conv2d_1a_7x7/BatchNorm/batchnorm/add_1:0 --output InceptionV1/InceptionV1/Conv2d_1a_7x7/Relu --output_dir - - - The resulting Intermediate Representation looks as follows: - - .. code-block:: xml - :force: - - - - - - - ... - - - - - ... - - - ... - - layer> - - - - - - - -Inputs with Multiple Input Ports -################################ - -There are operations that contain more than one input port. In the example considered here, the convolution ``InceptionV1/InceptionV1/Conv2d_1a_7x7/convolution`` is such operation. When ``input_shape`` is not provided, a new ``Input`` layer is created for each dynamic input port for the node. If a port is evaluated to a constant blob, this constant remains in the model and a corresponding input layer is not created. TensorFlow convolution used in this model contains two ports: - -* port 0: input tensor for convolution (dynamic) -* port 1: convolution weights (constant) - -Following this behavior, ``convert_model()`` creates an ``Input`` layer for port 0 only, leaving port 1 as a constant. Thus, the result of: - -.. tab-set:: - - .. tab-item:: Python - :sync: py - - .. code-block:: py - :force: - - from openvino.tools.mo import convert_model - ov_model = convert_model("inception_v1.pb", batch=1, input="InceptionV1/InceptionV1/Conv2d_1a_7x7/convolution") - - .. tab-item:: CLI - :sync: cli - - .. code-block:: sh - - mo --input_model inception_v1.pb -b 1 --input InceptionV1/InceptionV1/Conv2d_1a_7x7/convolution --output_dir - - -is identical to the result of conversion of the model as a whole, because this convolution is the first executable operation in Inception V1. - -Different behavior occurs when ``input_shape`` is also used as an attempt to override the input shape: - -.. tab-set:: - - .. tab-item:: Python - :sync: py - - .. code-block:: py - :force: - - from openvino.tools.mo import convert_model - ov_model = convert_model("inception_v1.pb", input="InceptionV1/InceptionV1/Conv2d_1a_7x7/convolution", input_shape=[1,224,224,3]) - - .. tab-item:: CLI - :sync: cli - - .. 
code-block:: sh - - mo --input_model inception_v1.pb--input=InceptionV1/InceptionV1/Conv2d_1a_7x7/convolution --input_shape [1,224,224,3] --output_dir - - -An error occurs (for more information, see the :ref:`Model Conversion FAQ `): - -.. code-block:: sh - - [ ERROR ] Node InceptionV1/InceptionV1/Conv2d_1a_7x7/convolution has more than 1 input and input shapes were provided. - Try not to provide input shapes or specify input port with PORT:NODE notation, where PORT is an integer. - For more information, see FAQ #30 - -When ``input_shape`` is specified and the node contains multiple input ports, you need to provide an input port index together with an input node name. The input port index is specified in front of the node name with ``‘:’`` as a separator (``PORT:NODE``). In this case, the port index 0 of the node ``InceptionV1/InceptionV1/Conv2d_1a_7x7/convolution`` should be specified as ``0:InceptionV1/InceptionV1/Conv2d_1a_7x7/convolution``. - -The correct command line is: - -.. tab-set:: - - .. tab-item:: Python - :sync: py - - .. code-block:: py - :force: - - from openvino.tools.mo import convert_model - ov_model = convert_model("inception_v1.pb", input="0:InceptionV1/InceptionV1/Conv2d_1a_7x7/convolution", input_shape=[1,224,224,3]) - - .. tab-item:: CLI - :sync: cli - - .. code-block:: sh - - mo --input_model inception_v1.pb --input 0:InceptionV1/InceptionV1/Conv2d_1a_7x7/convolution --input_shape=[1,224,224,3] --output_dir - - diff --git a/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-conversion-api/[legacy]-embedding-preprocessing-computation.rst b/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-conversion-api/[legacy]-embedding-preprocessing-computation.rst deleted file mode 100644 index 1e1fe61e717eb3..00000000000000 --- a/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-conversion-api/[legacy]-embedding-preprocessing-computation.rst +++ /dev/null @@ -1,253 +0,0 @@ -[LEGACY] Embedding Preprocessing Computation -===================================================== - -.. danger:: - - The code described here has been **deprecated!** Do not use it to avoid working with a legacy solution. It will be kept for some time to ensure backwards compatibility, but **you should not use** it in contemporary applications. - - This guide describes a deprecated conversion method. The guide on the new and recommended method can be found in the :doc:`Conversion Parameters <../../../../openvino-workflow/model-preparation/conversion-parameters>` article. - -Input data for inference can be different from the training dataset and requires -additional preprocessing before inference. To accelerate the whole pipeline including -preprocessing and inference, model conversion API provides special parameters such as ``mean_values``, -``scale_values``, ``reverse_input_channels``, and ``layout``. - -Based on these parameters, model conversion API generates OpenVINO IR with additionally inserted sub-graphs -to perform the defined preprocessing. This preprocessing block can perform mean-scale -normalization of input data, reverting data along channel dimension, and changing -the data layout. See the following sections for details on the parameters, or the -:doc:`Overview of Preprocessing API <../../../../openvino-workflow/running-inference/optimize-inference/optimize-preprocessing>` -for the same functionality in OpenVINO Runtime. 
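For comparison with the runtime approach, here is a minimal sketch of expressing the same kind of preprocessing with ``ov.preprocess.PrePostProcessor`` after conversion. The layout, mean, and scale values below are placeholders chosen for illustration; the same pattern is used in the transition guide earlier in this document.

.. code-block:: py
   :force:

   import openvino as ov

   core = ov.Core()
   model = core.read_model("model.xml")  # placeholder path to a converted model

   prep = ov.preprocess.PrePostProcessor(model)
   # The "C" (channels) dimension must be named explicitly for mean/scale/reverse_channels.
   prep.input(0).tensor().set_layout(ov.Layout("NHWC"))
   prep.input(0).preprocess().reverse_channels()
   prep.input(0).preprocess().mean([123.0, 117.0, 104.0])
   prep.input(0).preprocess().scale([255.0, 255.0, 255.0])
   model = prep.build()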
- -Specifying Layout -################# - -You may need to set input layouts, as it is required by some preprocessing, for -example, setting a batch, applying mean or scales, and reversing input channels (BGR<->RGB). - -Layout defines the meaning of dimensions in shape and can be specified for both -inputs and outputs. Some preprocessing requires to set input layouts, for example, -setting a batch, applying mean or scales, and reversing input channels (BGR<->RGB). - -For the layout syntax, check the :doc:`Layout API overview <../../../../openvino-workflow/running-inference/optimize-inference/optimize-preprocessing/layout-api-overview>`. -To specify the layout, you can use the ``layout`` option followed by the layout value. - -For example, the following command specifies the ``NHWC`` layout for a Tensorflow -``nasnet_large`` model that was exported to the ONNX format: - - -.. tab-set:: - - .. tab-item:: Python - :sync: py - - .. code-block:: py - :force: - - from openvino.tools.mo import convert_model - ov_model = convert_model("tf_nasnet_large.onnx", layout="nhwc") - - .. tab-item:: CLI - :sync: cli - - .. code-block:: sh - - mo --input_model tf_nasnet_large.onnx --layout nhwc - - -Additionally, if a model has more than one input or needs both input and output -layouts specified, you need to provide the name of each input or output to apply the layout. - -For example, the following command specifies the layout for an ONNX ``Yolo v3 Tiny`` -model with its first input ``input_1`` in ``NCHW`` layout and second input ``image_shape`` -having two dimensions: batch and size of the image expressed as the ``N?`` layout: - -.. tab-set:: - - .. tab-item:: Python - :sync: py - - .. code-block:: py - :force: - - from openvino.tools.mo import convert_model - ov_model = convert_model("yolov3-tiny.onnx", layout={"input_1": "nchw", "image_shape": "n?"}) - - .. tab-item:: CLI - :sync: cli - - .. code-block:: sh - - mo --input_model yolov3-tiny.onnx --layout input_1(nchw),image_shape(n?) - - -Changing Model Layout -##################### - -Changing the model layout may be necessary if it differs from the one presented by input data. -Use either ``layout`` or ``source_layout`` with ``target_layout`` to change the layout. - -For example, for the same ``nasnet_large`` model mentioned previously, you can use -the following commands to provide data in the ``NCHW`` layout: - - -.. tab-set:: - - .. tab-item:: Python - :sync: py - - .. code-block:: py - :force: - - from openvino.tools.mo import convert_model - ov_model = convert_model("tf_nasnet_large.onnx", source_layout="nhwc", target_layout="nchw") - - ov_model = convert_model("tf_nasnet_large.onnx", layout="nhwc->nchw") - - .. tab-item:: CLI - :sync: cli - - .. code-block:: sh - - mo --input_model tf_nasnet_large.onnx --source_layout nhwc --target_layout nchw - - mo --input_model tf_nasnet_large.onnx --layout "nhwc->nchw" - - -Again, if a model has more than one input or needs both input and output layouts -specified, you need to provide the name of each input or output to apply the layout. - -For example, to provide data in the ``NHWC`` layout for the `Yolo v3 Tiny` model -mentioned earlier, use the following commands: - -.. tab-set:: - - .. tab-item:: Python - :sync: py - - .. 
code-block:: py - :force: - - from openvino.tools.mo import convert_model - ov_model = convert_model("yolov3-tiny.onnx", source_layout={"input_1": "nchw", "image_shape": "n?"}, target_layout={"input_1": "nhwc"}) - - ov_model = convert_model("yolov3-tiny.onnx", layout={"input_1": "nchw->nhwc", "image_shape": "n?"} - - .. tab-item:: CLI - :sync: cli - - .. code-block:: sh - - mo --input_model yolov3-tiny.onnx --source_layout "input_1(nchw),image_shape(n?)" --target_layout "input_1(nhwc)" - - mo --input_model yolov3-tiny.onnx --layout "input_1(nchw->nhwc),image_shape(n?)" - - -Specifying Mean and Scale Values -################################ - -Neural network models are usually trained with the normalized input data. This -means that the input data values are converted to be in a specific range, for example, -``[0, 1]`` or ``[-1, 1]``. Sometimes, the mean values (mean images) are subtracted -from the input data values as part of the preprocessing. - -There are two cases of how the input data preprocessing is implemented. - -* The input preprocessing operations are a part of a model. - - In this case, the application does not perform a separate preprocessing step: - everything is embedded into the model itself. ``convert_model()`` will generate the - ov.Model with required preprocessing operations, and no ``mean`` and - ``scale`` parameters are required. -* The input preprocessing operations are not a part of a model and the preprocessing - is performed within the application which feeds the model with input data. - - In this case, information about mean/scale values should be provided to ``convert_model()`` - to embed it to the generated ``ov.Model``. - -Model conversion API represented by ``convert_model()`` provides command-line parameters -to specify the values: ``mean_values``, ``scale_values``, ``scale``. Using these parameters, -model conversion API embeds the corresponding preprocessing block for mean-value -normalization of the input data and optimizes this block so that the preprocessing -takes negligible time for inference. - -For example, the following command runs model conversion for the PaddlePaddle UNet -model and applies mean-scale normalization to the input data: - -.. tab-set:: - - .. tab-item:: Python - :sync: py - - .. code-block:: py - :force: - - from openvino.tools.mo import convert_model - ov_model = convert_model("unet.pdmodel", mean_values=[123,117,104], scale=255) - - .. tab-item:: CLI - :sync: cli - - .. code-block:: sh - - mo --input_model unet.pdmodel --mean_values [123,117,104] --scale 255 - - -Reversing Input Channels -######################## - -Sometimes, input images for your application can be of the RGB (or BGR) format -and the model is trained on images of the BGR (or RGB) format, which is in the -opposite order of color channels. In this case, it is important to preprocess the -input images by reverting the color channels before inference. - -To embed this preprocessing step into ``ov.Model``, model conversion API provides the -``reverse_input_channels`` command-line parameter to shuffle the color channels. - -The ``reverse_input_channels`` parameter can be used to preprocess the model -input in the following cases: - -* Only one dimension in the input shape has a size equal to ``3``. -* One dimension has an undefined size and is marked as ``C`` channel using ``layout`` parameters. 
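The second case can be illustrated with a model whose channel dimension is not statically equal to ``3``: marking that dimension as ``C`` through ``layout`` tells ``reverse_input_channels`` which axis to flip. This is a minimal sketch with an assumed model file name:

.. code-block:: py
   :force:

   from openvino.tools.mo import convert_model

   # Hypothetical model with an undefined channel dimension; "nhwc" marks the
   # last dimension as C so the channel reversal knows where to apply.
   ov_model = convert_model(
       "model.onnx",
       layout="nhwc",
       reverse_input_channels=True,
   )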
- -Using the ``reverse_input_channels`` parameter, model conversion API embeds the corresponding -preprocessing block for reverting the input data along channel dimension and optimizes -this block so that the preprocessing takes only negligible time for inference. - -For example, the following command launches model conversion for the TensorFlow AlexNet -model and embeds the ``reverse_input_channel`` preprocessing block into OpenVINO IR: - - -.. tab-set:: - - .. tab-item:: Python - :sync: py - - .. code-block:: py - :force: - - from openvino.tools.mo import convert_model - ov_model = convert_model("alexnet.pb", reverse_input_channels=True) - - .. tab-item:: CLI - :sync: cli - - .. code-block:: sh - - mo --input_model alexnet.pb --reverse_input_channels - - -.. note:: - - If both mean and scale values are specified, the mean is subtracted first and - then the scale is applied regardless of the order of options in the command-line. - Input values are *divided* by the scale value(s). If the ``reverse_input_channels`` - option is also used, ``reverse_input_channels`` will be applied first, then ``mean`` - and after that ``scale``. The data flow in the model looks as follows: - ``Parameter -> ReverseInputChannels -> Mean apply-> Scale apply -> the original body of the model``. - -Additional Resources -#################### - -* :doc:`Overview of Preprocessing API <../../../../openvino-workflow/running-inference/optimize-inference/optimize-preprocessing>` - diff --git a/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-conversion-api/[legacy]-model-optimizer-faq.rst b/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-conversion-api/[legacy]-model-optimizer-faq.rst deleted file mode 100644 index f035101d715e9b..00000000000000 --- a/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-conversion-api/[legacy]-model-optimizer-faq.rst +++ /dev/null @@ -1,947 +0,0 @@ -[LEGACY] Model Optimizer Frequently Asked Questions -=========================================================== - - -.. important:: - - All of the issues below refer to :doc:`legacy functionalities <../legacy-model-optimizer-extensibility>`. - -If your question is not covered by the topics below, use the -`OpenVINO Support page `__, -where you can participate in a free forum discussion. - -.. warning:: - - Note that OpenVINO support for Apache MXNet, Caffe, and Kaldi has been discontinued. - -.. _question-1: - -Q1. What does the message "[ ERROR ]: Current caffe.proto does not contain field" mean? -##################################################################################################################################################### - -**A:** Internally, Model Optimizer uses a protobuf library to parse and load Caffe models. This library requires a file grammar and a generated parser. For a Caffe fallback, Model Optimizer uses a Caffe-generated parser for a Caffe-specific ``.proto`` file (which is usually located in the ``src/caffe/proto`` directory). Make sure that you install exactly the same version of Caffe (with Python interface) as that was used to create the model. - -If you just want to experiment with Model Optimizer and test a Python extension for working with your custom -layers without building Caffe, add the layer description to the ``caffe.proto`` file and generate a parser for it. 
- -For example, to add the description of the ``CustomReshape`` layer, which is an artificial layer not present in any ``caffe.proto`` files: - -1. Add the following lines to the ``caffe.proto`` file: - - .. code-block:: shell - - package mo_caffe; // To avoid conflict with Caffe system, it is highly recommended to specify different package name. - ... - message LayerParameter { - // Other layers parameters description. - ... - optional CustomReshapeParameter custom_reshape_param = 546; // 546 - ID is any number not present in caffe.proto. - } - // The lines from here to the end of the file are describing contents of this parameter. - message CustomReshapeParameter { - optional BlobShape shape = 1; // Just use the same parameter type as some other Caffe layers. - } - - -2. Generate a new parser: - - .. code-block:: shell - - cd /openvino/tools/mo/front/caffe/proto - python3 generate_caffe_pb2.py --input_proto /src/caffe/proto/caffe.proto - - - where ``PATH_TO_CUSTOM_CAFFE`` is the path to the root directory of custom Caffe. - -3. Now, Model Optimizer is able to load the model into memory and start working with your extensions if there are any. - - However, since your model has custom layers, you must register them as custom. To learn more about it, refer to the :doc:`[Legacy] Custom Layers in Model Optimizer <../legacy-model-optimizer-extensibility>`. - -.. _question-2: - -Q2. How do I create a bare caffemodel, if I have only prototxt? -##################################################################################################################################################### - -**A:** You need the Caffe Python interface. In this case, do the following: - -.. code-block:: shell - - python3 - import caffe - net = caffe.Net('/my_net.prototxt', caffe.TEST) - net.save('/my_net.caffemodel') - - -.. _question-3: - -Q3. What does the message "[ ERROR ]: Unable to create ports for node with id" mean? -##################################################################################################################################################### - -**A:** Most likely, Model Optimizer does not know how to infer output shapes of some layers in the given topology. -To lessen the scope, compile the list of layers that are custom for Model Optimizer: present in the topology, -absent in the :doc:`list of supported operations <../../../../about-openvino/compatibility-and-support/supported-operations>` for the target framework. -Then, refer to available options in the corresponding section in the :doc:`[Legacy] Custom Layers in Model Optimizer <../legacy-model-optimizer-extensibility>` page. - -.. _question-7: - -Q7. What does the message "Invalid proto file: there is neither 'layer' nor 'layers' top-level messages" mean? -##################################################################################################################################################### - -**A:** The structure of any Caffe topology is described in the ``caffe.proto`` file of any Caffe version. For example, the following ``.proto`` file in Model Optimizer is used by default: ``mo/front/caffe/proto/my_caffe.proto``, with the structure: - -.. code-block:: sh - - message NetParameter { - // ... some other parameters - // The layers that make up the net. Each of their configurations, including - // connectivity and behavior, is specified as a LayerParameter. - repeated LayerParameter layer = 100; // ID 100 so layers are printed last. - // DEPRECATED: use 'layer' instead. 
- repeated V1LayerParameter layers = 2; - } - - -This means that any topology should contain layers as top-level structures in ``prototxt``. For example, see the `LeNet topology `__. - -.. _question-8: - -Q8. What does the message "Old-style inputs (via 'input_dims') are not supported. Please specify inputs via 'input_shape'" mean? -##################################################################################################################################################### - -**A:** The structure of any Caffe topology is described in the ``caffe.proto`` file for any Caffe version. For example, the following ``.proto`` file in Model Optimizer is used by default: ``mo/front/caffe/proto/my_caffe.proto``, with the structure: - -.. code-block:: sh - - message NetParameter { - - optional string name = 1; // consider giving the network a name - // DEPRECATED. See InputParameter. The input blobs to the network. - repeated string input = 3; - // DEPRECATED. See InputParameter. The shape of the input blobs. - repeated BlobShape input_shape = 8; - // 4D input dimensions -- deprecated. Use "input_shape" instead. - // If specified, for each input blob there should be four - // values specifying the num, channels, height and width of the input blob. - // Thus, there should be a total of (4 * #input) numbers. - repeated int32 input_dim = 4; - // ... other parameters - } - - -Therefore, the input layer of the provided model must be specified in one of the following styles: - -* - - .. code-block:: sh - - input: "data" - input_shape - { - dim: 1 - dim: 3 - dim: 227 - dim: 227 - } - - -* - - .. code-block:: sh - - input: "data" - input_shape - { - dim: 1 - dim: 3 - dim: 600 - dim: 1000 - } - input: "im_info" - input_shape - { - dim: 1 - dim: 3 - } - -* - - .. code-block:: sh - - layer - { - name: "data" - type: "Input" - top: "data" - input_param {shape: {dim: 1 dim: 3 dim: 600 dim: 1000}} - } - layer - { - name: "im_info" - type: "Input" - top: "im_info" - input_param {shape: {dim: 1 dim: 3}} - } - -* - - .. code-block:: sh - - input: "data" - input_dim: 1 - input_dim: 3 - input_dim: 500 - - -However, if your model contains more than one input, Model Optimizer is able to convert the model with inputs specified in one of the first three forms in the above list. The 4th form is not supported for multi-input topologies. - -.. _question-9: - -Q9. What does the message "Mean file for topologies with multiple inputs is not supported" mean? -##################################################################################################################################################### - -**A:** Model Optimizer does not support mean file processing for topologies with more than one input. In this case, you need to perform preprocessing of the inputs for a generated Intermediate Representation in OpenVINO Runtime to perform subtraction for every input of your multi-input model. See the :doc:`Overview of Preprocessing <../../../../openvino-workflow/running-inference/optimize-inference/optimize-preprocessing>` for details. - -.. _question-11: - -Q11. What does the message "Invalid prototxt file: value error" mean? -##################################################################################################################################################### - -**A:** There are multiple reasons why Model Optimizer does not accept a Caffe topology. See FAQs :ref:`#7 ` and :ref:`#20 `. - -.. _question-12: - -Q12. 
What does the message "Error happened while constructing caffe.Net in the Caffe fallback function" mean? -##################################################################################################################################################### - -**A:** Model Optimizer tried to infer a specified layer via the Caffe framework. However, it cannot construct a net using the Caffe Python interface. Make sure that your ``caffemodel`` and ``prototxt`` files are correct. To ensure that the problem is not in the ``prototxt`` file, see FAQ :ref:`#2 `. - -.. _question-13: - -Q13. What does the message "Cannot infer shapes due to exception in Caffe" mean? -##################################################################################################################################################### - -**A:** Model Optimizer tried to infer a custom layer via the Caffe framework, but the model could not be inferred using Caffe. This might happen if you try to convert the model with some noise weights and biases, which conflict with layers that have dynamic shapes. You should write your own extension for every custom layer your topology might have. For more details, refer to the :doc:`[Legacy] Model Optimizer Extensibility <../legacy-model-optimizer-extensibility>` page. - -.. _question-14: - -Q14. What does the message "Cannot infer shape for node {} because there is no Caffe available. Please register python infer function for op or use Caffe for shape inference" mean? -#################################################################################################################################################################################### - -**A:** Your model contains a custom layer and you have correctly registered it with the ``CustomLayersMapping.xml`` file. These steps are required to offload shape inference of the custom layer with the help of the system Caffe. However, Model Optimizer could not import a Caffe package. Make sure that you have built Caffe with a ``pycaffe`` target and added it to the ``PYTHONPATH`` environment variable. At the same time, it is highly recommended to avoid dependency on Caffe and write your own Model Optimizer extension for your custom layer. For more information, refer to FAQ :ref:`#44 `. - -.. _question-15: - -Q15. What does the message "Framework name can not be deduced from the given options. Use --framework to choose one of Caffe, TensorFlow, MXNet" mean? -###################################################################################################################################################### - -**A:** You have run Model Optimizer without a flag ``--framework caffe|tf``. Model Optimizer tries to deduce the framework by the extension of input model file (``.pb`` for TensorFlow, ``.caffemodel`` for Caffe, ``.params`` for Apache MXNet). Your input model might have a different extension and you need to explicitly set the source framework. For example, use ``--framework caffe``. - -.. _question-16: - -Q16. What does the message "Input shape is required to convert MXNet model. Please provide it with --input_shape" mean? -##################################################################################################################################################### - -**A:** Input shape was not provided. That is mandatory for converting an MXNet model to the OpenVINO Intermediate Representation, because MXNet models do not contain information about input shapes. Use the ``--input_shape`` flag to specify it. 
For more information about using the ``--input_shape``, refer to FAQ :ref:`#56 `. - -.. _question-17: - -.. _question-18: - -.. _question-19: - -Q19. What does the message "Both --scale and --scale_values are defined. Specify either scale factor or scale values per input channels" mean? -##################################################################################################################################################### - -**A:** The ``--scale`` option sets a scaling factor for all channels, while ``--scale_values`` sets a scaling factor per each channel. Using both of them simultaneously produces ambiguity, so you must use only one of them. For more information, refer to the **Using Framework-Agnostic Conversion Parameters** section: for :doc:`Converting a TensorFlow Model <[legacy]-supported-model-formats/[legacy]-convert-tensorflow>`. - -.. _question-20: - -Q20. What does the message "Cannot find prototxt file: for Caffe please specify --input_proto - a protobuf file that stores topology and --input_model that stores pre-trained weights" mean? -############################################################################################################################################################################################## - -**A:** Model Optimizer cannot find a ``.prototxt`` file for a specified model. By default, it must be located in the same directory as the input model with the same name (except extension). If any of these conditions is not satisfied, use ``--input_proto`` to specify the path to the ``.prototxt`` file. - -.. _question-21: - -.. _question-22: - -Q22. What does the message "Failed to create directory .. . Permission denied!" mean? -##################################################################################################################################################### - -**A:** Model Optimizer cannot create a directory specified via ``--output_dir``. Make sure that you have enough permissions to create the specified directory. - -.. _question-23: - -Q23. What does the message "Discovered data node without inputs and value" mean? -##################################################################################################################################################### - -**A:** One of the layers in the specified topology might not have inputs or values. Make sure that the provided ``caffemodel`` and ``protobuf`` files are correct. - -.. _question-24: - -Q24. What does the message "Part of the nodes was not translated to IE. Stopped" mean? -##################################################################################################################################################### - -**A:** Some of the operations are not supported by OpenVINO Runtime and cannot be translated to OpenVINO Intermediate Representation. You can extend Model Optimizer by allowing generation of new types of operations and implement these operations in the dedicated OpenVINO plugins. For more information, refer to the :doc:`OpenVINO Extensibility Mechanism <../../../openvino-extensibility>` guide. - -.. _question-25: - -Q25. What does the message "While creating an edge from .. to .. : node name is undefined in the graph. Check correctness of the input model" mean? -##################################################################################################################################################### - -**A:** Model Optimizer cannot build a graph based on a specified model. Most likely, it is incorrect. - -.. 
_question-26: - -Q26. What does the message "Node does not exist in the graph" mean? -##################################################################################################################################################### - -**A:** You might have specified an output node via the ``--output`` flag that does not exist in a provided model. Make sure that the specified output is correct and this node exists in the current model. - -.. _question-27: - -Q27. What does the message "--input parameter was provided. Other inputs are needed for output computation. Provide more inputs or choose another place to cut the net" mean? -############################################################################################################################################################################## - -**A:** Most likely, Model Optimizer tried to cut the model by a specified input. However, other inputs are needed. - -.. _question-28: - -Q28. What does the message "Placeholder node does not have an input port, but input port was provided" mean? -##################################################################################################################################################### - -**A:** You might have specified a placeholder node with an input node, while the placeholder node does not have it in the model. - -.. _question-29: - -Q29. What does the message "Port index is out of number of available input ports for node" mean? -##################################################################################################################################################### - -**A:** This error occurs when an incorrect input port is specified with the ``--input`` command line argument. When using ``--input``, you may optionally specify an input port in the form: ``X:node_name``, where ``X`` is an integer index of the input port starting from 0 and ``node_name`` is the name of a node in the model. This error occurs when the specified input port ``X`` is not in the range 0..(n-1), where n is the number of input ports for the node. Specify a correct port index, or do not use it if it is not needed. - -.. _question-30: - -Q30. What does the message "Node has more than 1 input and input shapes were provided. Try not to provide input shapes or specify input port with PORT:NODE notation, where PORT is an integer" mean? -###################################################################################################################################################################################################### - -**A:** This error occurs when an incorrect combination of the ``--input`` and ``--input_shape`` command line options is used. Using both ``--input`` and ``--input_shape`` is valid only if ``--input`` points to the ``Placeholder`` node, a node with one input port or ``--input`` has the form ``PORT:NODE``, where ``PORT`` is an integer port index of input for node ``NODE``. Otherwise, the combination of ``--input`` and ``--input_shape`` is incorrect. - - -.. _question-31: - -Q31. What does the message "Input port > 0 in --input is not supported if --input_shape is not provided. Node: NAME_OF_THE_NODE. Omit port index and all input ports will be replaced by placeholders. Or provide --input_shape" mean? 
-####################################################################################################################################################################################################################################### - -**A:** When using the ``PORT:NODE`` notation for the ``--input`` command line argument and ``PORT`` > 0, you should specify ``--input_shape`` for this input. This is a limitation of the current Model Optimizer implementation. - -.. note:: It is no longer relevant message since the limitation on input port index for model truncation has been resolved. - -.. _question-32: - -Q32. What does the message "No or multiple placeholders in the model, but only one shape is provided, cannot set it" mean? -##################################################################################################################################################### - -**A:** You might have provided only one shape for the placeholder, while there are none or multiple inputs in the model. Make sure that you have provided the correct data for placeholder nodes. - -.. _question-33: - -Q33. What does the message "The amount of input nodes for port is not equal to 1" mean? -##################################################################################################################################################### - -**A:** This error occurs when the ``SubgraphMatch.single_input_node`` function is used for an input port that supplies more than one node in a sub-graph. The ``single_input_node`` function can be used only for ports that has a single consumer inside the matching sub-graph. When multiple nodes are connected to the port, use the ``input_nodes`` function or ``node_by_pattern`` function instead of ``single_input_node``. For more details, refer to the **Graph Transformation Extensions** section in the :doc:`[Legacy] Model Optimizer Extensibility <../legacy-model-optimizer-extensibility/[legacy]-model-optimizer-extensions/[legacy]-graph-transformation-extensions>` guide. - -.. _question-34: - -Q34. What does the message "Output node for port has already been specified" mean? -##################################################################################################################################################### - -**A:** This error occurs when the ``SubgraphMatch._add_output_node`` function is called manually from user's extension code. This is an internal function, and you should not call it directly. - -.. _question-35: - -Q35. What does the message "Unsupported match kind.... Match kinds "points" or "scope" are supported only" mean? -##################################################################################################################################################### - -**A:** While using configuration file to implement a TensorFlow front replacement extension, an incorrect match kind was used. Only ``points`` or ``scope`` match kinds are supported. For more details, refer to the :doc:`[Legacy] Model Optimizer Extensibility <../legacy-model-optimizer-extensibility>` guide. - -.. _question-36: - -Q36. What does the message "Cannot write an event file for the TensorBoard to directory" mean? -##################################################################################################################################################### - -**A:** Model Optimizer tried to write an event file in the specified directory but failed to do that. 
That could happen when the specified directory does not exist or you do not have permissions to write in it. - -.. _question-37: - -Q37. What does the message "There is no registered 'infer' function for node with op = .. . Please implement this function in the extensions" mean? -##################################################################################################################################################### - -**A** Most likely, you tried to extend Model Optimizer with a new primitive, but you did not specify an infer function. For more information on extensions, see the :doc:`OpenVINO Extensibility Mechanism <../../../openvino-extensibility>` guide. - -.. _question-38: - -Q38. What does the message "Stopped shape/value propagation at node" mean? -##################################################################################################################################################### - -**A:** Model Optimizer cannot infer shapes or values for the specified node. It can happen because of the following reasons: a bug exists in the custom shape infer function, the node inputs have incorrect values/shapes, or the input shapes are incorrect. - -.. _question-39: - -Q39. What does the message "The input with shape .. does not have the batch dimension" mean? -##################################################################################################################################################### - -**A:** Batch dimension is the first dimension in the shape and it should be equal to 1 or undefined. In your case, it is not either equal to 1 or undefined, which is why the ``-b`` shortcut produces undefined and unspecified behavior. To resolve the issue, specify full shapes for each input with the ``--input_shape`` option. Run Model Optimizer with the ``--help`` option to learn more about the notation for input shapes. - -.. _question-40: - -Q40. What does the message "Not all output shapes were inferred or fully defined for node" mean? -##################################################################################################################################################### - -**A:** Most likely, the shape is not defined (partially or fully) for the specified node. You can use ``--input_shape`` with positive integers to override model input shapes. - -.. _question-41: - -Q41. What does the message "Shape for tensor is not defined. Can not proceed" mean? -##################################################################################################################################################### - -**A:** This error occurs when the ``--input`` command-line option is used to cut a model and ``--input_shape`` is not used to override shapes for a node, so a shape for the node cannot be inferred by Model Optimizer. You need to help Model Optimizer by specifying shapes with ``--input_shape`` for each node specified with the ``--input`` command-line option. - -.. _question-42: - -Q42. What does the message "Module TensorFlow was not found. Please install TensorFlow 1.2 or higher" mean? -##################################################################################################################################################### - -**A:** To convert TensorFlow models with Model Optimizer, TensorFlow 1.2 or newer must be installed. For more information on prerequisites, see the :doc:`Configuring Model Optimizer <../legacy-conversion-api>` guide. - -.. _question-43: - -Q43. 
What does the message "Cannot read the model file: it is incorrect TensorFlow model file or missing" mean? -##################################################################################################################################################### - -**A:** The model file should contain a frozen TensorFlow graph in the text or binary format. Make sure that ``--input_model_is_text`` is provided for a model in the text format. By default, a model is interpreted as binary file. - -.. _question-44: - -Q44. What does the message "Cannot pre-process TensorFlow graph after reading from model file. File is corrupt or has unsupported format" mean? -##################################################################################################################################################### - -**A:** Most likely, there is a problem with the specified file for the model. The file exists, but it has an invalid format or is corrupted. - -.. _question-45: - -Q45. What does the message "Found custom layer. Model Optimizer does not support this layer. Please, register it in CustomLayersMapping.xml or implement extension" mean? -########################################################################################################################################################################## - -**A:** This means that the layer ``{layer_name}`` is not supported in Model Optimizer. You will find a list of all unsupported layers in the corresponding section. You should implement the extensions for this layer. See :doc:`OpenVINO Extensibility Mechanism <../../../openvino-extensibility>` for more information. - -.. _question-46: - -Q46. What does the message "Custom replacement configuration file does not exist" mean? -##################################################################################################################################################### - -**A:** A path to the custom replacement configuration file was provided with the ``--transformations_config`` flag, but the file could not be found. Make sure the specified path is correct and the file exists. - -.. _question-47: - -Q47. What does the message "Extractors collection have case insensitive duplicates" mean? -##################################################################################################################################################### - -**A:** When extending Model Optimizer with new primitives, keep in mind that their names are case-insensitive. Most likely, another operation with the same name is already defined. For more information, see the :doc:`OpenVINO Extensibility Mechanism <../../../openvino-extensibility>` guide. - -.. _question-48: - -Q48. What does the message "Input model name is not in an expected format, cannot extract iteration number" mean? -##################################################################################################################################################### - -**A:** Model Optimizer cannot load an MXNet model in the specified file format. Make sure you use the ``.json`` or ``.param`` format. - -.. _question-49: - -Q49. What does the message "Cannot convert type of placeholder because not all of its outputs are 'Cast' to float operations" mean? -##################################################################################################################################################### - -**A:** There are models where ``Placeholder`` has the UINT8 type and the first operation after it is 'Cast', which casts the input to FP32. 
Model Optimizer detected that the ``Placeholder`` has the UINT8 type, but the next operation is not 'Cast' to float. Model Optimizer does not support such a case. Make sure you change the model to have ``Placeholder`` for FP32. - -.. _question-50: - -Q50. What does the message "Data type is unsupported" mean? -##################################################################################################################################################### - -**A:** Model Optimizer cannot read the value with the specified data type. Currently, the following types are supported: bool, float16, float32, double, int8, int16, int32, int64, uint8, uint16, uint32, uint64, str. - -.. _question-51: - -Q51. What does the message "No node with name ..." mean? -##################################################################################################################################################### - -**A:** Model Optimizer tried to access a node that does not exist. This could happen if you have incorrectly specified placeholder, input or output node name. - -.. _question-52: - -Q52. What does the message "Module MXNet was not found. Please install MXNet 1.0.0" mean? -##################################################################################################################################################### - -**A:** To convert MXNet models with Model Optimizer, Apache MXNet 1.0.0 must be installed. For more information about prerequisites, see the :doc:`Configuring Model Optimizer <../legacy-conversion-api>` guide. - -.. _question-53: - -Q53. What does the message "The following error happened while loading MXNet model .." mean? -##################################################################################################################################################### - -**A:** Most likely, there is a problem with loading of the MXNet model. Make sure the specified path is correct, the model exists and is not corrupted, and you have sufficient permissions to work with it. - -.. _question-54: - -Q54. What does the message "The following error happened while processing input shapes: .." mean? -##################################################################################################################################################### - -**A:** Make sure inputs are defined and have correct shapes. You can use ``--input_shape`` with positive integers to override model input shapes. - -.. _question-55: - -Q55. What does the message "Attempt to register of custom name for the second time as class. Note that custom names are case-insensitive" mean? -##################################################################################################################################################### - -**A:** When extending Model Optimizer with new primitives, keep in mind that their names are case-insensitive. Most likely, another operation with the same name is already defined. For more information, see the :doc:`OpenVINO Extensibility Mechanism <../../../openvino-extensibility>` guide. - -.. _question-56: - -Q56. What does the message "Both --input_shape and --batch were provided. Please, provide only one of them" mean? -##################################################################################################################################################### - -**A:** Specifying the batch and the input shapes at the same time is not supported. You must specify a desired batch as the first value of the input shape. - -.. _question-57: - -Q57. 
What does the message "Input shape .. cannot be parsed" mean? -##################################################################################################################################################### - -**A:** The specified input shape cannot be parsed. Define it in one of the following ways: - -* - - .. code-block:: shell - - mo --input_model .caffemodel --input_shape (1,3,227,227) - -* - - .. code-block:: shell - - mo --input_model .caffemodel --input_shape [1,3,227,227] - -* In case of multi input topology you should also specify inputs: - - .. code-block:: shell - - mo --input_model /path-to/your-model.caffemodel --input data,rois --input_shape (1,3,227,227),(1,6,1,1) - - -Keep in mind that there is no space between and inside the brackets for input shapes. - -.. _question-58: - -Q58. What does the message "Please provide input layer names for input layer shapes" mean? -##################################################################################################################################################### - -**A:** When specifying input shapes for several layers, you must provide names for inputs, whose shapes will be overwritten. Additional information for ``--input_shape`` is in FAQ :ref:`#56 `. - -.. _question-59: - -Q59. What does the message "Values cannot be parsed" mean? -##################################################################################################################################################### - -**A:** Mean values for the given parameter cannot be parsed. It should be a string with a list of mean values. For example, in '(1,2,3)', 1 stands for the RED channel, 2 for the GREEN channel, 3 for the BLUE channel. - -.. _question-60: - -Q60. What does the message ".. channels are expected for given values" mean? -##################################################################################################################################################### - -**A:** The number of channels and the number of given values for mean values do not match. The shape should be defined as '(R,G,B)' or '[R,G,B]'. The shape should not contain undefined dimensions (? or -1). The order of values is as follows: (value for a RED channel, value for a GREEN channel, value for a BLUE channel). - -.. _question-61: - -Q61. What does the message "You should specify input for each mean value" mean? -##################################################################################################################################################### - -**A:** Most likely, you didn't specify inputs using ``--mean_values``. Specify inputs with the ``--input`` flag. For usage examples, refer to the FAQ :ref:`#62 `. - -.. _question-62: - -Q62. What does the message "You should specify input for each scale value" mean? -##################################################################################################################################################### - -**A:** Most likely, you didn't specify inputs using ``--scale_values``. Specify inputs with the ``--input`` flag. For usage examples, refer to the FAQ :ref:`#63 `. - -.. _question-63: - -Q63. What does the message "Number of inputs and mean values does not match" mean? -##################################################################################################################################################### - -**A:** The number of specified mean values and the number of inputs must be equal. - -.. _question-64: - -Q64. 
What does the message "Number of inputs and scale values does not match" mean? -##################################################################################################################################################### - -**A:** The number of specified scale values and the number of inputs must be equal. - -.. _question-65: - -Q65. What does the message "No class registered for match kind ... Supported match kinds are .. " mean? -##################################################################################################################################################### - -**A:** A replacement defined in the configuration file for sub-graph replacement, using node names patterns or start/end nodes, has the ``match_kind`` attribute. The attribute may have only one of the values: ``scope`` or ``points``. If a different value is provided, this error is displayed. - -.. _question-66: - -Q66. What does the message "No instance(s) is(are) defined for the custom replacement" mean? -##################################################################################################################################################### - -**A:** A replacement defined in the configuration file for sub-graph replacement, using node names patterns or start/end nodes, has the ``instances`` attribute. This attribute is mandatory. This error will occur if the attribute is missing. For more details, refer to the **Graph Transformation Extensions** section in the :doc:`[Legacy] Model Optimizer Extensibility <../legacy-model-optimizer-extensibility>` guide. - -.. _question-67: - -Q67. What does the message "The instance must be a single dictionary for the custom replacement with id .." mean? -##################################################################################################################################################### - -**A:** A replacement defined in the configuration file for sub-graph replacement, using start/end nodes, has the ``instances`` attribute. For this type of replacement, the instance must be defined with a dictionary with two keys ``start_points`` and ``end_points``. Values for these keys are lists with the start and end node names, respectively. For more details, refer to the **Graph Transformation Extensions** section in the :doc:`[Legacy] Model Optimizer Extensibility <../legacy-model-optimizer-extensibility/[legacy]-model-optimizer-extensions/[legacy]-graph-transformation-extensions>` guide. - -.. _question-68: - -Q68. What does the message "No instances are defined for replacement with id .. " mean? -##################################################################################################################################################### - -**A:** A replacement for the specified id is not defined in the configuration file. For more information, refer to the FAQ :ref:`#65 `. - -.. _question-69: - -Q69. What does the message "Custom replacements configuration file .. does not exist" mean? -##################################################################################################################################################### - -**A:** The path to a custom replacement configuration file was provided with the ``--transformations_config`` flag, but it cannot be found. Make sure the specified path is correct and the file exists. - -.. _question-70: - -Q70. What does the message "Failed to parse custom replacements configuration file .." mean? 
-##################################################################################################################################################### - -**A:** The file for custom replacement configuration provided with the ``--transformations_config`` flag cannot be parsed. In particular, it should have a valid JSON structure. For more details, refer to the `JSON Schema Reference `__ page. - -.. _question-71: - -Q71. What does the message "One of the custom replacements in the configuration file .. does not contain attribute 'id'" mean? -##################################################################################################################################################### - -**A:** Every custom replacement should declare a set of mandatory attributes and their values. For more details, refer to FAQ :ref:`#71 `. - -.. _question-72: - -Q72. What does the message "File .. validation failed" mean? -##################################################################################################################################################### - -**A:** The file for custom replacement configuration provided with the ``--transformations_config`` flag cannot pass validation. Make sure you have specified ``id``, ``instances``, and ``match_kind`` for all the patterns. - -.. _question-73: - -Q73. What does the message "Cannot update the file .. because it is broken" mean? -##################################################################################################################################################### - -**A:** The custom replacement configuration file provided with the ``--tensorflow_custom_operations_config_update`` cannot be parsed. Make sure that the file is correct and refer to FAQ :ref:`#68 `, :ref:`#69 `, :ref:`#70 `, and :ref:`#71 `. - -.. _question-74: - -Q74. What does the message "End node .. is not reachable from start nodes: .." mean? -##################################################################################################################################################### - -**A:** This error occurs when you try to make a sub-graph match. It is detected that between the start and end nodes that were specified as inputs/outputs for the subgraph to find, there are nodes marked as outputs but there is no path from them to the input nodes. Make sure the subgraph you want to match does actually contain all the specified output nodes. - -.. _question-75: - -Q75. What does the message "Sub-graph contains network input node .." mean? -##################################################################################################################################################### - -**A:** The start or end node for the sub-graph replacement using start/end nodes is specified incorrectly. Model Optimizer finds internal nodes of the sub-graph strictly "between" the start and end nodes, and then adds all input nodes to the sub-graph (and the inputs of their inputs, etc.) for these "internal" nodes. This error reports that Model Optimizer reached input node during this phase. This means that the start/end points are specified incorrectly in the configuration file. For more details, refer to the **Graph Transformation Extensions** section in the :doc:`[Legacy] Model Optimizer Extensibility <../legacy-model-optimizer-extensibility/[legacy]-model-optimizer-extensions/[legacy]-graph-transformation-extensions>` guide. - -.. _question-76: - -Q76. What does the message "... elements of ... were clipped to infinity while converting a blob for node [...] 
to ..." mean? -##################################################################################################################################################### - -**A:** This message may appear when the ``--compress_to_fp16`` command-line option is used. This option implies compression of all the model weights, biases, and other constant values to FP16. If a value of a constant is out of the range of valid FP16 values, the value is converted to positive or negative infinity. It may lead to incorrect results of inference or may not be a problem, depending on the model. The number of such elements and the total number of elements in the constant value is printed out together with the name of the node, where this value is used. - -.. _question-77: - -Q77. What does the message "... elements of ... were clipped to zero while converting a blob for node [...] to ..." mean? -##################################################################################################################################################### - -**A:** This message may appear when the ``--compress_to_fp16`` command-line option is used. This option implies conversion of all blobs in the mode to FP16. If a value in the blob is so close to zero that it cannot be represented as a valid FP16 value, it is converted to a true zero FP16 value. Depending on the model, it may lead to incorrect results of inference or may not be a problem. The number of such elements and the total number of elements in the blob are printed out together with a name of the node, where this blob is used. - -.. _question-78: - -Q78. What does the message "The amount of nodes matched pattern ... is not equal to 1" mean? -##################################################################################################################################################### - -**A:** This error occurs when the ``SubgraphMatch.node_by_pattern`` function is used with a pattern that does not uniquely identify a single node in a sub-graph. Try to extend the pattern string to make unambiguous match to a single sub-graph node. For more details, refer to the **Graph Transformation Extensions** section in the :doc:`[Legacy] Model Optimizer Extensibility <../legacy-model-optimizer-extensibility/[legacy]-model-optimizer-extensions/[legacy]-graph-transformation-extensions>` guide. - -.. _question-79: - -Q79. What does the message "The topology contains no "input" layers" mean? -##################################################################################################################################################### - -**A:** Your Caffe topology ``.prototxt`` file is intended for training. Model Optimizer expects a deployment-ready ``.prototxt`` file. To fix the problem, prepare a deployment-ready ``.prototxt`` file. Preparation of a deploy-ready topology usually results in removing ``data`` layer(s), adding ``input`` layer(s), and removing loss layer(s). - -.. _question-80: - -Q80. What does the message "Warning: please expect that Model Optimizer conversion might be slow" mean? -##################################################################################################################################################### - -**A:** You are using an unsupported Python version. Use only versions 3.4 - 3.6 for the C++ ``protobuf`` implementation that is supplied with OpenVINO toolkit. You can still boost the conversion speed by building the protobuf library from sources. 
For complete instructions about building ``protobuf`` from sources, see the appropriate section in the :doc:`Converting a Model to Intermediate Representation <../legacy-conversion-api>` guide. - -.. _question-81: - -Q81. What does the message "Arguments --nd_prefix_name, --pretrained_model_name and --input_symbol should be provided. Please provide all or do not use any." mean? -#################################################################################################################################################################### - -**A:** This error occurs if you did not provide the ``--nd_prefix_name``, ``--pretrained_model_name``, and ``--input_symbol`` parameters. -Model Optimizer requires both ``.params`` and ``.nd`` model files to merge into the result file (``.params``). -Topology description (``.json`` file) should be prepared (merged) in advance and provided with the ``--input_symbol`` parameter. - -If you add additional layers and weights that are in ``.nd`` files to your model, Model Optimizer can build a model -from one ``.params`` file and two additional ``.nd`` files (``*_args.nd``, ``*_auxs.nd``). -To do that, provide both CLI options or do not pass them if you want to convert an MXNet model without additional weights. - -.. _question-82: - -Q82. What does the message "You should specify input for mean/scale values" mean? -##################################################################################################################################################### - -**A:** When the model has multiple inputs and you want to provide mean/scale values, you need to pass those values for each input. More specifically, the number of passed values should be the same as the number of inputs of the model. -For more information, refer to the :doc:`Converting a Model to Intermediate Representation <[legacy]-setting-input-shapes>` guide. - -.. _question-83: - -Q83. What does the message "Input with name ... not found!" mean? -##################################################################################################################################################### - -**A:** When you passed the mean/scale values and specify names of input layers of the model, you might have used the name that does not correspond to any input layer. Make sure that you list only names of the input layers of your model when passing values with the ``--input`` option. -For more information, refer to the :doc:`Converting a Model to Intermediate Representation <[legacy]-setting-input-shapes>` guide. - -.. _question-84: - -Q84. What does the message "Specified input json ... does not exist" mean? -##################################################################################################################################################### - -**A:** Most likely, ``.json`` file does not exist or has a name that does not match the notation of Apache MXNet. Make sure the file exists and has a correct name. - -.. _question-85: - -Q85. What does the message "Unsupported Input model file type ... Model Optimizer support only .params and .nd files format" mean? -##################################################################################################################################################### - -**A:** Model Optimizer for Apache MXNet supports only ``.params`` and ``.nd`` files formats. Most likely, you specified an unsupported file format in ``--input_model``. - -.. _question-86: - -Q86. What does the message "Operation ... not supported. 
Please register it as custom op" mean? -##################################################################################################################################################### - -**A:** Model Optimizer tried to load the model that contains some unsupported operations. -If you want to convert model that contains unsupported operations, you need to prepare extension for all such operations. -For more information, refer to the :doc:`OpenVINO Extensibility Mechanism <../../../openvino-extensibility>` guide. - -.. _question-87: - -Q87. What does the message "Can not register Op ... Please, call function 'register_caffe_python_extractor' with parameter 'name'" mean? -##################################################################################################################################################### - -**A:** This error appears if the class of implementation of ``Op`` for Python Caffe layer could not be used by Model Optimizer. Python layers should be handled differently comparing to ordinary Caffe layers. - -In particular, you need to call the function ``register_caffe_python_extractor`` and pass ``name`` as the second argument of the function. -The name should be the compilation of the layer name with the module name separated by a dot. - -For example, your topology contains this layer with type ``Python``: - -.. code-block:: py - :force: - - layer { - name: 'proposal' - type: 'Python' - ... - python_param { - module: 'rpn.proposal_layer' - layer: 'ProposalLayer' - param_str: "'feat_stride': 16" - } - } - - -The first step is to implement an extension for this layer in Model Optimizer as an ancestor of ``Op`` class: - -.. code-block:: py - :force: - - class ProposalPythonExampleOp(Op): - op = 'Proposal' - - def __init__(self, graph: nx.MultiDiGraph, attrs: dict): - ... - - -It is mandatory to call two functions right after the implementation of that class: - -.. code-block:: py - :force: - - class ProposalPythonExampleOp(Op): - ... - - register_caffe_python_extractor(ProposalPythonExampleOp, 'rpn.proposal_layer.ProposalLayer') - Op.excluded_classes.append(ProposalPythonExampleOp) - - -Note that the first call ``register_caffe_python_extractor(ProposalPythonExampleOp, 'rpn.proposal_layer.ProposalLayer')`` registers an extension of the layer in Model Optimizer, which will be found by the specific name (mandatory to join module name and layer name): ``rpn.proposal_layer.ProposalLayer``. - -The second call prevents Model Optimizer from using this extension as if it is an extension for -a layer with type ``Proposal``. Otherwise, this layer can be chosen as an implementation of extension that can lead to potential issues. -For more information, refer to the :doc:`OpenVINO Extensibility Mechanism <../../../openvino-extensibility>` guide. - -.. _question-88: - -Q88. What does the message "Model Optimizer is unable to calculate output shape of Memory node .." mean? -##################################################################################################################################################### - -**A:** Model Optimizer supports only ``Memory`` layers, in which ``input_memory`` goes before ``ScaleShift`` or the ``FullyConnected`` layer. -This error message means that in your model the layer after input memory is not of the ``ScaleShift`` or ``FullyConnected`` type. -This is a known limitation. - -.. _question-89: - -Q89. What do the messages "File ... 
does not appear to be a Kaldi file (magic number does not match)", "Kaldi model should start with tag" mean? -######################################################################################################################################################### - -**A:** These error messages mean that Model Optimizer does not support your Kaldi model, because the ``checksum`` of the model is not -16896 (the model should start with this number), or the model file does not contain the ```` tag as a starting one. -Make sure that you provide a path to a true Kaldi model and try again. - -.. _question-90: - -Q90. What do the messages "Expect counts file to be one-line file." or "Expect counts file to contain list of integers" mean? -##################################################################################################################################################### - -**A:** These messages mean that the file counts you passed contain not one line. The count file should start with -``[`` and end with ``]``, and integer values should be separated by spaces between those brackets. - -.. _question-91: - -Q91. What does the message "Model Optimizer is not able to read Kaldi model .." mean? -##################################################################################################################################################### - -**A:** There are multiple reasons why Model Optimizer does not accept a Kaldi topology, including: -the file is not available or does not exist. Refer to FAQ :ref:`#88 `. - -.. _question-92: - -Q92. What does the message "Model Optimizer is not able to read counts file .." mean? -##################################################################################################################################################### - -**A:** There are multiple reasons why Model Optimizer does not accept a counts file, including: -the file is not available or does not exist. Refer to FAQ :ref:`#89 `. - -.. _question-93: - -Q93. What does the message "For legacy MXNet models Model Optimizer does not support conversion of old MXNet models (trained with 1.0.0 version of MXNet and lower) with custom layers." mean? -############################################################################################################################################################################################### - -**A:** This message means that if you have a model with custom layers and its JSON file has been generated with Apache MXNet version -lower than 1.0.0, Model Optimizer does not support such topologies. If you want to convert it, you have to rebuild -MXNet with unsupported layers or generate a new JSON file with Apache MXNet version 1.0.0 or higher. You also need to implement -OpenVINO extension to use custom layers. -For more information, refer to the :doc:`OpenVINO Extensibility Mechanism <../../../openvino-extensibility>` guide. - -.. _question-94: - -Q94. What does the message "Expected token ````, has ``...``" mean? -##################################################################################################################################################### - -**A:** This error messages mean that Model Optimizer does not support your Kaldi model, because the Net contains ``ParallelComponent`` that does not end with the ```` tag. -Make sure that you provide a path to a true Kaldi model and try again. - -.. _question-95: - -.. _question-96: - -.. _question-97: - -Q97. What does the message "Graph contains a cycle. Can not proceed .." mean? 
-##################################################################################################################################################### - -**A:** Model Optimizer supports only straightforward models without cycles. - -There are multiple ways to avoid cycles: - -For Tensorflow: - -* :doc:`Convert models, created with TensorFlow Object Detection API <[legacy]-supported-model-formats/[legacy]-conversion-tutorials/convert-tensorflow-object-detection>` - -For all frameworks: - -1. :doc:`Replace cycle containing Sub-graph in Model Optimizer [Legacy Solution] <../legacy-model-optimizer-extensibility>` -2. See :doc:`OpenVINO Extensibility Mechanism <../../../openvino-extensibility>` - -or - -* Edit the model in its original framework to exclude cycle. - -.. _question-98: - -.. _question-99: - -.. _question-100: - -Q100. What does the message "Interp layer shape inference function may be wrong, please, try to update layer shape inference function in the file (extensions/ops/interp.op at the line ...)." mean? -#################################################################################################################################################################################################### - -**A:** There are many flavors of Caffe framework, and most layers in them are implemented identically. -However, there are exceptions. For example, the output value of layer Interp is calculated differently in Deeplab-Caffe and classic Caffe. Therefore, if your model contains layer Interp and the conversion of your model has failed, modify the ``interp_infer`` function in the ``extensions/ops/interp.op`` file according to the comments in the file. - -.. _question-101: - -Q101. What does the message "Mean/scale values should ..." mean? -##################################################################################################################################################### - -**A:** It means that your mean/scale values have a wrong format. Specify mean/scale values in the form of ``layer_name(val1,val2,val3)``. -You need to specify values for each input of the model. For more information, refer to the :doc:`Converting a Model to Intermediate Representation <[legacy]-setting-input-shapes>` guide. - -.. _question-102: - -Q102. What does the message "Operation _contrib_box_nms is not supported ..." mean? -##################################################################################################################################################### - -**A:** It means that you are trying to convert a topology contains the ``_contrib_box_nms`` operation which is not supported directly. However, the sub-graph of operations including ``_contrib_box_nms`` could be replaced with the DetectionOutput layer if your topology is one of the ``gluoncv`` topologies. Specify the ``--enable_ssd_gluoncv`` command-line parameter for Model Optimizer to enable this transformation. - -.. _question-103: - -Q103. What does the message "ModelOptimizer is not able to parse "\*.caffemodel" mean? -##################################################################################################################################################### - -**A:** If a ``*.caffemodel`` file exists and is correct, the error occurred possibly because of the use of Python protobuf implementation. 
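A quick way to test whether the pure-Python ``protobuf`` backend is the culprit is to request the C++ backend explicitly before re-running the conversion. This is only a sketch: ``PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION`` is a standard ``protobuf`` switch (not a Model Optimizer option), it takes effect only if the C++ extension is installed, and the model file name below is a placeholder:

.. code-block:: sh

   export PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=cpp
   mo --input_model <your_model>.caffemodel
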
In some cases, error messages may appear during model parsing, for example: "``utf-8`` codec can't decode byte 0xe0 in position 4: invalid continuation byte in field: mo_caffe.SpatialTransformerParameter.transform_type". You can either use a newer Python version (3.8 - 3.11) or build the ``cpp`` implementation of ``protobuf`` yourself for your version of Python. For the complete instructions about building ``protobuf`` from sources, see the appropriate section in the :doc:`Converting Models with Model Optimizer <../legacy-conversion-api>` guide. - -.. _question-104: - -.. _question-105: - -Q105. What does the message "The IR preparation was executed by the legacy MO path. ..." mean? -##################################################################################################################################################### - -**A:** For the models in ONNX format, there are two available paths of IR conversion. -The old one is handled by the old Python implementation, while the new one uses new C++ frontends. -Starting from the 2022.1 version, the default IR conversion path for ONNX models is processed using the new ONNX frontend. -Certain features, such as ``--extensions`` and ``--transformations_config``, are not yet fully supported on the new frontends. -The new frontends support only paths to shared libraries (.dll and .so) for ``--extensions``. They support JSON configurations with defined library fields for ``--transformations_config``. -Inputs freezing (enabled by ``--freeze_placeholder_with_value`` or ``--input`` arguments) is not supported by the new frontends. -The IR conversion falls back to the old path if a user does not select any expected path of conversion explicitly (with ``--use_new_frontend`` or ``--use_legacy_frontend`` MO arguments) and unsupported pre-defined scenario is detected on the new frontend path. - diff --git a/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-conversion-api/[legacy]-setting-input-shapes.rst b/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-conversion-api/[legacy]-setting-input-shapes.rst deleted file mode 100644 index 9e445742278568..00000000000000 --- a/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-conversion-api/[legacy]-setting-input-shapes.rst +++ /dev/null @@ -1,156 +0,0 @@ -[LEGACY] Setting Input Shapes -==================================== - -.. danger:: - - The code described here has been **deprecated!** Do not use it to avoid working with a legacy solution. It will be kept for some time to ensure backwards compatibility, but **you should not use** it in contemporary applications. - - This guide describes a deprecated conversion method. The guide on the new and recommended method can be found in the :doc:`Setting Input Shapes <../../../../openvino-workflow/model-preparation/setting-input-shapes>` article. - -With model conversion API you can increase your model's efficiency by providing an additional shape definition, with these two parameters: `input_shape` and `static_shape`. - - -.. meta:: - :description: Learn how to increase the efficiency of a model with MO by providing an additional shape definition with the input_shape and static_shape parameters. - - -Specifying input_shape parameter -################################ - -``convert_model()`` supports conversion of models with dynamic input shapes that contain undefined dimensions. 
-However, if the shape of data is not going to change from one inference request to another, -it is recommended to set up static shapes (when all dimensions are fully defined) for the inputs. -Doing it at this stage, instead of during inference in runtime, can be beneficial in terms of performance and memory consumption. -To set up static shapes, model conversion API provides the ``input_shape`` parameter. -For more information on input shapes under runtime, refer to the :doc:`Changing input shapes <../../../../openvino-workflow/running-inference/changing-input-shape>` guide. -To learn more about dynamic shapes in runtime, refer to the :doc:`Dynamic Shapes <../../../../openvino-workflow/running-inference/dynamic-shapes>` guide. - -The OpenVINO Runtime API may present certain limitations in inferring models with undefined dimensions on some hardware. -In this case, the ``input_shape`` parameter and the :doc:`reshape method <../../../../openvino-workflow/running-inference/changing-input-shape>` can help to resolve undefined dimensions. - -For example, run model conversion for the TensorFlow MobileNet model with the single input -and specify the input shape of ``[2,300,300,3]``: - -.. tab-set:: - - .. tab-item:: Python - :sync: py - - .. code-block:: py - :force: - - from openvino.tools.mo import convert_model - ov_model = convert_model("MobileNet.pb", input_shape=[2,300,300,3]) - - .. tab-item:: CLI - :sync: cli - - .. code-block:: sh - - mo --input_model MobileNet.pb --input_shape [2,300,300,3] - - -If a model has multiple inputs, ``input_shape`` must be used in conjunction with ``input`` parameter. -The ``input`` parameter contains a list of input names, for which shapes in the same order are defined via ``input_shape``. -For example, launch model conversion for the ONNX OCR model with a pair of inputs ``data`` and ``seq_len`` -and specify shapes ``[3,150,200,1]`` and ``[3]`` for them: - -.. tab-set:: - - .. tab-item:: Python - :sync: py - - .. code-block:: py - :force: - - from openvino.tools.mo import convert_model - ov_model = convert_model("ocr.onnx", input=["data","seq_len"], input_shape=[[3,150,200,1],[3]]) - - .. tab-item:: CLI - :sync: cli - - .. code-block:: sh - - mo --input_model ocr.onnx --input data,seq_len --input_shape [3,150,200,1],[3] - - -Alternatively, specify input shapes, using the ``input`` parameter as follows: - -.. tab-set:: - - .. tab-item:: Python - :sync: py - - .. code-block:: py - :force: - - from openvino.tools.mo import convert_model - ov_model = convert_model("ocr.onnx", input=[("data",[3,150,200,1]),("seq_len",[3])]) - - .. tab-item:: CLI - :sync: cli - - .. code-block:: sh - - mo --input_model ocr.onnx --input data[3,150,200,1],seq_len[3] - - -The ``input_shape`` parameter allows overriding original input shapes to ones compatible with a given model. -Dynamic shapes, i.e. with dynamic dimensions, can be replaced in the original model with static shapes for the converted model, and vice versa. -The dynamic dimension can be marked in model conversion API parameter as ``-1`` or ``?``. -For example, launch model conversion for the ONNX OCR model and specify dynamic batch dimension for inputs: - -.. tab-set:: - - .. tab-item:: Python - :sync: py - - .. code-block:: py - :force: - - from openvino.tools.mo import convert_model - ov_model = convert_model("ocr.onnx", input=["data","seq_len"], input_shape=[[-1,150,200,1],[-1]] - - .. tab-item:: CLI - :sync: cli - - .. 
code-block:: sh - - mo --input_model ocr.onnx --input data,seq_len --input_shape [-1,150,200,1],[-1] - - -To optimize memory consumption for models with undefined dimensions at runtime, model conversion API provides the capability to define boundaries of dimensions. -The boundaries of an undefined dimension can be specified with an ellipsis. -For example, launch model conversion for the ONNX OCR model and specify a boundary for the batch dimension: - -.. tab-set:: - - .. tab-item:: Python - :sync: py - - .. code-block:: py - :force: - - from openvino.tools.mo import convert_model - from openvino.runtime import Dimension - ov_model = convert_model("ocr.onnx", input=["data","seq_len"], input_shape=[[Dimension(1,3),150,200,1],[Dimension(1,3)]]) - - .. tab-item:: CLI - :sync: cli - - .. code-block:: sh - - mo --input_model ocr.onnx --input data,seq_len --input_shape [1..3,150,200,1],[1..3] - - -In practice, some models are not ready for a change of input shapes. -In this case, a new input shape cannot be set via model conversion API. -For more information about shapes, follow the :doc:`inference troubleshooting <[legacy]-troubleshooting-reshape-errors>` -and :ref:`ways to relax shape inference flow ` guides. - -Additional Resources -#################### - -* :doc:`Convert a Model <../legacy-conversion-api>` -* :doc:`Cutting Off Parts of a Model <[legacy]-cutting-parts-of-a-model>` - diff --git a/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-conversion-api/[legacy]-supported-model-formats.rst b/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-conversion-api/[legacy]-supported-model-formats.rst deleted file mode 100644 index fb9f41c755d4fb..00000000000000 --- a/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-conversion-api/[legacy]-supported-model-formats.rst +++ /dev/null @@ -1,598 +0,0 @@ -[LEGACY] Supported Model Formats -===================================== - - -.. danger:: - - The code described here has been **deprecated!** Do not use it to avoid working with a legacy solution. It will be kept for some time to ensure backwards compatibility, but **you should not use** it in contemporary applications. - - This guide describes a deprecated conversion method. The guide on the new and recommended method can be found in the :doc:`Supported Model Formats <../../../../openvino-workflow/model-preparation>` article. - -.. toctree:: - :maxdepth: 1 - :hidden: - - Converting a TensorFlow Model <[legacy]-supported-model-formats/[legacy]-convert-tensorflow> - Converting an ONNX Model <[legacy]-supported-model-formats/[legacy]-convert-onnx> - Converting a PyTorch Model <[legacy]-supported-model-formats/[legacy]-convert-pytorch> - Converting a TensorFlow Lite Model <[legacy]-supported-model-formats/[legacy]-convert-tensorflow-lite> - Converting a PaddlePaddle Model <[legacy]-supported-model-formats/[legacy]-convert-paddle> - Model Conversion Tutorials <[legacy]-supported-model-formats/[legacy]-conversion-tutorials> - -.. meta:: - :description: Learn about supported model formats and the methods used to convert, read, and compile them in OpenVINO™. - - -**OpenVINO IR (Intermediate Representation)** - the proprietary and default format of OpenVINO, benefiting from the full extent of its features. All other supported model formats, as listed below, are converted to :doc:`OpenVINO IR <../../../openvino-ir-format>` to enable inference.
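As a brief illustration (using the same deprecated ``convert_model`` API that this page describes; the input file name is a placeholder), a converted model can be saved as an IR pair on disk for later reuse:

.. code-block:: py
   :force:

   from openvino.runtime import serialize
   from openvino.tools.mo import convert_model

   # Convert the original model to an in-memory ov.Model, then write IR (.xml + .bin)
   ov_model = convert_model("<INPUT_MODEL>.onnx")
   serialize(ov_model, "model.xml")
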
Consider storing your model in this format to minimize first-inference latency, perform model optimization, and, in some cases, save space on your drive. - -**PyTorch, TensorFlow, ONNX, and PaddlePaddle** - can be used with OpenVINO Runtime API directly, -which means you do not need to save them as OpenVINO IR before including them in your application. -OpenVINO can read, compile, and convert them automatically, as part of its pipeline. - -In the Python API, these options are provided as three separate methods: -``read_model()``, ``compile_model()``, and ``convert_model()``. -The ``convert_model()`` method enables you to perform additional adjustments -to the model, such as setting shapes, changing model input types or layouts, -cutting parts of the model, freezing inputs, etc. For a detailed description -of the conversion process, see the -:doc:`model conversion guide <../legacy-conversion-api>`. - -Here are code examples of how to use these methods with different model formats: - -.. tab-set:: - - .. tab-item:: PyTorch - :sync: torch - - .. tab-set:: - - .. tab-item:: Python - :sync: py - - * The ``convert_model()`` method: - - This is the only method applicable to PyTorch models. - - .. dropdown:: List of supported formats: - - * **Python objects**: - - * ``torch.nn.Module`` - * ``torch.jit.ScriptModule`` - * ``torch.jit.ScriptFunction`` - - .. code-block:: py - :force: - - import openvino - import torchvision - from openvino.tools.mo import convert_model - core = openvino.Core() - - model = torchvision.models.resnet50(weights='DEFAULT') - ov_model = convert_model(model) - compiled_model = core.compile_model(ov_model, "AUTO") - - For more details on conversion, refer to the - :doc:`guide <[legacy]-supported-model-formats/[legacy]-convert-pytorch>` - and an example `tutorial `__ - on this topic. - - .. tab-item:: TensorFlow - :sync: tf - - .. tab-set:: - - .. tab-item:: Python - :sync: py - - * The ``convert_model()`` method: - - When you use the ``convert_model()`` method, you have more control and you can specify additional adjustments for ``ov.Model``. The ``read_model()`` and ``compile_model()`` methods are easier to use, however, they do not have such capabilities. With ``ov.Model`` you can choose to optimize, compile and run inference on it or serialize it into a file for subsequent use. - - .. dropdown:: List of supported formats: - - * **Files**: - - * SavedModel - ```` or ``.pb`` - * Checkpoint - ``.pb`` or ``.pbtxt`` - * MetaGraph - ``.meta`` - - * **Python objects**: - - * ``tf.keras.Model`` - * ``tf.keras.layers.Layer`` - * ``tf.Module`` - * ``tf.compat.v1.Graph`` - * ``tf.compat.v1.GraphDef`` - * ``tf.function`` - * ``tf.compat.v1.session`` - * ``tf.train.checkpoint`` - - .. code-block:: py - :force: - - import openvino - from openvino.tools.mo import convert_model - - core = openvino.Core() - ov_model = convert_model("saved_model.pb") - compiled_model = core.compile_model(ov_model, "AUTO") - - For more details on conversion, refer to the - :doc:`guide <[legacy]-supported-model-formats/[legacy]-convert-tensorflow>` - and an example `tutorial `__ - on this topic. - - * The ``read_model()`` and ``compile_model()`` methods: - - .. dropdown:: List of supported formats: - - * **Files**: - - * SavedModel - ```` or ``.pb`` - * Checkpoint - ``.pb`` or ``.pbtxt`` - * MetaGraph - ``.meta`` - - .. 
code-block:: py - :force: - - ov_model = read_model("saved_model.pb") - compiled_model = core.compile_model(ov_model, "AUTO") - - For a guide on how to run inference, see how to - :doc:`Integrate OpenVINO™ with Your Application <../../../../openvino-workflow/running-inference/integrate-openvino-with-your-application>`. - - .. tab-item:: C++ - :sync: cpp - - * The ``compile_model()`` method: - - .. dropdown:: List of supported formats: - - * **Files**: - - * SavedModel - ```` or ``.pb`` - * Checkpoint - ``.pb`` or ``.pbtxt`` - * MetaGraph - ``.meta`` - - .. code-block:: cpp - - ov::CompiledModel compiled_model = core.compile_model("saved_model.pb", "AUTO"); - - For a guide on how to run inference, see how to - :doc:`Integrate OpenVINO™ with Your Application <../../../../openvino-workflow/running-inference/integrate-openvino-with-your-application>`. - - .. tab-item:: C - :sync: c - - * The ``compile_model()`` method: - - .. dropdown:: List of supported formats: - - * **Files**: - - * SavedModel - ```` or ``.pb`` - * Checkpoint - ``.pb`` or ``.pbtxt`` - * MetaGraph - ``.meta`` - - .. code-block:: c - - ov_compiled_model_t* compiled_model = NULL; - ov_core_compile_model_from_file(core, "saved_model.pb", "AUTO", 0, &compiled_model); - - For a guide on how to run inference, see how to - :doc:`Integrate OpenVINO™ with Your Application <../../../../openvino-workflow/running-inference/integrate-openvino-with-your-application>`. - - .. tab-item:: CLI - :sync: cli - - You can use ``mo`` command-line tool to convert a model to IR. The obtained IR can then be read by ``read_model()`` and inferred. - - .. code-block:: sh - - mo --input_model .pb - - For details on the conversion, refer to the - :doc:`article <[legacy]-supported-model-formats/[legacy]-convert-tensorflow>`. - - .. tab-item:: TensorFlow Lite - :sync: tflite - - .. tab-set:: - - .. tab-item:: Python - :sync: py - - * The ``convert_model()`` method: - - When you use the ``convert_model()`` method, you have more control and you can specify additional adjustments for ``ov.Model``. The ``read_model()`` and ``compile_model()`` methods are easier to use, however, they do not have such capabilities. With ``ov.Model`` you can choose to optimize, compile and run inference on it or serialize it into a file for subsequent use. - - .. dropdown:: List of supported formats: - - * **Files**: - - * ``.tflite`` - - .. code-block:: py - :force: - - import openvino - from openvino.tools.mo import convert_model - - core = openvino.Core() - ov_model = convert_model(".tflite") - compiled_model = core.compile_model(ov_model, "AUTO") - - For more details on conversion, refer to the - :doc:`guide <[legacy]-supported-model-formats/[legacy]-convert-tensorflow>` - and an example `tutorial `__ - on this topic. - - - * The ``read_model()`` method: - - .. dropdown:: List of supported formats: - - * **Files**: - - * ``.tflite`` - - .. code-block:: py - :force: - - import openvino - - core = openvino.Core() - ov_model = core.read_model(".tflite") - compiled_model = core.compile_model(ov_model, "AUTO") - - * The ``compile_model()`` method: - - .. dropdown:: List of supported formats: - - * **Files**: - - * ``.tflite`` - - .. code-block:: py - :force: - - import openvino - - core = openvino.Core() - compiled_model = core.compile_model(".tflite", "AUTO") - - For a guide on how to run inference, see how to - :doc:`Integrate OpenVINO™ with Your Application <../../../../openvino-workflow/running-inference/integrate-openvino-with-your-application>`. - - - .. 
tab-item:: C++ - :sync: cpp - - * The ``compile_model()`` method: - - .. dropdown:: List of supported formats: - - * **Files**: - - * ``.tflite`` - - .. code-block:: cpp - - ov::CompiledModel compiled_model = core.compile_model(".tflite", "AUTO"); - - For a guide on how to run inference, see how to - :doc:`Integrate OpenVINO™ with Your Application <../../../../openvino-workflow/running-inference/integrate-openvino-with-your-application>`. - - .. tab-item:: C - :sync: c - - * The ``compile_model()`` method: - - .. dropdown:: List of supported formats: - - * **Files**: - - * ``.tflite`` - - .. code-block:: c - - ov_compiled_model_t* compiled_model = NULL; - ov_core_compile_model_from_file(core, ".tflite", "AUTO", 0, &compiled_model); - - For a guide on how to run inference, see how to - :doc:`Integrate OpenVINO™ with Your Application <../../../../openvino-workflow/running-inference/integrate-openvino-with-your-application>`. - - .. tab-item:: CLI - :sync: cli - - * The ``convert_model()`` method: - - You can use ``mo`` command-line tool to convert a model to IR. The obtained IR can then be read by ``read_model()`` and inferred. - - .. dropdown:: List of supported formats: - - * **Files**: - - * ``.tflite`` - - .. code-block:: sh - - mo --input_model .tflite - - For details on the conversion, refer to the - :doc:`article <[legacy]-supported-model-formats/[legacy]-convert-tensorflow-lite>`. - - .. tab-item:: ONNX - :sync: onnx - - .. tab-set:: - - .. tab-item:: Python - :sync: py - - * The ``convert_model()`` method: - - When you use the ``convert_model()`` method, you have more control and you can specify additional adjustments for ``ov.Model``. The ``read_model()`` and ``compile_model()`` methods are easier to use, however, they do not have such capabilities. With ``ov.Model`` you can choose to optimize, compile and run inference on it or serialize it into a file for subsequent use. - - .. dropdown:: List of supported formats: - - * **Files**: - - * ``.onnx`` - - .. code-block:: py - :force: - - import openvino - from openvino.tools.mo import convert_model - - core = openvino.Core() - ov_model = convert_model(".onnx") - compiled_model = core.compile_model(ov_model, "AUTO") - - For more details on conversion, refer to the - :doc:`guide <[legacy]-supported-model-formats/[legacy]-convert-onnx>` - and an example `tutorial `__ - on this topic. - - - * The ``read_model()`` method: - - .. dropdown:: List of supported formats: - - * **Files**: - - * ``.onnx`` - - .. code-block:: py - :force: - - import openvino - core = openvino.Core() - - ov_model = core.read_model(".onnx") - compiled_model = core.compile_model(ov_model, "AUTO") - - * The ``compile_model()`` method: - - .. dropdown:: List of supported formats: - - * **Files**: - - * ``.onnx`` - - .. code-block:: py - :force: - - import openvino - core = openvino.Core() - - compiled_model = core.compile_model(".onnx", "AUTO") - - For a guide on how to run inference, see how to :doc:`Integrate OpenVINO™ with Your Application <../../../../openvino-workflow/running-inference/integrate-openvino-with-your-application>`. - - - .. tab-item:: C++ - :sync: cpp - - * The ``compile_model()`` method: - - .. dropdown:: List of supported formats: - - * **Files**: - - * ``.onnx`` - - .. 
code-block:: cpp - - ov::CompiledModel compiled_model = core.compile_model(".onnx", "AUTO"); - - For a guide on how to run inference, see how to :doc:`Integrate OpenVINO™ with Your Application <../../../../openvino-workflow/running-inference/integrate-openvino-with-your-application>`. - - .. tab-item:: C - :sync: c - - * The ``compile_model()`` method: - - .. dropdown:: List of supported formats: - - * **Files**: - - * ``.onnx`` - - .. code-block:: c - - ov_compiled_model_t* compiled_model = NULL; - ov_core_compile_model_from_file(core, ".onnx", "AUTO", 0, &compiled_model); - - For details on the conversion, refer to the :doc:`article <[legacy]-supported-model-formats/[legacy]-convert-onnx>` - - .. tab-item:: CLI - :sync: cli - - * The ``convert_model()`` method: - - You can use ``mo`` command-line tool to convert a model to IR. The obtained IR can then be read by ``read_model()`` and inferred. - - .. dropdown:: List of supported formats: - - * **Files**: - - * ``.onnx`` - - .. code-block:: sh - - mo --input_model .onnx - - For details on the conversion, refer to the - :doc:`article <[legacy]-supported-model-formats/[legacy]-convert-onnx>` - - .. tab-item:: PaddlePaddle - :sync: pdpd - - .. tab-set:: - - .. tab-item:: Python - :sync: py - - * The ``convert_model()`` method: - - When you use the ``convert_model()`` method, you have more control and you can specify additional adjustments for ``ov.Model``. The ``read_model()`` and ``compile_model()`` methods are easier to use, however, they do not have such capabilities. With ``ov.Model`` you can choose to optimize, compile and run inference on it or serialize it into a file for subsequent use. - - .. dropdown:: List of supported formats: - - * **Files**: - - * ``.pdmodel`` - - * **Python objects**: - - * ``paddle.hapi.model.Model`` - * ``paddle.fluid.dygraph.layers.Layer`` - * ``paddle.fluid.executor.Executor`` - - .. code-block:: py - :force: - - import openvino - from openvino.tools.mo import convert_model - - core = openvino.Core() - ov_model = convert_model(".pdmodel") - compiled_model = core.compile_model(ov_model, "AUTO") - - For more details on conversion, refer to the - :doc:`guide <[legacy]-supported-model-formats/[legacy]-convert-paddle>` - and an example `tutorial `__ - on this topic. - - * The ``read_model()`` method: - - .. dropdown:: List of supported formats: - - * **Files**: - - * ``.pdmodel`` - - .. code-block:: py - :force: - - import openvino - core = openvino.Core() - - ov_model = read_model(".pdmodel") - compiled_model = core.compile_model(ov_model, "AUTO") - - * The ``compile_model()`` method: - - .. dropdown:: List of supported formats: - - * **Files**: - - * ``.pdmodel`` - - .. code-block:: py - :force: - - import openvino - core = openvino.Core() - - compiled_model = core.compile_model(".pdmodel", "AUTO") - - For a guide on how to run inference, see how to - :doc:`Integrate OpenVINO™ with Your Application <../../../../openvino-workflow/running-inference/integrate-openvino-with-your-application>`. - - .. tab-item:: C++ - :sync: cpp - - * The ``compile_model()`` method: - - .. dropdown:: List of supported formats: - - * **Files**: - - * ``.pdmodel`` - - .. code-block:: cpp - - ov::CompiledModel compiled_model = core.compile_model(".pdmodel", "AUTO"); - - For a guide on how to run inference, see how to - :doc:`Integrate OpenVINO™ with Your Application <../../../../openvino-workflow/running-inference/integrate-openvino-with-your-application>`. - - .. tab-item:: C - :sync: c - - * The ``compile_model()`` method: - - .. 
dropdown:: List of supported formats: - - * **Files**: - - * ``.pdmodel`` - - .. code-block:: c - - ov_compiled_model_t* compiled_model = NULL; - ov_core_compile_model_from_file(core, ".pdmodel", "AUTO", 0, &compiled_model); - - For a guide on how to run inference, see how to - :doc:`Integrate OpenVINO™ with Your Application <../../../../openvino-workflow/running-inference/integrate-openvino-with-your-application>`. - - .. tab-item:: CLI - :sync: cli - - * The ``convert_model()`` method: - - You can use ``mo`` command-line tool to convert a model to IR. The obtained IR can then be read by ``read_model()`` and inferred. - - .. dropdown:: List of supported formats: - - * **Files**: - - * ``.pdmodel`` - - .. code-block:: sh - - mo --input_model .pdmodel - - For details on the conversion, refer to the - :doc:`article <[legacy]-supported-model-formats/[legacy]-convert-paddle>`. - - -As OpenVINO support for **MXNet, Caffe, and Kaldi formats** has been **discontinued**, converting these legacy formats -to OpenVINO IR or ONNX before running inference should be considered the default path for use with OpenVINO. - -.. note:: - - If you want to keep working with the legacy formats the old way, refer to a previous - `OpenVINO LTS version and its documentation `__ . - - OpenVINO versions of 2023 are mostly compatible with the old instructions, - through a deprecated MO tool, installed with the deprecated OpenVINO Developer Tools package. - - `OpenVINO 2023.0 `__ is the last - release officially supporting the MO conversion process for the legacy formats. - - diff --git a/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-conversion-api/[legacy]-supported-model-formats/[legacy]-conversion-tutorials.rst b/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-conversion-api/[legacy]-supported-model-formats/[legacy]-conversion-tutorials.rst deleted file mode 100644 index 5fbe486a20960a..00000000000000 --- a/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-conversion-api/[legacy]-supported-model-formats/[legacy]-conversion-tutorials.rst +++ /dev/null @@ -1,59 +0,0 @@ -[LEGACY] Model Conversion Tutorials -==================================================== - - -.. 
toctree:: - :maxdepth: 1 - :hidden: - - [legacy]-conversion-tutorials/convert-tensorflow-attention-ocr - [legacy]-conversion-tutorials/convert-tensorflow-bert - [legacy]-conversion-tutorials/convert-tensorflow-crnn - [legacy]-conversion-tutorials/convert-tensorflow-deep-speech - [legacy]-conversion-tutorials/convert-tensorflow-efficient-det - [legacy]-conversion-tutorials/convert-tensorflow-face-net - [legacy]-conversion-tutorials/convert-tensorflow-gnmt - [legacy]-conversion-tutorials/convert-tensorflow-language-1b - [legacy]-conversion-tutorials/convert-tensorflow-ncf - [legacy]-conversion-tutorials/convert-tensorflow-object-detection - [legacy]-conversion-tutorials/convert-tensorflow-retina-net - [legacy]-conversion-tutorials/convert-tensorflow-slim-library - [legacy]-conversion-tutorials/convert-tensorflow-wide-and-deep-family - [legacy]-conversion-tutorials/convert-tensorflow-xlnet - [legacy]-conversion-tutorials/convert-tensorflow-yolo - [legacy]-conversion-tutorials/convert-onnx-faster-r-cnn - [legacy]-conversion-tutorials/convert-onnx-gpt-2 - [legacy]-conversion-tutorials/convert-onnx-mask-r-cnn - [legacy]-conversion-tutorials/convert-pytorch-bert-ner - [legacy]-conversion-tutorials/convert-pytorch-cascade-rcnn-r-101 - [legacy]-conversion-tutorials/convert-pytorch-f3-net - [legacy]-conversion-tutorials/convert-pytorch-quartz-net - [legacy]-conversion-tutorials/convert-pytorch-rcan - [legacy]-conversion-tutorials/convert-pytorch-rnn-t - [legacy]-conversion-tutorials/convert-pytorch-yolact - - -.. meta:: - :description: Get to know conversion methods for specific TensorFlow, ONNX, and PyTorch models. - - -.. danger:: - - The code described in the tutorials has been **deprecated!** Do not use it to avoid working with a legacy solution. It will be kept for some time to ensure backwards compatibility, but **you should not use** it in contemporary applications. - - This guide describes a deprecated conversion method. The guide on the new and recommended method can be found in the :doc:`Python tutorials <../../../../../learn-openvino/interactive-tutorials-python>`. - -This section provides a set of tutorials that demonstrate conversion methods for specific -TensorFlow, ONNX, and PyTorch models. Note that these instructions do not cover all use -cases and may not reflect your particular needs. -Before studying the tutorials, try to convert the model out-of-the-box by specifying only the -``--input_model`` parameter in the command line. - -.. note:: - - Apache MXNet, Caffe, and Kaldi are no longer directly supported by OpenVINO. - -You will find a collection of :doc:`Python tutorials <../../../../../learn-openvino/interactive-tutorials-python>` written for running on Jupyter notebooks -that provide an introduction to the OpenVINO™ toolkit and explain how to use the Python API and tools for -optimized deep learning inference. 
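For reference, the out-of-the-box attempt mentioned above comes down to a single call of the deprecated ``mo`` tool (the model file name is a placeholder):

.. code-block:: sh

   mo --input_model <your_model>.onnx
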
- diff --git a/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-conversion-api/[legacy]-supported-model-formats/[legacy]-conversion-tutorials/convert-onnx-faster-r-cnn.rst b/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-conversion-api/[legacy]-supported-model-formats/[legacy]-conversion-tutorials/convert-onnx-faster-r-cnn.rst deleted file mode 100644 index 7880b261c80b81..00000000000000 --- a/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-conversion-api/[legacy]-supported-model-formats/[legacy]-conversion-tutorials/convert-onnx-faster-r-cnn.rst +++ /dev/null @@ -1,41 +0,0 @@ -Converting an ONNX Faster R-CNN Model -===================================== - - -.. meta:: - :description: Learn how to convert a Faster R-CNN model - from ONNX to the OpenVINO Intermediate Representation. - -.. danger:: - - The code described here has been **deprecated!** Do not use it to avoid working with a legacy solution. It will be kept for some time to ensure backwards compatibility, but **you should not use** it in contemporary applications. - - This guide describes a deprecated conversion method. The guide on the new and recommended method can be found in the :doc:`Python tutorials <../../../../../../learn-openvino/interactive-tutorials-python>`. - -The instructions below are applicable **only** to the Faster R-CNN model converted to the ONNX file format from the `maskrcnn-benchmark model `__: - -1. Download the pretrained model file from `onnx/models `__ (commit-SHA: 8883e49e68de7b43e263d56b9ed156dfa1e03117). - -2. Generate the Intermediate Representation of the model, by changing your current working directory to the model conversion API installation directory, and running model conversion with the following parameters: - - .. code-block:: sh - - mo \ - --input_model FasterRCNN-10.onnx \ - --input_shape [1,3,800,800] \ - --input 0:2 \ - --mean_values [102.9801,115.9465,122.7717] \ - --transformations_config front/onnx/faster_rcnn.json - - - Be aware that the height and width specified with the ``input_shape`` command line parameter - could be different. For more information about supported input image dimensions and - required pre- and post-processing steps, refer to the - `Faster R-CNN article `__. - -3. Interpret the outputs of the generated IR: class indices, probabilities and box coordinates. Below are the outputs from the ``DetectionOutput`` layer: - - * class indices - * probabilities - * box coordinates - diff --git a/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-conversion-api/[legacy]-supported-model-formats/[legacy]-conversion-tutorials/convert-onnx-gpt-2.rst b/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-conversion-api/[legacy]-supported-model-formats/[legacy]-conversion-tutorials/convert-onnx-gpt-2.rst deleted file mode 100644 index 4c10c941c7fb47..00000000000000 --- a/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-conversion-api/[legacy]-supported-model-formats/[legacy]-conversion-tutorials/convert-onnx-gpt-2.rst +++ /dev/null @@ -1,34 +0,0 @@ -Converting an ONNX GPT-2 Model -============================== - - -.. meta:: - :description: Learn how to convert a pre-trained GPT-2 - model from ONNX to the OpenVINO Intermediate Representation. - -.. 
danger:: - - The code described here has been **deprecated!** Do not use it to avoid working with a legacy solution. It will be kept for some time to ensure backwards compatibility, but **you should not use** it in contemporary applications. - - This guide describes a deprecated conversion method. The guide on the new and recommended method can be found in the :doc:`Python tutorials <../../../../../../learn-openvino/interactive-tutorials-python>`. - -`Public pre-trained GPT-2 model `__ is a large -transformer-based language model with a simple objective: predict the next word, given all of the previous words within some text. - -Downloading the Pre-Trained Base GPT-2 Model -############################################ - -To download the model, go to `this model `__, and press **Download**. - -To download the model and sample test data, go to `this model `__, and press **Download**. - -Converting an ONNX GPT-2 Model to IR -#################################### - -Generate the Intermediate Representation of the model GPT-2 by running model conversion with the following parameters: - -.. code-block:: sh - - mo --input_model gpt2-10.onnx --input_shape [X,Y,Z] --output_dir - - diff --git a/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-conversion-api/[legacy]-supported-model-formats/[legacy]-conversion-tutorials/convert-onnx-mask-r-cnn.rst b/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-conversion-api/[legacy]-supported-model-formats/[legacy]-conversion-tutorials/convert-onnx-mask-r-cnn.rst deleted file mode 100644 index 6158f5bdcb59ed..00000000000000 --- a/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-conversion-api/[legacy]-supported-model-formats/[legacy]-conversion-tutorials/convert-onnx-mask-r-cnn.rst +++ /dev/null @@ -1,41 +0,0 @@ -Converting an ONNX Mask R-CNN Model -=================================== - - -.. meta:: - :description: Learn how to convert a pre-trained Mask - R-CNN model from ONNX to the OpenVINO Intermediate Representation. - -.. danger:: - - The code described here has been **deprecated!** Do not use it to avoid working with a legacy solution. It will be kept for some time to ensure backwards compatibility, but **you should not use** it in contemporary applications. - - This guide describes a deprecated conversion method. The guide on the new and recommended method can be found in the :doc:`Python tutorials <../../../../../../learn-openvino/interactive-tutorials-python>`. - -The instructions below are applicable **only** to the Mask R-CNN model converted to the ONNX file format from the `maskrcnn-benchmark model `__. - -1. Download the pretrained model file from `onnx/models `__ (commit-SHA: 8883e49e68de7b43e263d56b9ed156dfa1e03117). - -2. Generate the Intermediate Representation of the model by changing your current working directory to the model conversion API installation directory and running model conversion with the following parameters: - - .. code-block:: sh - - mo \ - --input_model mask_rcnn_R_50_FPN_1x.onnx \ - --input "0:2" \ - --input_shape [1,3,800,800] \ - --mean_values [102.9801,115.9465,122.7717] \ - --transformations_config front/onnx/mask_rcnn.json - - - Be aware that the height and width specified with the ``input_shape`` command line parameter could be different. For more information about supported input image dimensions and required pre- and post-processing steps, refer to the `documentation `__. - -3. 
Interpret the outputs of the generated IR file: masks, class indices, probabilities and box coordinates: - - * masks - * class indices - * probabilities - * box coordinates - -The first one is a layer with the name ``6849/sink_port_0``, and rest are outputs from the ``DetectionOutput`` layer. - diff --git a/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-conversion-api/[legacy]-supported-model-formats/[legacy]-conversion-tutorials/convert-pytorch-bert-ner.rst b/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-conversion-api/[legacy]-supported-model-formats/[legacy]-conversion-tutorials/convert-pytorch-bert-ner.rst deleted file mode 100644 index e89d21f28c66c4..00000000000000 --- a/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-conversion-api/[legacy]-supported-model-formats/[legacy]-conversion-tutorials/convert-pytorch-bert-ner.rst +++ /dev/null @@ -1,76 +0,0 @@ -Converting a PyTorch BERT-NER Model -=================================== - - -.. meta:: - :description: Learn how to convert a BERT-NER model - from PyTorch to the OpenVINO Intermediate Representation. - -.. danger:: - - The code described here has been **deprecated!** Do not use it to avoid working with a legacy solution. It will be kept for some time to ensure backwards compatibility, but **you should not use** it in contemporary applications. - - This guide describes a deprecated conversion method. The guide on the new and recommended method can be found in the :doc:`Python tutorials <../../../../../../learn-openvino/interactive-tutorials-python>`. - -The goal of this article is to present a step-by-step guide on how to convert PyTorch BERT-NER model to OpenVINO IR. First, you need to download the model and convert it to ONNX. - - -Downloading and Converting the Model to ONNX -############################################ - -To download a pretrained model or train the model yourself, refer -to the `instructions `__ in the -BERT-NER model repository. The model with configuration files is stored in the ``out_base`` directory. - -To convert the model to ONNX format, create and run the following script in the root -directory of the model repository. If you download the pretrained model, you need -to download `bert.py `__ to run the script. -The instructions were tested with the commit-SHA: ``e5be564156f194f1becb0d82aeaf6e762d9eb9ed``. - -.. 
code-block:: py - :force: - - import torch - - from bert import Ner - - ner = Ner("out_base") - - input_ids, input_mask, segment_ids, valid_positions = ner.preprocess('Steve went to Paris') - input_ids = torch.tensor([input_ids], dtype=torch.long, device=ner.device) - input_mask = torch.tensor([input_mask], dtype=torch.long, device=ner.device) - segment_ids = torch.tensor([segment_ids], dtype=torch.long, device=ner.device) - valid_ids = torch.tensor([valid_positions], dtype=torch.long, device=ner.device) - - ner_model, tknizr, model_config = ner.load_model("out_base") - - with torch.no_grad(): - logits = ner_model(input_ids, segment_ids, input_mask, valid_ids) - torch.onnx.export(ner_model, - (input_ids, segment_ids, input_mask, valid_ids), - "bert-ner.onnx", - input_names=['input_ids', 'segment_ids', 'input_mask', 'valid_ids'], - output_names=['output'], - dynamic_axes={ - "input_ids": {0: "batch_size"}, - "segment_ids": {0: "batch_size"}, - "input_mask": {0: "batch_size"}, - "valid_ids": {0: "batch_size"}, - "output": {0: "output"} - }, - opset_version=11, - ) - - -The script generates ONNX model file ``bert-ner.onnx``. - -Converting an ONNX BERT-NER model to IR -####################################### - -.. code-block:: sh - - mo --input_model bert-ner.onnx --input "input_mask[1,128],segment_ids[1,128],input_ids[1,128]" - - -where ``1`` is ``batch_size`` and ``128`` is ``sequence_length``. - diff --git a/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-conversion-api/[legacy]-supported-model-formats/[legacy]-conversion-tutorials/convert-pytorch-cascade-rcnn-r-101.rst b/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-conversion-api/[legacy]-supported-model-formats/[legacy]-conversion-tutorials/convert-pytorch-cascade-rcnn-r-101.rst deleted file mode 100644 index a61ca5e79f1c30..00000000000000 --- a/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-conversion-api/[legacy]-supported-model-formats/[legacy]-conversion-tutorials/convert-pytorch-cascade-rcnn-r-101.rst +++ /dev/null @@ -1,51 +0,0 @@ -Converting a PyTorch Cascade RCNN R-101 Model -============================================= - - -.. meta:: - :description: Learn how to convert a Cascade RCNN R-101 - model from PyTorch to the OpenVINO Intermediate Representation. - - -.. danger:: - - The code described here has been **deprecated!** Do not use it to avoid working with a legacy solution. It will be kept for some time to ensure backwards compatibility, but **you should not use** it in contemporary applications. - - This guide describes a deprecated conversion method. The guide on the new and recommended method can be found in the :doc:`Python tutorials <../../../../../../learn-openvino/interactive-tutorials-python>`. - -The goal of this article is to present a step-by-step guide on how to convert a PyTorch Cascade RCNN R-101 model to OpenVINO IR. First, you need to download the model and convert it to ONNX. - -Downloading and Converting Model to ONNX -######################################## - -* Clone the `repository `__ : - - .. code-block:: sh - - git clone https://github.com/open-mmlab/mmdetection - cd mmdetection - - - .. note:: - - To set up an environment, refer to the `instructions `__. - -* Download the pre-trained `model `__. The model is also available `here `__. - -* To convert the model to ONNX format, use this `script `__. - - .. 
code-block:: sh - - python3 tools/deployment/pytorch2onnx.py configs/cascade_rcnn/cascade_rcnn_r101_fpn_1x_coco.py cascade_rcnn_r101_fpn_1x_coco_20200317-0b6a2fbf.pth --output-file cascade_rcnn_r101_fpn_1x_coco.onnx - - -The script generates ONNX model file ``cascade_rcnn_r101_fpn_1x_coco.onnx`` in the directory ``tools/deployment/``. If required, specify the model name or output directory, using ``--output-file /.onnx``. - -Converting an ONNX Cascade RCNN R-101 Model to OpenVINO IR -########################################################## - -.. code-block:: sh - - mo --input_model cascade_rcnn_r101_fpn_1x_coco.onnx --mean_values [123.675,116.28,103.53] --scale_values [58.395,57.12,57.375] - - diff --git a/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-conversion-api/[legacy]-supported-model-formats/[legacy]-conversion-tutorials/convert-pytorch-f3-net.rst b/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-conversion-api/[legacy]-supported-model-formats/[legacy]-conversion-tutorials/convert-pytorch-f3-net.rst deleted file mode 100644 index d1391cfb1519ba..00000000000000 --- a/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-conversion-api/[legacy]-supported-model-formats/[legacy]-conversion-tutorials/convert-pytorch-f3-net.rst +++ /dev/null @@ -1,55 +0,0 @@ -Converting a PyTorch F3Net Model -================================ - - -.. meta:: - :description: Learn how to convert a F3Net model - from PyTorch to the OpenVINO Intermediate Representation. - -.. danger:: - - The code described here has been **deprecated!** Do not use it to avoid working with a legacy solution. It will be kept for some time to ensure backwards compatibility, but **you should not use** it in contemporary applications. - - This guide describes a deprecated conversion method. The guide on the new and recommended method can be found in the :doc:`Python tutorials <../../../../../../learn-openvino/interactive-tutorials-python>`. - -`F3Net `__ : Fusion, Feedback and Focus for Salient Object Detection - -Cloning the F3Net Repository -############################ - -To clone the repository, run the following command: - -.. code-block:: sh - - git clone http://github.com/weijun88/F3Net.git - - -Downloading and Converting the Model to ONNX -############################################ - -To download the pretrained model or train the model yourself, refer to the -`instructions `__ in the F3Net model repository. First, convert the model to ONNX format. Create and run the following Python script in the ``src`` directory of the model repository: - -.. code-block:: py - :force: - - import torch - from dataset import Config - from net import F3Net - - cfg = Config(mode='test', snapshot=) - net = F3Net(cfg) - image = torch.zeros([1, 3, 352, 352]) - torch.onnx.export(net, image, 'f3net.onnx', export_params=True, do_constant_folding=True, opset_version=11) - - -The script generates the ONNX model file ``f3net.onnx``. The model conversion was tested with the commit-SHA: ``eecace3adf1e8946b571a4f4397681252f9dc1b8``. - -Converting an ONNX F3Net Model to IR -#################################### - -.. 
code-block:: sh - - mo --input_model /f3net.onnx - - diff --git a/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-conversion-api/[legacy]-supported-model-formats/[legacy]-conversion-tutorials/convert-pytorch-quartz-net.rst b/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-conversion-api/[legacy]-supported-model-formats/[legacy]-conversion-tutorials/convert-pytorch-quartz-net.rst deleted file mode 100644 index f1ee885dae0b26..00000000000000 --- a/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-conversion-api/[legacy]-supported-model-formats/[legacy]-conversion-tutorials/convert-pytorch-quartz-net.rst +++ /dev/null @@ -1,61 +0,0 @@ -Converting a PyTorch QuartzNet Model -==================================== - - -.. meta:: - :description: Learn how to convert a QuartzNet model - from PyTorch to the OpenVINO Intermediate Representation. - -.. danger:: - - The code described here has been **deprecated!** Do not use it to avoid working with a legacy solution. It will be kept for some time to ensure backwards compatibility, but **you should not use** it in contemporary applications. - - This guide describes a deprecated conversion method. The guide on the new and recommended method can be found in the :doc:`Python tutorials <../../../../../../learn-openvino/interactive-tutorials-python>`. - -`NeMo project `__ provides the QuartzNet model. - -Downloading the Pre-trained QuartzNet Model -########################################### - -To download the pre-trained model, refer to the `NeMo Speech Models Catalog `__. -Here are the instructions on how to obtain QuartzNet in ONNX format. - -1. Install the NeMo toolkit, using the `instructions `__. - -2. Run the following code: - - .. code-block:: py - :force: - - import nemo - import nemo.collections.asr as nemo_asr - - quartznet = nemo_asr.models.EncDecCTCModel.from_pretrained(model_name="QuartzNet15x5Base-En") - # Export QuartzNet model to ONNX format - quartznet.decoder.export('decoder_qn.onnx') - quartznet.encoder.export('encoder_qn.onnx') - quartznet.export('qn.onnx') - - - This code produces 3 ONNX model files: ``encoder_qn.onnx``, ``decoder_qn.onnx``, ``qn.onnx``. - They are ``decoder``, ``encoder``, and a combined ``decoder(encoder(x))`` models, respectively. - -Converting an ONNX QuartzNet model to IR -######################################## - -If using a combined model: - -.. code-block:: sh - - mo --input_model /qt.onnx --input_shape [B,64,X] - -If using separate models: - -.. code-block:: sh - - mo --input_model /encoder_qt.onnx --input_shape [B,64,X] - mo --input_model /decoder_qt.onnx --input_shape [B,1024,Y] - - -Where shape is determined by the audio file Mel-Spectrogram length: ``B`` - batch dimension, ``X`` - dimension based on the input length, ``Y`` - determined by encoder output, usually ``X / 2``. 
- diff --git a/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-conversion-api/[legacy]-supported-model-formats/[legacy]-conversion-tutorials/convert-pytorch-rcan.rst b/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-conversion-api/[legacy]-supported-model-formats/[legacy]-conversion-tutorials/convert-pytorch-rcan.rst deleted file mode 100644 index 7e9fb7b5717cbd..00000000000000 --- a/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-conversion-api/[legacy]-supported-model-formats/[legacy]-conversion-tutorials/convert-pytorch-rcan.rst +++ /dev/null @@ -1,49 +0,0 @@ -Converting a PyTorch RCAN Model -=============================== - - -.. meta:: - :description: Learn how to convert a RCAN model - from PyTorch to the OpenVINO Intermediate Representation. - -.. danger:: - - The code described here has been **deprecated!** Do not use it to avoid working with a legacy solution. It will be kept for some time to ensure backwards compatibility, but **you should not use** it in contemporary applications. - - This guide describes a deprecated conversion method. The guide on the new and recommended method can be found in the :doc:`Python tutorials <../../../../../../learn-openvino/interactive-tutorials-python>`. - -`RCAN `__ : Image Super-Resolution Using Very Deep Residual Channel Attention Networks - -Downloading and Converting the Model to ONNX -############################################ - -To download the pre-trained model or train the model yourself, refer to the `instruction `__ in the RCAN model repository. First, convert the model to ONNX format. Create and run the script with the following content in the root -directory of the model repository: - -.. code-block:: py - :force: - - from argparse import Namespace - - import torch - - from RCAN_TestCode.code.model.rcan import RCAN - - config = Namespace(n_feats=64, n_resblocks=4, n_resgroups=2, reduction=16, scale=[2], data_train='DIV2K', res_scale=1, - n_colors=3, rgb_range=255) - net = RCAN(config) - net.eval() - dummy_input = torch.randn(1, 3, 360, 640) - torch.onnx.export(net, dummy_input, 'RCAN.onnx') - - -The script generates the ONNX model file ``RCAN.onnx``. More information about model parameters (``n_resblocks``, ``n_resgroups``, and others) and their different values can be found in the model repository. The model conversion was tested with the commit-SHA: ``3339ebc59519c3bb2b5719b87dd36515ec7f3ba7``. - -Converting an ONNX RCAN Model to IR -################################### - -.. code-block:: sh - - mo --input_model RCAN.onnx - - diff --git a/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-conversion-api/[legacy]-supported-model-formats/[legacy]-conversion-tutorials/convert-pytorch-rnn-t.rst b/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-conversion-api/[legacy]-supported-model-formats/[legacy]-conversion-tutorials/convert-pytorch-rnn-t.rst deleted file mode 100644 index ad646568aed598..00000000000000 --- a/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-conversion-api/[legacy]-supported-model-formats/[legacy]-conversion-tutorials/convert-pytorch-rnn-t.rst +++ /dev/null @@ -1,137 +0,0 @@ -Converting a PyTorch RNN-T Model -================================ - - -.. meta:: - :description: Learn how to convert a RNN-T model - from PyTorch to the OpenVINO Intermediate Representation. - -.. 
danger:: - - The code described here has been **deprecated!** Do not use it to avoid working with a legacy solution. It will be kept for some time to ensure backwards compatibility, but **you should not use** it in contemporary applications. - - This guide describes a deprecated conversion method. The guide on the new and recommended method can be found in the :doc:`Python tutorials <../../../../../../learn-openvino/interactive-tutorials-python>`. - -This guide covers conversion of RNN-T model from `MLCommons `__ repository. Follow -the instructions below to export a PyTorch model into ONNX, before converting it to IR: - -**Step 1**. Clone RNN-T PyTorch implementation from MLCommons repository (revision r1.0). Make a shallow clone to pull -only RNN-T model without full repository. If you already have a full repository, skip this and go to **Step 2**: - -.. code-block:: sh - - git clone -b r1.0 -n https://github.com/mlcommons/inference rnnt_for_openvino --depth 1 - cd rnnt_for_openvino - git checkout HEAD speech_recognition/rnnt - - -**Step 2**. If you already have a full clone of MLCommons inference repository, create a folder for -pretrained PyTorch model, where conversion into IR will take place. You will also need to specify the path to -your full clone at **Step 5**. Skip this step if you have a shallow clone. - -.. code-block:: sh - - mkdir rnnt_for_openvino - cd rnnt_for_openvino - - -**Step 3**. Download pre-trained weights for PyTorch implementation from `here `__. -For UNIX-like systems, you can use ``wget``: - -.. code-block:: sh - - wget https://zenodo.org/record/3662521/files/DistributedDataParallel_1576581068.9962234-epoch-100.pt - - -The link was taken from ``setup.sh`` in the ``speech_recoginitin/rnnt`` subfolder. You will get exactly the same weights as -if you were following the `guide `__. - -**Step 4**. Install required Python packages: - -.. code-block:: sh - - pip3 install torch toml - - -**Step 5**. Export RNN-T model into ONNX, using the script below. Copy the code below into a file named -``export_rnnt_to_onnx.py`` and run it in the current directory ``rnnt_for_openvino``: - -.. note:: - - If you already have a full clone of MLCommons inference repository, you need - to specify the ``mlcommons_inference_path`` variable. - -.. 
code-block:: py - :force: - - import toml - import torch - import sys - - - def load_and_migrate_checkpoint(ckpt_path): - checkpoint = torch.load(ckpt_path, map_location="cpu") - migrated_state_dict = {} - for key, value in checkpoint['state_dict'].items(): - key = key.replace("joint_net", "joint.net") - migrated_state_dict[key] = value - del migrated_state_dict["audio_preprocessor.featurizer.fb"] - del migrated_state_dict["audio_preprocessor.featurizer.window"] - return migrated_state_dict - - - mlcommons_inference_path = './' # specify relative path for MLCommons inferene - checkpoint_path = 'DistributedDataParallel_1576581068.9962234-epoch-100.pt' - config_toml = 'speech_recognition/rnnt/pytorch/configs/rnnt.toml' - config = toml.load(config_toml) - rnnt_vocab = config['labels']['labels'] - sys.path.insert(0, mlcommons_inference_path + 'speech_recognition/rnnt/pytorch') - - from model_separable_rnnt import RNNT - - model = RNNT(config['rnnt'], len(rnnt_vocab) + 1, feature_config=config['input_eval']) - model.load_state_dict(load_and_migrate_checkpoint(checkpoint_path)) - - seq_length, batch_size, feature_length = 157, 1, 240 - inp = torch.randn([seq_length, batch_size, feature_length]) - feature_length = torch.LongTensor([seq_length]) - x_padded, x_lens = model.encoder(inp, feature_length) - torch.onnx.export(model.encoder, (inp, feature_length), "rnnt_encoder.onnx", opset_version=12, - input_names=['input', 'feature_length'], output_names=['x_padded', 'x_lens'], - dynamic_axes={'input': {0: 'seq_len', 1: 'batch'}}) - - symbol = torch.LongTensor([[20]]) - hidden = torch.randn([2, batch_size, 320]), torch.randn([2, batch_size, 320]) - g, hidden = model.prediction.forward(symbol, hidden) - torch.onnx.export(model.prediction, (symbol, hidden), "rnnt_prediction.onnx", opset_version=12, - input_names=['symbol', 'hidden_in_1', 'hidden_in_2'], - output_names=['g', 'hidden_out_1', 'hidden_out_2'], - dynamic_axes={'symbol': {0: 'batch'}, 'hidden_in_1': {1: 'batch'}, 'hidden_in_2': {1: 'batch'}}) - - f = torch.randn([batch_size, 1, 1024]) - model.joint.forward(f, g) - torch.onnx.export(model.joint, (f, g), "rnnt_joint.onnx", opset_version=12, - input_names=['0', '1'], output_names=['result'], dynamic_axes={'0': {0: 'batch'}, '1': {0: 'batch'}}) - - -.. code-block:: sh - - python3 export_rnnt_to_onnx.py - - -After completing this step, the files ``rnnt_encoder.onnx``, ``rnnt_prediction.onnx``, and ``rnnt_joint.onnx`` will be saved in the current directory. - -**Step 6**. Run the conversion commands: - -.. code-block:: sh - - mo --input_model rnnt_encoder.onnx --input "input[157,1,240],feature_length->157" - mo --input_model rnnt_prediction.onnx --input "symbol[1,1],hidden_in_1[2,1,320],hidden_in_2[2,1,320]" - mo --input_model rnnt_joint.onnx --input "0[1,1,1024],1[1,1,320]" - - -.. note:: - - The hardcoded value for sequence length = 157 was taken from the MLCommons, but conversion to IR preserves network :doc:`reshapeability <../../../../../../openvino-workflow/running-inference/changing-input-shape>`. Therefore, input shapes can be changed manually to any value during either conversion or inference. 
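The same conversion pattern applies if a different fixed sequence length is preferred. As a sketch only (the value 200 is an arbitrary example, not a value used by MLCommons), the encoder could instead be converted with:

.. code-block:: sh

   mo --input_model rnnt_encoder.onnx --input "input[200,1,240],feature_length->200"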
- - diff --git a/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-conversion-api/[legacy]-supported-model-formats/[legacy]-conversion-tutorials/convert-pytorch-yolact.rst b/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-conversion-api/[legacy]-supported-model-formats/[legacy]-conversion-tutorials/convert-pytorch-yolact.rst deleted file mode 100644 index 0eacbd6c5b0bf9..00000000000000 --- a/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-conversion-api/[legacy]-supported-model-formats/[legacy]-conversion-tutorials/convert-pytorch-yolact.rst +++ /dev/null @@ -1,222 +0,0 @@ -Converting a PyTorch YOLACT Model -================================= - - -.. meta:: - :description: Learn how to convert a YOLACT model - from PyTorch to the OpenVINO Intermediate Representation. - - -.. danger:: - - The code described here has been **deprecated!** Do not use it to avoid working with a legacy solution. It will be kept for some time to ensure backwards compatibility, but **you should not use** it in contemporary applications. - - This guide describes a deprecated conversion method. The guide on the new and recommended method can be found in the :doc:`Python tutorials <../../../../../../learn-openvino/interactive-tutorials-python>`. - -You Only Look At CoefficienTs (YOLACT) is a simple, fully convolutional model for real-time instance segmentation. -The PyTorch implementation is publicly available in `this GitHub repository `__. -The YOLACT++ model is not supported, because it uses deformable convolutional layers that cannot be represented in ONNX format. - -.. _patch-file-yolact: - -Creating a Patch File -##################### - -Before converting the model, create a patch file for the repository. -The patch modifies the framework code by adding a special command-line argument to the framework options. The argument enables inference graph dumping: - -1. Go to a writable directory and create a ``YOLACT_onnx_export.patch`` file. -2. Copy the following diff code to the file: - - .. 
code-block:: console - - From 76deb67d4f09f29feda1a633358caa18335d9e9f Mon Sep 17 00:00:00 2001 - From: "OpenVINO" - Date: Fri, 12 Mar 2021 00:27:35 +0300 - Subject: [PATCH] Add export to ONNX - - --- - eval.py | 5 ++++- - utils/augmentations.py | 7 +++++-- - yolact.py | 29 +++++++++++++++++++---------- - 3 files changed, 28 insertions(+), 13 deletions(-) - - diff --git a/eval.py b/eval.py - index 547bc0a..bde0680 100644 - --- a/eval.py - +++ b/eval.py - @@ -593,9 +593,12 @@ def badhash(x): - return x - - def evalimage(net:Yolact, path:str, save_path:str=None): - - frame = torch.from_numpy(cv2.imread(path)).cuda().float() - + frame = torch.from_numpy(cv2.imread(path)).float() - + if torch.cuda.is_available(): - + frame = frame.cuda() - batch = FastBaseTransform()(frame.unsqueeze(0)) - preds = net(batch) - + torch.onnx.export(net, batch, "yolact.onnx", opset_version=11) - - img_numpy = prep_display(preds, frame, None, None, undo_transform=False) - - diff --git a/utils/augmentations.py b/utils/augmentations.py - index cc7a73a..2420603 100644 - --- a/utils/augmentations.py - +++ b/utils/augmentations.py - @@ -623,8 +623,11 @@ class FastBaseTransform(torch.nn.Module): - def __init__(self): - super().__init__() - - - self.mean = torch.Tensor(MEANS).float().cuda()[None, :, None, None] - - self.std = torch.Tensor( STD ).float().cuda()[None, :, None, None] - + self.mean = torch.Tensor(MEANS).float()[None, :, None, None] - + self.std = torch.Tensor( STD ).float()[None, :, None, None] - + if torch.cuda.is_available(): - + self.mean.cuda() - + self.std.cuda() - self.transform = cfg.backbone.transform - - def forward(self, img): - diff --git a/yolact.py b/yolact.py - index d83703b..f8c787c 100644 - --- a/yolact.py - +++ b/yolact.py - @@ -17,19 +17,22 @@ import torch.backends.cudnn as cudnn - from utils import timer - from utils.functions import MovingAverage, make_net - - -# This is required for Pytorch 1.0.1 on Windows to initialize Cuda on some driver versions. - -# See the bug report here: https://github.com/pytorch/pytorch/issues/17108 - -torch.cuda.current_device() - - - -# As of March 10, 2019, Pytorch DataParallel still doesn't support JIT Script Modules - -use_jit = torch.cuda.device_count() <= 1 - -if not use_jit: - - print('Multiple GPUs detected! Turning off JIT.') - +use_jit = False - - ScriptModuleWrapper = torch.jit.ScriptModule if use_jit else nn.Module - script_method_wrapper = torch.jit.script_method if use_jit else lambda fn, _rcn=None: fn - - - +def decode(loc, priors): - + variances = [0.1, 0.2] - + boxes = torch.cat((priors[:, :2] + loc[:, :, :2] * variances[0] * priors[:, 2:], priors[:, 2:] * torch.exp(loc[:, :, 2:] * variances[1])), 2) - + - + boxes_result1 = boxes[:, :, :2] - boxes[:, :, 2:] / 2 - + boxes_result2 = boxes[:, :, 2:] + boxes_result1 - + boxes_result = torch.cat((boxes_result1, boxes_result2), 2) - + - + return boxes_result - + - - class Concat(nn.Module): - def __init__(self, nets, extra_params): - @@ -476,7 +479,10 @@ class Yolact(nn.Module): - - def load_weights(self, path): - """ Loads weights from a compressed save file. 
""" - - state_dict = torch.load(path) - + if torch.cuda.is_available(): - + state_dict = torch.load(path) - + else: - + state_dict = torch.load(path, map_location=torch.device('cpu')) - - # For backward compatibility, remove these (the new variable is called layers) - for key in list(state_dict.keys()): - @@ -673,8 +679,11 @@ class Yolact(nn.Module): - else: - pred_outs['conf'] = F.softmax(pred_outs['conf'], -1) - - - return self.detect(pred_outs, self) - + pred_outs['boxes'] = decode(pred_outs['loc'], pred_outs['priors']) # decode output boxes - - + pred_outs.pop('priors') # remove unused in postprocessing layers - + pred_outs.pop('loc') # remove unused in postprocessing layers - + return pred_outs - - - - -- - - -3. Save and close the file. - -Converting a YOLACT Model to the OpenVINO IR format -################################################### - -**Step 1**. Clone the GitHub repository and check out the commit: - -1. Clone the YOLACT repository: - - .. code-block:: sh - - git clone https://github.com/dbolya/yolact - - -2. Check out the necessary commit: - - .. code-block:: sh - - git checkout 57b8f2d95e62e2e649b382f516ab41f949b57239 - - -3. Set up the environment as described in ``README.md``. - -**Step 2**. Download a pre-trained model from the list attached in the ``Evaluation`` section of ``README.md`` document, for example ``yolact_base_54_800000.pth``. - -**Step 3**. Export the model to ONNX format. - -1. Apply the `YOLACT_onnx_export.patch` patch to the repository. Refer to the :ref:`Create a Patch File ` instructions if you do not have it: - - .. code-block:: sh - - git apply /path/to/patch/YOLACT_onnx_export.patch - - -2. Evaluate the YOLACT model to export it to ONNX format: - - .. code-block:: sh - - python3 eval.py \ - --trained_model=/path/to/yolact_base_54_800000.pth \ - --score_threshold=0.3 \ - --top_k=10 \ - --image=/path/to/image.jpg \ - --cuda=False - - -3. The script may fail, but you should get ``yolact.onnx`` file. - -**Step 4**. Convert the model to the IR: - -.. code-block:: sh - - mo --input_model /path/to/yolact.onnx - - -**Step 5**. Embed input preprocessing into the IR: - -To get performance gain by offloading to the OpenVINO application of mean/scale values and RGB->BGR conversion, use the following model conversion API parameters: - -* If the backbone of the model is Resnet50-FPN or Resnet101-FPN, use the following MO command line: - - .. code-block:: sh - - mo \ - --input_model /path/to/yolact.onnx \ - --reverse_input_channels \ - --mean_values "[123.68, 116.78, 103.94]" \ - --scale_values "[58.40, 57.12, 57.38]" - - -* If the backbone of the model is Darknet53-FPN, use the following MO command line: - - .. 
code-block:: sh - - mo \ - --input_model /path/to/yolact.onnx \ - --reverse_input_channels \ - --scale 255 - - diff --git a/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-conversion-api/[legacy]-supported-model-formats/[legacy]-conversion-tutorials/convert-tensorflow-attention-ocr.rst b/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-conversion-api/[legacy]-supported-model-formats/[legacy]-conversion-tutorials/convert-tensorflow-attention-ocr.rst deleted file mode 100644 index dd419456ccbcd3..00000000000000 --- a/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-conversion-api/[legacy]-supported-model-formats/[legacy]-conversion-tutorials/convert-tensorflow-attention-ocr.rst +++ /dev/null @@ -1,60 +0,0 @@ -Converting a TensorFlow Attention OCR Model -=========================================== - - -.. meta:: - :description: Learn how to convert the Attention OCR - model from the TensorFlow Attention OCR repository to the - OpenVINO Intermediate Representation. - - -.. danger:: - - The code described here has been **deprecated!** Do not use it to avoid working with a legacy solution. It will be kept for some time to ensure backwards compatibility, but **you should not use** it in contemporary applications. - - This guide describes a deprecated conversion method. The guide on the new and recommended method can be found in the :doc:`Python tutorials <../../../../../../learn-openvino/interactive-tutorials-python>`. - -This tutorial explains how to convert the Attention OCR (AOCR) model from the `TensorFlow Attention OCR repository `__ to the Intermediate Representation (IR). - -Extracting a Model from ``aocr`` Library -######################################## - -To get an AOCR model, download ``aocr`` Python library: - -.. code-block:: sh - - pip install git+https://github.com/emedvedev/attention-ocr.git@master#egg=aocr - -This library contains a pretrained model and allows training and running AOCR, using the command line. After installation of `aocr`, extract the model: - -.. code-block:: sh - - aocr export --format=frozengraph model/path/ - -Once extracted, the model can be found in ``model/path/`` folder. - -Converting the TensorFlow AOCR Model to IR -########################################## - -The original AOCR model includes the preprocessing data, which contains: - -* Decoding input data to binary format where input data is an image represented as a string. -* Resizing binary image to working resolution. - -The resized image is sent to the convolution neural network (CNN). Because model conversion API does not support image decoding, the preprocessing part of the model should be cut off, using the ``input`` command-line parameter. - -.. code-block:: sh - - mo \ - --input_model=model/path/frozen_graph.pb \ - --input="map/TensorArrayStack/TensorArrayGatherV3:0[1,32,86,1]" \ - --output "transpose_1,transpose_2" \ - --output_dir path/to/ir/ - - -Where: - -* ``map/TensorArrayStack/TensorArrayGatherV3:0[1 32 86 1]`` - name of node producing tensor after preprocessing. -* ``transpose_1`` - name of the node producing tensor with predicted characters. -* ``transpose_2`` - name of the node producing tensor with predicted characters probabilities. 
- diff --git a/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-conversion-api/[legacy]-supported-model-formats/[legacy]-conversion-tutorials/convert-tensorflow-bert.rst b/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-conversion-api/[legacy]-supported-model-formats/[legacy]-conversion-tutorials/convert-tensorflow-bert.rst deleted file mode 100644 index 197b6e13c4e27a..00000000000000 --- a/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-conversion-api/[legacy]-supported-model-formats/[legacy]-conversion-tutorials/convert-tensorflow-bert.rst +++ /dev/null @@ -1,170 +0,0 @@ -Converting a TensorFlow BERT Model -================================== - - -.. meta:: - :description: Learn how to convert a BERT model - from TensorFlow to the OpenVINO Intermediate Representation. - - -.. danger:: - - The code described here has been **deprecated!** Do not use it to avoid working with a legacy solution. It will be kept for some time to ensure backwards compatibility, but **you should not use** it in contemporary applications. - - This guide describes a deprecated conversion method. The guide on the new and recommended method can be found in the :doc:`Python tutorials <../../../../../../learn-openvino/interactive-tutorials-python>`. - -Pretrained models for BERT (Bidirectional Encoder Representations from Transformers) are -`publicly available `__. - -.. _supported_models: - -Supported Models -################ - -The following models from the pretrained `BERT model list `__ are currently supported: - -* ``BERT-Base, Cased`` -* ``BERT-Base, Uncased`` -* ``BERT-Base, Multilingual Cased`` -* ``BERT-Base, Multilingual Uncased`` -* ``BERT-Base, Chinese`` -* ``BERT-Large, Cased`` -* ``BERT-Large, Uncased`` - -Downloading the Pretrained BERT Model -##################################### - -Download and unzip an archive with the `BERT-Base, Multilingual Uncased Model `__. - -After the archive is unzipped, the directory ``uncased_L-12_H-768_A-12`` is created and contains the following files: - -* ``bert_config.json`` -* ``bert_model.ckpt.data-00000-of-00001`` -* ``bert_model.ckpt.index`` -* ``bert_model.ckpt.meta`` -* ``vocab.txt`` - -Pretrained model meta-graph files are ``bert_model.ckpt.*``. - -Converting a TensorFlow BERT Model to IR -######################################### - -To generate the BERT Intermediate Representation (IR) of the model, run model conversion with the following parameters: - -.. code-block:: sh - - mo \ - --input_meta_graph uncased_L-12_H-768_A-12/bert_model.ckpt.meta \ - --output bert/pooler/dense/Tanh \ - --input Placeholder{i32},Placeholder_1{i32},Placeholder_2{i32} - - -Pretrained models are not suitable for batch reshaping out-of-the-box because of multiple hardcoded shapes in the model. - -Converting a Reshapable TensorFlow BERT Model to OpenVINO IR -============================================================= - -Follow these steps to make a pretrained TensorFlow BERT model reshapable over the batch dimension: - -1. Download a pretrained BERT model you want to use from the `Supported Models list <#supported_models>`__. - -2. Clone the google-research/bert git repository: - - .. code-block:: sh - - git clone https://github.com/google-research/bert.git - -3. Go to the root directory of the cloned repository: - - .. code-block:: sh - - cd bert - -4. (Optional) Check out the commit that the conversion was tested on: - - .. code-block:: sh - - git checkout eedf5716c - -5. 
Download script to load GLUE data: - - * For UNIX-like systems, run the following command: - - .. code-block:: sh - - wget https://gist.githubusercontent.com/W4ngatang/60c2bdb54d156a41194446737ce03e2e/raw/17b8dd0d724281ed7c3b2aeeda662b92809aadd5/download_glue_data.py - - * For Windows systems: - - Download the `Python script `__ to the current working directory. - -6. Download GLUE data by running: - - .. code-block:: sh - - python3 download_glue_data.py --tasks MRPC - -7. Open the file ``modeling.py`` in the text editor and delete lines 923-924. They should look like this: - - .. code-block:: py - :force: - - if not non_static_indexes: - return shape - -8. Open the file ``run_classifier.py`` and insert the following code after the line 645: - - .. code-block:: py - :force: - - import os, sys - import tensorflow as tf - from tensorflow.python.framework import graph_io - with tf.compat.v1.Session(graph=tf.compat.v1.get_default_graph()) as sess: - (assignment_map, initialized_variable_names) = \ - modeling.get_assignment_map_from_checkpoint(tf.compat.v1.trainable_variables(), init_checkpoint) - tf.compat.v1.train.init_from_checkpoint(init_checkpoint, assignment_map) - sess.run(tf.compat.v1.global_variables_initializer()) - frozen = tf.compat.v1.graph_util.convert_variables_to_constants(sess, sess.graph_def, ["bert/pooler/dense/Tanh"]) - graph_io.write_graph(frozen, './', 'inference_graph.pb', as_text=False) - print('BERT frozen model path {}'.format(os.path.join(os.path.dirname(__file__), 'inference_graph.pb'))) - sys.exit(0) - - Lines before the inserted code should look like this: - - .. code-block:: py - :force: - - (total_loss, per_example_loss, logits, probabilities) = create_model( - bert_config, is_training, input_ids, input_mask, segment_ids, label_ids, - num_labels, use_one_hot_embeddings) - - -9. Set environment variables ``BERT_BASE_DIR``, ``BERT_REPO_DIR`` and run the script ``run_classifier.py`` to create ``inference_graph.pb`` file in the root of the cloned BERT repository. - - .. code-block:: sh - - export BERT_BASE_DIR=/path/to/bert/uncased_L-12_H-768_A-12 - export BERT_REPO_DIR=/current/working/directory - - python3 run_classifier.py \ - --task_name=MRPC \ - --do_eval=true \ - --data_dir=$BERT_REPO_DIR/glue_data/MRPC \ - --vocab_file=$BERT_BASE_DIR/vocab.txt \ - --bert_config_file=$BERT_BASE_DIR/bert_config.json \ - --init_checkpoint=$BERT_BASE_DIR/bert_model.ckpt \ - --output_dir=./ - - Run model conversion with the following command line parameters to generate reshape-able BERT Intermediate Representation (IR): - - .. code-block:: sh - - mo \ - --input_model inference_graph.pb \ - --input "IteratorGetNext:0{i32}[1,128],IteratorGetNext:1{i32}[1,128],IteratorGetNext:4{i32}[1,128]" - -For other applicable parameters, refer to the :doc:`Convert Model from TensorFlow <../[legacy]-convert-tensorflow>` guide. - -For more information about reshape abilities, refer to the :doc:`Using Shape Inference <../../../../../../openvino-workflow/running-inference/changing-input-shape>` guide. 
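As an illustration of that reshape ability (a sketch only; the batch value 4 is an arbitrary example), the same frozen graph could be converted with a different fixed batch by adjusting the input shapes:

.. code-block:: sh

   mo \
       --input_model inference_graph.pb \
       --input "IteratorGetNext:0{i32}[4,128],IteratorGetNext:1{i32}[4,128],IteratorGetNext:4{i32}[4,128]"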
- diff --git a/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-conversion-api/[legacy]-supported-model-formats/[legacy]-conversion-tutorials/convert-tensorflow-crnn.rst b/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-conversion-api/[legacy]-supported-model-formats/[legacy]-conversion-tutorials/convert-tensorflow-crnn.rst deleted file mode 100644 index a94d72b4508f3c..00000000000000 --- a/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-conversion-api/[legacy]-supported-model-formats/[legacy]-conversion-tutorials/convert-tensorflow-crnn.rst +++ /dev/null @@ -1,86 +0,0 @@ -Converting a TensorFlow CRNN Model -================================== - - -.. meta:: - :description: Learn how to convert a CRNN model - from TensorFlow to the OpenVINO Intermediate Representation. - - -.. danger:: - - The code described here has been **deprecated!** Do not use it to avoid working with a legacy solution. It will be kept for some time to ensure backwards compatibility, but **you should not use** it in contemporary applications. - - This guide describes a deprecated conversion method. The guide on the new and recommended method can be found in the :doc:`Python tutorials <../../../../../../learn-openvino/interactive-tutorials-python>`. - -This tutorial explains how to convert a CRNN model to OpenVINO™ Intermediate Representation (IR). - -There are several public versions of TensorFlow CRNN model implementation available on GitHub. This tutorial explains how to convert the model from -the `CRNN Tensorflow `__ repository to IR, and is validated with Python 3.7, TensorFlow 1.15.0, and protobuf 3.19.0. -If you have another implementation of CRNN model, it can be converted to OpenVINO IR in a similar way. You need to get inference graph and run model conversion of it. - -**To convert the model to IR:** - -**Step 1.** Clone this GitHub repository and check out the commit: - -1. Clone the repository: - - .. code-block:: sh - - git clone https://github.com/MaybeShewill-CV/CRNN_Tensorflow.git - -2. Go to the ``CRNN_Tensorflow`` directory of the cloned repository: - - .. code-block:: sh - - cd path/to/CRNN_Tensorflow - -3. Check out the necessary commit: - - .. code-block:: sh - - git checkout 64f1f1867bffaacfeacc7a80eebf5834a5726122 - - -**Step 2.** Train the model using the framework or the pretrained checkpoint provided in this repository. - - -**Step 3.** Create an inference graph: - -1. Add the ``CRNN_Tensorflow`` folder to ``PYTHONPATH``. - - * For Linux: - - .. code-block:: sh - - export PYTHONPATH="${PYTHONPATH}:/path/to/CRNN_Tensorflow/" - - - * For Windows, add ``/path/to/CRNN_Tensorflow/`` to the ``PYTHONPATH`` environment variable in settings. - -2. Edit the ``tools/demo_shadownet.py`` script. After ``saver.restore(sess=sess, save_path=weights_path)`` line, add the following code: - - .. code-block:: py - :force: - - from tensorflow.python.framework import graph_io - frozen = tf.graph_util.convert_variables_to_constants(sess, sess.graph_def, ['shadow/LSTMLayers/transpose_time_major']) - graph_io.write_graph(frozen, '.', 'frozen_graph.pb', as_text=False) - -3. Run the demo with the following command: - - .. 
code-block:: sh - - python tools/demo_shadownet.py --image_path data/test_images/test_01.jpg --weights_path model/shadownet/shadownet_2017-10-17-11-47-46.ckpt-199999 - - - If you want to use your checkpoint, replace the path in the ``--weights_path`` parameter with a path to your checkpoint. - -4. In the ``CRNN_Tensorflow`` directory, you will find the inference CRNN graph ``frozen_graph.pb``. You can use this graph with OpenVINO to convert the model to IR and then run inference. - -**Step 4.** Convert the model to IR: - -.. code-block:: sh - - mo --input_model path/to/your/CRNN_Tensorflow/frozen_graph.pb - diff --git a/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-conversion-api/[legacy]-supported-model-formats/[legacy]-conversion-tutorials/convert-tensorflow-deep-speech.rst b/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-conversion-api/[legacy]-supported-model-formats/[legacy]-conversion-tutorials/convert-tensorflow-deep-speech.rst deleted file mode 100644 index e572b26324faf3..00000000000000 --- a/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-conversion-api/[legacy]-supported-model-formats/[legacy]-conversion-tutorials/convert-tensorflow-deep-speech.rst +++ /dev/null @@ -1,108 +0,0 @@ -Converting a TensorFlow DeepSpeech Model -======================================== - - -.. meta:: - :description: Learn how to convert a DeepSpeech model - from TensorFlow to the OpenVINO Intermediate Representation. - -.. danger:: - - The code described here has been **deprecated!** Do not use it to avoid working with a legacy solution. It will be kept for some time to ensure backwards compatibility, but **you should not use** it in contemporary applications. - - This guide describes a deprecated conversion method. The guide on the new and recommended method can be found in the :doc:`Python tutorials <../../../../../../learn-openvino/interactive-tutorials-python>`. - -`DeepSpeech project `__ provides an engine to train speech-to-text models. - -Downloading the Pretrained DeepSpeech Model -########################################### - -Create a directory where model and metagraph with pretrained weights will be stored: - -.. code-block:: sh - - mkdir deepspeech - cd deepspeech - -`Pre-trained English speech-to-text model `__ is publicly available. -To download the model, follow the instruction below: - -* For UNIX-like systems, run the following command: - - .. code-block:: sh - - wget -O - https://github.com/mozilla/DeepSpeech/archive/v0.8.2.tar.gz | tar xvfz - - wget -O - https://github.com/mozilla/DeepSpeech/releases/download/v0.8.2/deepspeech-0.8.2-checkpoint.tar.gz | tar xvfz - - -* For Windows systems: - - 1. Download `the archive with the model `__. - 2. Download the `TensorFlow MetaGraph with pre-trained weights `__. - 3. Unpack it with a file archiver application. - -Freezing the Model into a "\*.pb File" -###################################### - -After unpacking the archives above, you have to freeze the model. This requires -TensorFlow version 1, which is not available under Python 3.8, so you need Python 3.7 or lower. -Before freezing, deploy a virtual environment and install the required packages: - -.. code-block:: sh - - virtualenv --python=python3.7 venv-deep-speech - source venv-deep-speech/bin/activate - cd DeepSpeech-0.8.2 - pip3 install -e . - -Freeze the model with the following command: - -.. 
code-block:: sh - - python3 DeepSpeech.py --checkpoint_dir ../deepspeech-0.8.2-checkpoint --export_dir ../ - -After that, you will get the pretrained frozen model file ``output_graph.pb`` in the directory ``deepspeech`` created at -the beginning. The model contains the preprocessing and main parts. The first preprocessing part performs conversion of input -spectrogram into a form useful for speech recognition (mel). This part of the model is not convertible into -the IR because it contains unsupported operations ``AudioSpectrogram`` and ``Mfcc``. - -The main and most computationally expensive part of the model converts the preprocessed audio into text. -There are two specificities with the supported part of the model. - -The first is that the model contains an input with sequence length. So the model can be converted with -a fixed input length shape, thus the model is not reshapable. -Refer to the :doc:`Using Shape Inference <../../../../../../openvino-workflow/running-inference/changing-input-shape>` guide. - -The second is that the frozen model still has two variables: ``previous_state_c`` and ``previous_state_h``, figure -with the frozen \*.pb model is below. It means that the model keeps training these variables at each inference. - -.. image:: ../../../../../../assets/images/DeepSpeech-0.8.2.png - -At the first inference, the variables are initialized with zero tensors. After execution, the results of the ``BlockLSTM`` -are assigned to cell state and hidden state, which are these two variables. - -Converting the Main Part of DeepSpeech Model into OpenVINO IR -############################################################# - -Model conversion API assumes that the output model is for inference only. That is why you should cut ``previous_state_c`` and ``previous_state_h`` variables off and resolve keeping cell and hidden states on the application level. - -There are certain limitations for the model conversion: - -* Time length (``time_len``) and sequence length (``seq_len``) are equal. -* Original model cannot be reshaped, so you should keep original shapes. - -To generate the IR, run model conversion with the following parameters: - -.. code-block:: sh - - mo \ - --input_model output_graph.pb \ - --input "input_lengths->[16],input_node[1,16,19,26],previous_state_h[1,2048],previous_state_c[1,2048]" \ - --output "cudnn_lstm/rnn/multi_rnn_cell/cell_0/cudnn_compatible_lstm_cell/GatherNd_1,cudnn_lstm/rnn/multi_rnn_cell/cell_0/cudnn_compatible_lstm_cell/GatherNd,logits" - - -Where: - -* ``input_lengths->[16]`` Replaces the input node with name "input_lengths" with a constant tensor of shape [1] with a single integer value of 16. This means that the model now can consume input sequences of length 16 only. -* ``input_node[1 16 19 26],previous_state_h[1 2048],previous_state_c[1 2048]`` replaces the variables with a placeholder. -* ``output ".../GatherNd_1,.../GatherNd,logits"`` output node names. 
- diff --git a/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-conversion-api/[legacy]-supported-model-formats/[legacy]-conversion-tutorials/convert-tensorflow-efficient-det.rst b/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-conversion-api/[legacy]-supported-model-formats/[legacy]-conversion-tutorials/convert-tensorflow-efficient-det.rst deleted file mode 100644 index c894765a5dc604..00000000000000 --- a/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-conversion-api/[legacy]-supported-model-formats/[legacy]-conversion-tutorials/convert-tensorflow-efficient-det.rst +++ /dev/null @@ -1,90 +0,0 @@ -Converting TensorFlow EfficientDet Models -========================================= - - -.. meta:: - :description: Learn how to convert an EfficientDet model - from TensorFlow to the OpenVINO Intermediate Representation. - - -.. danger:: - - The code described here has been **deprecated!** Do not use it to avoid working with a legacy solution. It will be kept for some time to ensure backwards compatibility, but **you should not use** it in contemporary applications. - - This guide describes a deprecated conversion method. The guide on the new and recommended method can be found in the :doc:`Python tutorials <../../../../../../learn-openvino/interactive-tutorials-python>`. - -This tutorial explains how to convert EfficientDet public object detection models to the Intermediate Representation (IR). - -.. _efficientdet-to-ir: - -Converting EfficientDet Model to the IR -####################################### - -There are several public versions of EfficientDet model implementation available on GitHub. This tutorial explains how to -convert models from the `repository `__ (commit 96e1fee) to the OpenVINO format. - -Download and extract the model checkpoint `efficientdet-d4.tar.gz `__ -referenced in the **"Pretrained EfficientDet Checkpoints"** section of the model repository: - -.. code-block:: sh - - wget https://storage.googleapis.com/cloud-tpu-checkpoints/efficientdet/coco2/efficientdet-d4.tar.gz - tar zxvf efficientdet-d4.tar.gz - -Converting an EfficientDet TensorFlow Model to the IR -+++++++++++++++++++++++++++++++++++++++++++++++++++++ - -To generate the IR of the EfficientDet TensorFlow model, run: - -.. code-block:: sh - - mo \ - --input_meta_graph efficientdet-d4/model.meta \ - --input_shape [1,$IMAGE_SIZE,$IMAGE_SIZE,3] \ - --reverse_input_channels - - -Where ``$IMAGE_SIZE`` is the size that the input image of the original TensorFlow model will be resized to. Different -EfficientDet models were trained with different input image sizes. To determine the right one, refer to the ``efficientdet_model_param_dict`` -dictionary in the `hparams_config.py `__ file. -The attribute ``image_size`` specifies the shape to be defined for the model conversion. - -.. note:: - - The color channel order (RGB or BGR) of an input data should match the channel order of the model training dataset. If they are different, perform the ``RGB<->BGR`` conversion specifying the command-line parameter: ``--reverse_input_channels``. Otherwise, inference results may be incorrect. For more information about the parameter, refer to the **When to Reverse Input Channels** section of the :doc:`Converting a Model to Intermediate Representation (IR) <../../[legacy]-setting-input-shapes>` guide. - -OpenVINO toolkit provides samples that can be used to infer EfficientDet model. 
-For more information, refer to the `Open Model Zoo Demos `__. - -.. important:: - - Due to the deprecation of Open Model Zoo, models in the OpenVINO IR format are now - published on `Hugging Face `__. - - -Interpreting Results of the TensorFlow Model and the IR -####################################################### - -The TensorFlow model produces as output a list of 7-element tuples: ``[image_id, y_min, x_min, y_max, x_max, confidence, class_id]``, where: - -* ``image_id`` -- image batch index. -* ``y_min`` -- absolute ``y`` coordinate of the lower left corner of the detected object. -* ``x_min`` -- absolute ``x`` coordinate of the lower left corner of the detected object. -* ``y_max`` -- absolute ``y`` coordinate of the upper right corner of the detected object. -* ``x_max`` -- absolute ``x`` coordinate of the upper right corner of the detected object. -* ``confidence`` -- the confidence of the detected object. -* ``class_id`` -- the id of the detected object class counted from 1. - -The output of the IR is a list of 7-element tuples: ``[image_id, class_id, confidence, x_min, y_min, x_max, y_max]``, where: - -* ``image_id`` -- image batch index. -* ``class_id`` -- the id of the detected object class counted from 0. -* ``confidence`` -- the confidence of the detected object. -* ``x_min`` -- normalized ``x`` coordinate of the lower left corner of the detected object. -* ``y_min`` -- normalized ``y`` coordinate of the lower left corner of the detected object. -* ``x_max`` -- normalized ``x`` coordinate of the upper right corner of the detected object. -* ``y_max`` -- normalized ``y`` coordinate of the upper right corner of the detected object. - -The first element with ``image_id = -1`` means end of data. - - diff --git a/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-conversion-api/[legacy]-supported-model-formats/[legacy]-conversion-tutorials/convert-tensorflow-face-net.rst b/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-conversion-api/[legacy]-supported-model-formats/[legacy]-conversion-tutorials/convert-tensorflow-face-net.rst deleted file mode 100644 index a528718349f717..00000000000000 --- a/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-conversion-api/[legacy]-supported-model-formats/[legacy]-conversion-tutorials/convert-tensorflow-face-net.rst +++ /dev/null @@ -1,42 +0,0 @@ -Converting TensorFlow FaceNet Models -==================================== - - -.. meta:: - :description: Learn how to convert a FaceNet model - from TensorFlow to the OpenVINO Intermediate Representation. - -.. danger:: - - The code described here has been **deprecated!** Do not use it to avoid working with a legacy solution. It will be kept for some time to ensure backwards compatibility, but **you should not use** it in contemporary applications. - - This guide describes a deprecated conversion method. The guide on the new and recommended method can be found in the :doc:`Supported Model Formats <../../../../../../openvino-workflow/model-preparation>` article. - -`Public pre-trained FaceNet models `__ contain both the training -and inference parts of the graph. Switching between these two states is managed with a placeholder value. -Intermediate Representation (IR) models are intended for inference, which means that the training part is redundant.
- -There are two inputs in this network: a boolean ``phase_train``, which manages the state of the graph (train/infer), and -``batch_size``, which is part of the batch joining pattern. - -.. image:: ../../../../../../assets/images/FaceNet.svg - -Converting a TensorFlow FaceNet Model to the IR -############################################### - -To generate a FaceNet OpenVINO model, feed a TensorFlow FaceNet model to the model conversion API with the following parameters: - -.. code-block:: sh - - mo \ - --input_model path_to_model/model_name.pb \ - --freeze_placeholder_with_value "phase_train->False" - - -The batch joining pattern transforms to a placeholder with the model default shape if ``--input_shape`` or ``--batch``/``-b`` are not provided. Otherwise, the placeholder shape has custom parameters. - -* ``--freeze_placeholder_with_value "phase_train->False"`` switches the graph to inference mode -* ``--batch``/``-b`` is applicable to override the original network batch -* ``--input_shape`` is applicable with or without ``--input`` -* other options are applicable - diff --git a/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-conversion-api/[legacy]-supported-model-formats/[legacy]-conversion-tutorials/convert-tensorflow-gnmt.rst b/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-conversion-api/[legacy]-supported-model-formats/[legacy]-conversion-tutorials/convert-tensorflow-gnmt.rst deleted file mode 100644 index b8d2c592ed931d..00000000000000 --- a/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-conversion-api/[legacy]-supported-model-formats/[legacy]-conversion-tutorials/convert-tensorflow-gnmt.rst +++ /dev/null @@ -1,315 +0,0 @@ -Converting a TensorFlow GNMT Model -================================== - - -.. meta:: - :description: Learn how to convert a GNMT model - from TensorFlow to the OpenVINO Intermediate Representation. - -.. danger:: - - The code described here has been **deprecated!** Do not use it to avoid working with a legacy solution. It will be kept for some time to ensure backwards compatibility, but **you should not use** it in contemporary applications. - - This guide describes a deprecated conversion method. The guide on the new and recommended method can be found in the :doc:`Python tutorials <../../../../../../learn-openvino/interactive-tutorials-python>`. - -This tutorial explains how to convert the Google Neural Machine Translation (GNMT) model to the Intermediate Representation (IR). - -There are several public versions of the TensorFlow GNMT model implementation available on GitHub. This tutorial explains how to convert the GNMT model from the `TensorFlow Neural Machine Translation (NMT) repository `__ to the IR. - -Creating a Patch File -##################### - -Before converting the model, you need to create a patch file for the repository. The patch modifies the framework code by adding a special command-line argument to the framework options that enables inference graph dumping: - -1. Go to a writable directory and create a ``GNMT_inference.patch`` file. -2. Copy the following diff code to the file: - - .. code-block:: py - - diff --git a/nmt/inference.py b/nmt/inference.py - index 2cbef07..e185490 100644 - --- a/nmt/inference.py - +++ b/nmt/inference.py - @@ -17,9 +17,11 @@ - from __future__ import print_function - - import codecs - +import os - import time - - import tensorflow as tf - +from tensorflow.python.framework import graph_io - - from . import attention_model - from . 
import gnmt_model - @@ -105,6 +107,29 @@ def start_sess_and_load_model(infer_model, ckpt_path): - return sess, loaded_infer_model - - - +def inference_dump_graph(ckpt_path, path_to_dump, hparams, scope=None): - + model_creator = get_model_creator(hparams) - + infer_model = model_helper.create_infer_model(model_creator, hparams, scope) - + sess = tf.Session( - + graph=infer_model.graph, config=utils.get_config_proto()) - + with infer_model.graph.as_default(): - + loaded_infer_model = model_helper.load_model( - + infer_model.model, ckpt_path, sess, "infer") - + utils.print_out("Dumping inference graph to {}".format(path_to_dump)) - + loaded_infer_model.saver.save( - + sess, - + os.path.join(path_to_dump + 'inference_GNMT_graph') - + ) - + utils.print_out("Dumping done!") - + - + output_node_name = 'index_to_string_Lookup' - + utils.print_out("Freezing GNMT graph with output node {}...".format(output_node_name)) - + frozen = tf.graph_util.convert_variables_to_constants(sess, sess.graph_def, - + [output_node_name]) - + graph_io.write_graph(frozen, '.', os.path.join(path_to_dump, 'frozen_GNMT_inference_graph.pb'), as_text=False) - + utils.print_out("Freezing done. Freezed model frozen_GNMT_inference_graph.pb saved to {}".format(path_to_dump)) - + - + - def inference(ckpt_path, - inference_input_file, - inference_output_file, - diff --git a/nmt/nmt.py b/nmt/nmt.py - index f5823d8..a733748 100644 - --- a/nmt/nmt.py - +++ b/nmt/nmt.py - @@ -310,6 +310,13 @@ def add_arguments(parser): - parser.add_argument("--num_intra_threads", type=int, default=0, - help="number of intra_op_parallelism_threads") - - + # Special argument for inference model dumping without inference - + parser.add_argument("--dump_inference_model", type="bool", nargs="?", - + const=True, default=False, - + help="Argument for dump inference graph for specified trained ckpt") - + - + parser.add_argument("--path_to_dump", type=str, default="", - + help="Path to dump inference graph.") - - def create_hparams(flags): - """Create training hparams.""" - @@ -396,6 +403,9 @@ def create_hparams(flags): - language_model=flags.language_model, - num_intra_threads=flags.num_intra_threads, - num_inter_threads=flags.num_inter_threads, - + - + dump_inference_model=flags.dump_inference_model, - + path_to_dump=flags.path_to_dump, - ) - - - @@ -613,7 +623,7 @@ def create_or_load_hparams( - return hparams - - - -def run_main(flags, default_hparams, train_fn, inference_fn, target_session=""): - +def run_main(flags, default_hparams, train_fn, inference_fn, inference_dump, target_session=""): - """Run main.""" - # Job - jobid = flags.jobid - @@ -653,8 +663,26 @@ def run_main(flags, default_hparams, train_fn, inference_fn, target_session=""): - out_dir, default_hparams, flags.hparams_path, - save_hparams=(jobid == 0)) - - - ## Train / Decode - - if flags.inference_input_file: - + # Dumping inference model - + if flags.dump_inference_model: - + # Inference indices - + hparams.inference_indices = None - + if flags.inference_list: - + (hparams.inference_indices) = ( - + [int(token) for token in flags.inference_list.split(",")]) - + - + # Ckpt - + ckpt = flags.ckpt - + if not ckpt: - + ckpt = tf.train.latest_checkpoint(out_dir) - + - + # Path to dump graph - + assert flags.path_to_dump != "", "Please, specify path_to_dump model." 
- + path_to_dump = flags.path_to_dump - + if not tf.gfile.Exists(path_to_dump): tf.gfile.MakeDirs(path_to_dump) - + - + inference_dump(ckpt, path_to_dump, hparams) - + elif flags.inference_input_file: - # Inference output directory - trans_file = flags.inference_output_file - assert trans_file - @@ -693,7 +721,8 @@ def main(unused_argv): - default_hparams = create_hparams(FLAGS) - train_fn = train.train - inference_fn = inference.inference - - run_main(FLAGS, default_hparams, train_fn, inference_fn) - + inference_dump = inference.inference_dump_graph - + run_main(FLAGS, default_hparams, train_fn, inference_fn, inference_dump) - - - if __name__ == "__main__": - - -3. Save and close the file. - -Converting a GNMT Model to the IR -################################# - -.. note:: Use TensorFlow version 1.13 or lower. - -**Step 1**. Clone the GitHub repository and check out the commit: - -1. Clone the NMT repository: - - .. code-block:: sh - - git clone https://github.com/tensorflow/nmt.git - -2. Check out the necessary commit: - - .. code-block:: sh - - git checkout b278487980832417ad8ac701c672b5c3dc7fa553 - - -**Step 2**. Get a trained model. You have two options: - -* Train the model with the GNMT ``wmt16_gnmt_4_layer.json`` or ``wmt16_gnmt_8_layer.json`` configuration file using the NMT framework. -* *Do not use the pre-trained checkpoints provided in the NMT repository, as they are outdated and can be incompatible with the current repository version.* - -This tutorial assumes the use of the trained GNMT model from ``wmt16_gnmt_4_layer.json`` config, German to English translation. - -**Step 3**. Create an inference graph: - -The OpenVINO assumes that a model is used for inference only. Hence, before converting the model into the IR, you need to transform the training graph into the inference graph. -For the GNMT model, the training graph and the inference graph have different decoders: the training graph uses a greedy search decoding algorithm, while the inference graph uses a beam search decoding algorithm. - -1. Apply the ``GNMT_inference.patch`` patch to the repository. `Create a Patch File <#Creating-a-Patch-File>`__ instructions if you do not have it: - - .. code-block:: sh - - git apply /path/to/patch/GNMT_inference.patch - - -2. Run the NMT framework to dump the inference model: - - .. code-block:: sh - - python -m nmt.nmt - --src=de - --tgt=en - --ckpt=/path/to/ckpt/translate.ckpt - --hparams_path=/path/to/repository/nmt/nmt/standard_hparams/wmt16_gnmt_4_layer.json - --vocab_prefix=/path/to/vocab/vocab.bpe.32000 - --out_dir="" - --dump_inference_model - --infer_mode beam_search - --path_to_dump /path/to/dump/model/ - - -If you use different checkpoints, use the corresponding values for the ``src``, ``tgt``, ``ckpt``, ``hparams_path``, and ``vocab_prefix`` parameters. -Inference checkpoint ``inference_GNMT_graph`` and frozen inference graph ``frozen_GNMT_inference_graph.pb`` will appear in the ``/path/to/dump/model/`` folder. - -To generate ``vocab.bpe.32000``, execute the ``nmt/scripts/wmt16_en_de.sh`` script. If you face an issue of a size mismatch between the checkpoint graph's embedding layer and vocabulary (both src and target), make sure you add the following code to the ``nmt.py`` file to the ``extend_hparams`` function after the line 508 (after initialization of the ``src_vocab_size`` and ``tgt_vocab_size`` variables): - -.. code-block:: py - :force: - - src_vocab_size -= 1 - tgt_vocab_size -= 1 - - -**Step 4**. Convert the model to the IR: - -.. 
code-block:: sh - - mo - --input_model /path/to/dump/model/frozen_GNMT_inference_graph.pb - --input "IteratorGetNext:1{i32}[1],IteratorGetNext:0{i32}[1,50],dynamic_seq2seq/hash_table_Lookup_1:0[1]->[2],dynamic_seq2seq/hash_table_Lookup:0[1]->[1]" - --output dynamic_seq2seq/decoder/decoder/GatherTree - --output_dir /path/to/output/IR/ - - -Input and output cutting with the ``--input`` and ``--output`` options is required since OpenVINO™ does not support ``IteratorGetNext`` and ``LookupTableFindV2`` operations. - -Input cutting: - -* ``IteratorGetNext`` operation iterates over a dataset. It is cut by output ports: port 0 contains data tensor with shape ``[batch_size, max_sequence_length]``, port 1 contains ``sequence_length`` for every batch with shape ``[batch_size]``. - -* ``LookupTableFindV2`` operations (``dynamic_seq2seq/hash_table_Lookup_1`` and ``dynamic_seq2seq/hash_table_Lookup`` nodes in the graph) are cut with constant values). - -Output cutting: - -* ``LookupTableFindV2`` operation is cut from the output and the ``dynamic_seq2seq/decoder/decoder/GatherTree`` node is treated as a new exit point. - -For more information about model cutting, refer to the :doc:`Cutting Off Parts of a Model <../../[legacy]-cutting-parts-of-a-model>` guide. - -Using a GNMT Model -################## - -.. note:: - - This step assumes you have converted a model to the Intermediate Representation. - -Inputs of the model: - -* ``IteratorGetNext/placeholder_out_port_0`` input with shape ``[batch_size, max_sequence_length]`` contains ``batch_size`` decoded input sentences. Every sentence is decoded the same way as indices of sentence elements in vocabulary and padded with index of ``eos`` (end of sentence symbol). If the length of the sentence is less than ``max_sequence_length``, remaining elements are filled with index of ``eos`` token. - -* ``IteratorGetNext/placeholder_out_port_1`` input with shape ``[batch_size]`` contains sequence lengths for every sentence from the first input. For example, if ``max_sequence_length = 50``, ``batch_size = 1`` and the sentence has only 30 elements, then the input tensor for ``IteratorGetNext/placeholder_out_port_1`` should be ``[30]``. - - -Outputs of the model: - -* ``dynamic_seq2seq/decoder/decoder/GatherTree`` tensor with shape ``[max_sequence_length * 2, batch, beam_size]``, - that contains ``beam_size`` best translations for every sentence from input (also decoded as indices of words in - vocabulary). - -.. note:: - The shape of this tensor in TensorFlow can be different: instead of ``max_sequence_length * 2``, it can be any value less than that, because OpenVINO does not support dynamic shapes of outputs, while TensorFlow can stop decoding iterations when ``eos`` symbol is generated. - -Running GNMT IR ---------------- - -1. With benchmark app: - - .. code-block:: sh - - benchmark_app -m -d CPU - - -2. With OpenVINO Runtime Python API: - - .. note:: - - Before running the example, insert a path to your GNMT ``.xml`` and ``.bin`` files into ``MODEL_PATH`` and ``WEIGHTS_PATH``, and fill ``input_data_tensor`` and ``seq_lengths`` tensors according to your input data. - - .. 
code-block:: py - :force: - - from openvino.inference_engine import IENetwork, IECore - - MODEL_PATH = '/path/to/IR/frozen_GNMT_inference_graph.xml' - WEIGHTS_PATH = '/path/to/IR/frozen_GNMT_inference_graph.bin' - - # Creating network - net = IENetwork( - model=MODEL_PATH, - weights=WEIGHTS_PATH) - - # Creating input data - input_data = {'IteratorGetNext/placeholder_out_port_0': input_data_tensor, - 'IteratorGetNext/placeholder_out_port_1': seq_lengths} - - # Creating plugin and loading extensions - ie = IECore() - ie.add_extension(extension_path="libcpu_extension.so", device_name="CPU") - - # Loading network - exec_net = ie.load_network(network=net, device_name="CPU") - - # Run inference - result_ie = exec_net.infer(input_data) - - -For more information about Python API, refer to the :doc:`OpenVINO Runtime Python API <../../../../../../api/ie_python_api/api>` guide. - diff --git a/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-conversion-api/[legacy]-supported-model-formats/[legacy]-conversion-tutorials/convert-tensorflow-language-1b.rst b/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-conversion-api/[legacy]-supported-model-formats/[legacy]-conversion-tutorials/convert-tensorflow-language-1b.rst deleted file mode 100644 index 1b51809f9d1b6b..00000000000000 --- a/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-conversion-api/[legacy]-supported-model-formats/[legacy]-conversion-tutorials/convert-tensorflow-language-1b.rst +++ /dev/null @@ -1,131 +0,0 @@ -Converting a TensorFlow Language Model on One Billion Word Benchmark -==================================================================== - - -.. meta:: - :description: Learn how to convert a TensorFlow Language - Model on One Billion Word Benchmark to the OpenVINO Intermediate - Representation. - -.. danger:: - - The code described here has been **deprecated!** Do not use it to avoid working with a legacy solution. It will be kept for some time to ensure backwards compatibility, but **you should not use** it in contemporary applications. - - This guide describes a deprecated conversion method. The guide on the new and recommended method can be found in the :doc:`Python tutorials <../../../../../../learn-openvino/interactive-tutorials-python>`. - -Downloading a Pre-trained Language Model on One Billion Word Benchmark -###################################################################### - -TensorFlow provides a pretrained `Language Model on One Billion Word Benchmark `__. - -To download the model for IR conversion, follow the instructions: - -1. Create new directory to store the model: - - .. code-block:: sh - - mkdir lm_1b - -2. Go to the ``lm_1b`` directory: - - .. code-block:: sh - - cd lm_1b - -3. Download the model GraphDef file: - - .. code-block:: sh - - wget http://download.tensorflow.org/models/LM_LSTM_CNN/graph-2016-09-10.pbtxt - -4. Create new directory to store 12 checkpoint shared files: - - .. code-block:: sh - - mkdir ckpt - -5. Go to the ``ckpt`` directory: - - .. code-block:: sh - - cd ckpt - -6. Download 12 checkpoint shared files: - - .. 
code-block:: sh - - wget http://download.tensorflow.org/models/LM_LSTM_CNN/all_shards-2016-09-10/ckpt-base - wget http://download.tensorflow.org/models/LM_LSTM_CNN/all_shards-2016-09-10/ckpt-char-embedding - wget http://download.tensorflow.org/models/LM_LSTM_CNN/all_shards-2016-09-10/ckpt-lstm - wget http://download.tensorflow.org/models/LM_LSTM_CNN/all_shards-2016-09-10/ckpt-softmax0 - wget http://download.tensorflow.org/models/LM_LSTM_CNN/all_shards-2016-09-10/ckpt-softmax1 - wget http://download.tensorflow.org/models/LM_LSTM_CNN/all_shards-2016-09-10/ckpt-softmax2 - wget http://download.tensorflow.org/models/LM_LSTM_CNN/all_shards-2016-09-10/ckpt-softmax3 - wget http://download.tensorflow.org/models/LM_LSTM_CNN/all_shards-2016-09-10/ckpt-softmax4 - wget http://download.tensorflow.org/models/LM_LSTM_CNN/all_shards-2016-09-10/ckpt-softmax5 - wget http://download.tensorflow.org/models/LM_LSTM_CNN/all_shards-2016-09-10/ckpt-softmax6 - wget http://download.tensorflow.org/models/LM_LSTM_CNN/all_shards-2016-09-10/ckpt-softmax7 - wget http://download.tensorflow.org/models/LM_LSTM_CNN/all_shards-2016-09-10/ckpt-softmax8 - - -Once you have downloaded the pretrained model files, you will have the ``lm_1b`` directory with the following hierarchy: - -.. code-block:: sh - - lm_1b/ - graph-2016-09-10.pbtxt - ckpt/ - ckpt-base - ckpt-char-embedding - ckpt-lstm - ckpt-softmax0 - ckpt-softmax1 - ckpt-softmax2 - ckpt-softmax3 - ckpt-softmax4 - ckpt-softmax5 - ckpt-softmax6 - ckpt-softmax7 - ckpt-softmax8 - - - -.. image:: ../../../../../../assets/images/lm_1b.svg - -The frozen model still has two variables: ``Variable`` and ``Variable_1``. -It means that the model keeps training those variables at each inference. - -At the first inference of this graph, the variables are initialized by initial values. -After executing the ``lstm`` nodes, results of execution are assigned to these two variables. - -With each inference of the ``lm_1b`` graph, ``lstm`` initial states data is taken from previous inference -from variables, and states of current inference of ``lstm`` is reassigned to the same variables. - -It helps the model to remember the context of the words that it takes as input. - -Converting a TensorFlow Language Model on One Billion Word Benchmark to IR -########################################################################## - -Model Optimizer assumes that output model is for inference only. -Therefore, you should cut those variables off and resolve keeping cell and hidden states on application level. - -There is a certain limitation for the model conversion: the original model cannot be reshaped, so you should keep original shapes. - -To generate the ``lm_1b`` Intermediate Representation (IR), provide TensorFlow ``lm_1b`` model to the -Model Optimizer with parameters: - -.. code-block:: sh - - mo - --input_model lm_1b/graph-2016-09-10.pbtxt \ - --input_checkpoint lm_1b/ckpt \ - --input_model_is_text \ - --input_shape [50],[50],[1,9216],[1,9216] \ - --output softmax_out,lstm/lstm_0/concat_2,lstm/lstm_1/concat_2 \ - --input char_embedding/EmbeddingLookupUnique/Unique:0,char_embedding/EmbeddingLookupUnique/Unique:1,Variable/read,Variable_1/read - -Where: - -* ``--input char_embedding/EmbeddingLookupUnique/Unique:0,char_embedding/EmbeddingLookupUnique/Unique:1,Variable/read,Variable_1/read`` and ``--input_shape [50],[50],[1,9216],[1,9216]`` replace the variables with a placeholder. 
-* ``--output softmax_out,lstm/lstm_0/concat_2,lstm/lstm_1/concat_2`` specifies output node name and names of LSTM cell states. - diff --git a/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-conversion-api/[legacy]-supported-model-formats/[legacy]-conversion-tutorials/convert-tensorflow-ncf.rst b/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-conversion-api/[legacy]-supported-model-formats/[legacy]-conversion-tutorials/convert-tensorflow-ncf.rst deleted file mode 100644 index a8592e75d65b31..00000000000000 --- a/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-conversion-api/[legacy]-supported-model-formats/[legacy]-conversion-tutorials/convert-tensorflow-ncf.rst +++ /dev/null @@ -1,68 +0,0 @@ -Converting a TensorFlow Neural Collaborative Filtering Model -============================================================ - - -.. meta:: - :description: Learn how to convert a Neural Collaborative - Filtering Model from TensorFlow to the OpenVINO Intermediate - Representation. - -.. danger:: - - The code described here has been **deprecated!** Do not use it to avoid working with a legacy solution. It will be kept for some time to ensure backwards compatibility, but **you should not use** it in contemporary applications. - - This guide describes a deprecated conversion method. The guide on the new and recommended method can be found in the :doc:`Python tutorials <../../../../../../learn-openvino/interactive-tutorials-python>`. - -This tutorial explains how to convert Neural Collaborative Filtering (NCF) model to the OpenVINO Intermediate Representation. - -`Public TensorFlow NCF model `__ does not contain pre-trained weights. To convert this model to the IR: - -1. Use `the instructions `__ from this repository to train the model. - -2. Freeze the inference graph you get in the previous step in ``model_dir``, following the instructions from the **Freezing Custom Models in Python** section of the :doc:`Converting a TensorFlow Model <../[legacy]-convert-tensorflow>` guide. - - Run the following commands: - - .. code-block:: py - :force: - - import tensorflow as tf - from tensorflow.python.framework import graph_io - - sess = tf.compat.v1.Session() - saver = tf.compat.v1.train.import_meta_graph("/path/to/model/model.meta") - saver.restore(sess, tf.train.latest_checkpoint('/path/to/model/')) - - frozen = tf.compat.v1.graph_util.convert_variables_to_constants(sess, sess.graph_def, \ - ["rating/BiasAdd"]) - graph_io.write_graph(frozen, './', 'inference_graph.pb', as_text=False) - - where ``rating/BiasAdd`` is an output node. - -3. Convert the model to the OpenVINO format. If you look at your frozen model, you can see that it has one input that is split into four ``ResourceGather`` layers. (Click image to zoom in.) - - .. image:: ../../../../../../assets/images/NCF_start.svg - - However, as the model conversion API does not support such data feeding, you should skip it. Cut - the edges incoming in ``ResourceGather`` port 1: - - .. code-block:: sh - - mo --input_model inference_graph.pb \ - --input 1:embedding/embedding_lookup,1:embedding_1/embedding_lookup, \ - 1:embedding_2/embedding_lookup,1:embedding_3/embedding_lookup \ - --input_shape [256],[256],[256],[256] \ - --output_dir - - In the ``input_shape`` parameter, 256 specifies the ``batch_size`` for your model. - -Alternatively, you can do steps 2 and 3 in one command line: - -.. 
code-block:: sh - - mo --input_meta_graph /path/to/model/model.meta \ - --input 1:embedding/embedding_lookup,1:embedding_1/embedding_lookup, \ - 1:embedding_2/embedding_lookup,1:embedding_3/embedding_lookup \ - --input_shape [256],[256],[256],[256] --output rating/BiasAdd \ - --output_dir - diff --git a/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-conversion-api/[legacy]-supported-model-formats/[legacy]-conversion-tutorials/convert-tensorflow-object-detection.rst b/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-conversion-api/[legacy]-supported-model-formats/[legacy]-conversion-tutorials/convert-tensorflow-object-detection.rst deleted file mode 100644 index ad321a4abb3cda..00000000000000 --- a/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-conversion-api/[legacy]-supported-model-formats/[legacy]-conversion-tutorials/convert-tensorflow-object-detection.rst +++ /dev/null @@ -1,184 +0,0 @@ -Converting TensorFlow Object Detection API Models -================================================= - - -.. meta:: - :description: Learn how to convert Object Detection - API Models from TensorFlow to the OpenVINO Intermediate - Representation. - - -.. danger:: - - The code described here has been **deprecated!** Do not use it to avoid working with a legacy solution. It will be kept for some time to ensure backwards compatibility, but **you should not use** it in contemporary applications. - - This guide describes a deprecated conversion method. The guide on the new and recommended method can be found in the :doc:`Python tutorials <../../../../../../learn-openvino/interactive-tutorials-python>`. - -* Starting with the 2022.1 release, model conversion API can convert the TensorFlow Object Detection API Faster and Mask RCNNs topologies differently. By default, model conversion adds operation "Proposal" to the generated IR. This operation needs an additional input to the model with name "image_info" which should be fed with several values describing the preprocessing applied to the input image (refer to the :doc:`Proposal <../../../../../openvino-ir-format/operation-sets/operation-specs/detection/proposal-4>` operation specification for more information). However, this input is redundant for the models trained and inferred with equal size images. Model conversion API can generate IR for such models and insert operation :doc:`DetectionOutput <../../../../../openvino-ir-format/operation-sets/operation-specs/detection/detectionoutput-1>` instead of ``Proposal``. The `DetectionOutput` operation does not require additional model input "image_info". Moreover, for some models the produced inference results are closer to the original TensorFlow model. In order to trigger new behavior, the attribute "operation_to_add" in the corresponding JSON transformation configuration file should be set to value "DetectionOutput" instead of default one "Proposal". -* Starting with the 2021.1 release, model conversion API converts the TensorFlow Object Detection API SSDs, Faster and Mask RCNNs topologies keeping shape-calculating sub-graphs by default, so topologies can be re-shaped in the OpenVINO Runtime using dedicated reshape API. Refer to the :doc:`Using Shape Inference <../../../../../../openvino-workflow/running-inference/changing-input-shape>` guide for more information on how to use this feature. It is possible to change the both spatial dimensions of the input image and batch size. 
-* To generate IRs for TF 1 SSD topologies, model conversion API creates a number of ``PriorBoxClustered`` operations instead of a constant node with prior boxes calculated for the particular input image size. This change allows you to reshape the topology in the OpenVINO Runtime using dedicated API. The reshaping is supported for all SSD topologies except FPNs, which contain hardcoded shapes for some operations preventing from changing topology input shape. - -Converting a Model -################## - -You can download TensorFlow Object Detection API models from the `TensorFlow 1 Detection Model Zoo `__ or `TensorFlow 2 Detection Model Zoo `__. - -.. note:: - - Before converting, make sure you have configured model conversion API. For configuration steps, refer to the :doc:`Convert a Model <../../../legacy-conversion-api>`. - -To convert a TensorFlow Object Detection API model, run the ``mo`` command with the following required parameters: - -* ``input_model `` - File with a pretrained model (binary or text .pb file after freezing) OR ``saved_model_dir `` for the TensorFlow 2 models -* ``transformations_config `` - A subgraph replacement configuration file with transformations description. For the models downloaded from the TensorFlow Object Detection API zoo, you can find the configuration files in the ``/openvino/tools/mo/front/tf`` directory. Use: - - * ``ssd_v2_support.json`` - for frozen SSD topologies from the models zoo version up to 1.13.X inclusively - * ``ssd_support_api_v.1.14.json`` - for SSD topologies trained using the TensorFlow Object Detection API version 1.14 up to 1.14.X inclusively - * ``ssd_support_api_v.1.15.json`` - for SSD topologies trained using the TensorFlow Object Detection API version 1.15 up to 2.0 - * ``ssd_support_api_v.2.0.json`` - for SSD topologies trained using the TensorFlow Object Detection API version 2.0 up to 2.3.X inclusively - * ``ssd_support_api_v.2.4.json`` - for SSD topologies trained using the TensorFlow Object Detection API version 2.4 or higher - * ``efficient_det_support_api_v.2.0.json`` - for EfficientDet topologies trained using the TensorFlow Object Detection API version 2.0 up to 2.3.X inclusively - * ``efficient_det_support_api_v.2.4.json`` - for EfficientDet topologies trained using the TensorFlow Object Detection API version 2.4 or higher - * ``faster_rcnn_support.json`` - for Faster R-CNN topologies from the TF 1.X models zoo trained with TensorFlow version up to 1.6.X inclusively - * ``faster_rcnn_support_api_v1.7.json`` - for Faster R-CNN topologies trained using the TensorFlow Object Detection API version 1.7.0 up to 1.9.X inclusively - * ``faster_rcnn_support_api_v1.10.json`` - for Faster R-CNN topologies trained using the TensorFlow Object Detection API version 1.10.0 up to 1.12.X inclusively - * ``faster_rcnn_support_api_v1.13.json`` - for Faster R-CNN topologies trained using the TensorFlow Object Detection API version 1.13.X - * ``faster_rcnn_support_api_v1.14.json`` - for Faster R-CNN topologies trained using the TensorFlow Object Detection API version 1.14.0 up to 1.14.X inclusively - * ``faster_rcnn_support_api_v1.15.json`` - for Faster R-CNN topologies trained using the TensorFlow Object Detection API version 1.15.0 up to 2.0 - * ``faster_rcnn_support_api_v2.0.json`` - for Faster R-CNN topologies trained using the TensorFlow Object Detection API version 2.0 up to 2.3.X inclusively - * ``faster_rcnn_support_api_v2.4.json`` - for Faster R-CNN topologies trained using the TensorFlow Object Detection API version 2.4 or 
higher - * ``mask_rcnn_support.json`` - for Mask R-CNN topologies from the TF 1.X models zoo trained with TensorFlow version 1.9.0 or lower. - * ``mask_rcnn_support_api_v1.7.json`` - for Mask R-CNN topologies trained using the TensorFlow Object Detection API version 1.7.0 up to 1.9.X inclusively - * ``mask_rcnn_support_api_v1.11.json`` - for Mask R-CNN topologies trained using the TensorFlow Object Detection API version 1.11.0 up to 1.12.X inclusively - * ``mask_rcnn_support_api_v1.13.json`` - for Mask R-CNN topologies trained using the TensorFlow Object Detection API version 1.13.0 up to 1.13.X inclusively - * ``mask_rcnn_support_api_v1.14.json`` - for Mask R-CNN topologies trained using the TensorFlow Object Detection API version 1.14.0 up to 1.14.X inclusively - * ``mask_rcnn_support_api_v1.15.json`` - for Mask R-CNN topologies trained using the TensorFlow Object Detection API version 1.15.0 up to 2.0 - * ``mask_rcnn_support_api_v2.0.json`` - for Mask R-CNN topologies trained using the TensorFlow Object Detection API version 2.0 up to 2.3.X inclusively - * ``mask_rcnn_support_api_v2.4.json`` - for Mask R-CNN topologies trained using the TensorFlow Object Detection API version 2.4 or higher - * ``rfcn_support.json`` - for RFCN topology from the models zoo trained with TensorFlow version up to 1.9.X inclusively - * ``rfcn_support_api_v1.10.json`` - for RFCN topology from the models zoo frozen with TensorFlow version 1.10.0 up to 1.12.X inclusively - * ``rfcn_support_api_v1.13.json`` - for RFCN topology from the models zoo frozen with TensorFlow version 1.13.X - * ``rfcn_support_api_v1.14.json`` - for RFCN topology from the models zoo frozen with TensorFlow version 1.14.0 or higher - -* ``tensorflow_object_detection_api_pipeline_config `` - A special configuration file that describes the topology hyper-parameters and structure of the TensorFlow Object Detection API model. For the models downloaded from the TensorFlow Object Detection API zoo, the configuration file is named ``pipeline.config``. If you plan to train a model yourself, you can find templates for these files in the `models repository `__. -* ``input_shape`` (optional) - A custom input image shape. For more information how the ``input_shape`` parameter is handled for the TensorFlow Object Detection API models, refer to the `Custom Input Shape <#Custom-Input-Shape>`__ guide. - -.. note:: - - The color channel order (RGB or BGR) of an input data should match the channel order of the model training dataset. If they are different, perform the ``RGB<->BGR`` conversion specifying the command-line parameter: ``reverse_input_channels``. Otherwise, inference results may be incorrect. If you convert a TensorFlow Object Detection API model to use with the OpenVINO sample applications, you must specify the ``reverse_input_channels`` parameter. For more information about the parameter, refer to the **When to Reverse Input Channels** section of the :doc:`Converting a Model to Intermediate Representation (IR) <../../[legacy]-setting-input-shapes>` guide. - -Additionally to the mandatory parameters listed above you can use optional conversion parameters if needed. A full list of parameters is available in the :doc:`Converting a TensorFlow Model <../[legacy]-convert-tensorflow>` guide. - -For example, if you downloaded the pre-trained `SSD InceptionV2 topology `__ and extracted archive to the directory ``/tmp/ssd_inception_v2_coco_2018_01_28``, the sample command line to convert the model looks as follows: - -.. 
code-block:: sh - - mo --input_model=/tmp/ssd_inception_v2_coco_2018_01_28/frozen_inference_graph.pb --transformations_config front/tf/ssd_v2_support.json --tensorflow_object_detection_api_pipeline_config /tmp/ssd_inception_v2_coco_2018_01_28/pipeline.config --reverse_input_channels - - -OpenVINO™ Toolkit Samples and Open Model Zoo Demos -################################################## - -OpenVINO comes with a number of samples to demonstrate use of OpenVINO Runtime API. Additionally, -Open Model Zoo provides set of demo applications to show implementation of close to real life applications, -based on deep learning in various tasks, including Image Classification, Visual Object Detection, Text Recognition, -Speech Recognition, Natural Language Processing and others. Refer to the links below for more details. - -* :doc:`OpenVINO Samples <../../../../../../learn-openvino/openvino-samples>` -* :doc:`Open Model Zoo Demos <../../../../model-zoo>` - -.. important:: - - Due to the deprecation of Open Model Zoo, models in the OpenVINO IR format are now - published on `Hugging Face `__. - -Feeding Input Images to the Samples -################################### - -There are several important notes about feeding input images to the samples: - -1. OpenVINO samples stretch input image to the size of the input operation without preserving aspect ratio. This behavior is usually correct for most topologies (including SSDs), but incorrect for other models like Faster R-CNN, Mask R-CNN and R-FCN. These models usually use keeps aspect ratio resizer. The type of preprocessing is defined in the pipeline configuration file in the section ``image_resizer``. If keeping aspect ratio is used, then it is necessary to resize image before passing it to the sample and optionally pad the resized image with 0s (if the attribute "pad_to_max_dimension" in the pipeline.config is equal to "true"). - -2. TensorFlow implementation of image resize may be different from the one implemented in the sample. Even reading input image from compressed format (like ``.jpg``) could give different results in the sample and TensorFlow. If it is necessary to compare accuracy between the TensorFlow and the OpenVINO, it is recommended to pass pre-resized input image in a non-compressed format (like ``.bmp``). - -3. If you want to infer the model with the OpenVINO samples, convert the model specifying the ``reverse_input_channels`` command line parameter. The samples load images in BGR channels order, while TensorFlow models were trained with images in RGB order. When the ``reverse_input_channels`` command line parameter is specified, model conversion API performs first convolution or other channel dependent operation weights modification so the output will be like the image is passed with RGB channels order. - -4. Read carefully the messages printed by model conversion API. They contain important instructions on how to prepare input data before running the inference and how to interpret the output. - -Custom Input Shape -################## - -Model conversion handles the command line parameter ``input_shape`` for TensorFlow Object Detection API models in a special way depending on the image resizer type defined in the ``pipeline.config`` file. TensorFlow Object Detection API generates different ``Preprocessor`` sub-graph based on the image resizer type. Model conversion API supports two types of image resizer: - -* ``fixed_shape_resizer`` --- *Stretches* input image to the specific height and width. 
The ``pipeline.config`` snippet below shows a ``fixed_shape_resizer`` sample definition: - - .. code-block:: sh - - image_resizer { - fixed_shape_resizer { - height: 300 - width: 300 - } - } - -* ``keep_aspect_ratio_resizer`` --- Resizes the input image *keeping aspect ratio* to satisfy the minimum and maximum size constraints. The ``pipeline.config`` snippet below shows a ``keep_aspect_ratio_resizer`` sample definition: - - .. code-block:: sh - - image_resizer { - keep_aspect_ratio_resizer { - min_dimension: 600 - max_dimension: 1024 - } - } - -If an additional parameter "pad_to_max_dimension" is equal to "true", then the resized image will be padded with 0s to the square image of size "max_dimension". - -Fixed Shape Resizer Replacement -+++++++++++++++++++++++++++++++ - -* If the ``input_shape`` command line parameter is not specified, model conversion generates an input operation with the height and width as defined in the ``pipeline.config``. - -* If the ``input_shape [1, H, W, 3]`` command line parameter is specified, model conversion sets the input operation height to ``H`` and width to ``W`` and convert the model. However, the conversion may fail because of the following reasons: - - * The model is not reshape-able, meaning that it's not possible to change the size of the model input image. For example, SSD FPN models have ``Reshape`` operations with hard-coded output shapes, but the input size to these ``Reshape`` instances depends on the input image size. In this case, model conversion API shows an error during the shape inference phase. Run model conversion with ``log_level DEBUG`` to see the inferred operations output shapes to see the mismatch. - * Custom input shape is too small. For example, if you specify ``input_shape [1,100,100,3]`` to convert a SSD Inception V2 model, one of convolution or pooling nodes decreases input tensor spatial dimensions to non-positive values. In this case, model conversion API shows error message like this: '[ ERROR ] Shape [ 1 -1 -1 256] is not fully defined for output X of "node_name".' - - -Keeping Aspect Ratio Resizer Replacement -++++++++++++++++++++++++++++++++++++++++ - -* If the ``input_shape`` command line parameter is not specified, model conversion API generates an input operation with both height and width equal to the value of parameter ``min_dimension`` in the ``keep_aspect_ratio_resizer``. - -* If the ``input_shape [1, H, W, 3]`` command line parameter is specified, model conversion API scales the specified input image height ``H`` and width ``W`` to satisfy the ``min_dimension`` and ``max_dimension`` constraints defined in the ``keep_aspect_ratio_resizer``. The following function calculates the input operation height and width: - - .. code-block:: py - :force: - - def calculate_shape_keeping_aspect_ratio(H: int, W: int, min_dimension: int, max_dimension: int): - ratio_min = min_dimension / min(H, W) - ratio_max = max_dimension / max(H, W) - ratio = min(ratio_min, ratio_max) - return int(round(H * ratio)), int(round(W * ratio)) - -The ``input_shape`` command line parameter should be specified only if the "pad_to_max_dimension" does not exist of is set to "false" in the ``keep_aspect_ratio_resizer``. - -Models with ``keep_aspect_ratio_resizer`` were trained to recognize object in real aspect ratio, in contrast with most of the classification topologies trained to recognize objects stretched vertically and horizontally as well. By default, topologies are converted with ``keep_aspect_ratio_resizer`` to consume a square input image. 
If the non-square image is provided as input, it is stretched without keeping aspect ratio that results to object detection quality decrease. - -.. note:: - - It is highly recommended to specify the ``input_shape`` command line parameter for the models with ``keep_aspect_ratio_resizer``, if the input image dimensions are known in advance. - -Model Conversion Process in Detail -################################## - -This section is intended for users who want to understand how model conversion API performs Object Detection API models conversion in details. The information in this section is also useful for users having complex models that are not converted with model conversion API out of the box. It is highly recommended to read the **Graph Transformation Extensions** section in the :doc:`[Legacy] Model Optimizer Extensibility <../../../legacy-model-optimizer-extensibility>` documentation first to understand sub-graph replacement concepts which are used here. - -It is also important to open the model in the `TensorBoard `__ to see the topology structure. Model conversion API can create an event file that can be then fed to the TensorBoard tool. Run model conversion, providing two command line parameters: - -* ``input_model `` --- Path to the frozen model. -* ``tensorboard_logdir`` --- Path to the directory where TensorBoard looks for the event files. - -Implementation of the transformations for Object Detection API models is located in the `file `__. Refer to the code in this file to understand the details of the conversion process. - - diff --git a/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-conversion-api/[legacy]-supported-model-formats/[legacy]-conversion-tutorials/convert-tensorflow-retina-net.rst b/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-conversion-api/[legacy]-supported-model-formats/[legacy]-conversion-tutorials/convert-tensorflow-retina-net.rst deleted file mode 100644 index db2c6424367f58..00000000000000 --- a/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-conversion-api/[legacy]-supported-model-formats/[legacy]-conversion-tutorials/convert-tensorflow-retina-net.rst +++ /dev/null @@ -1,31 +0,0 @@ -Converting a TensorFlow RetinaNet Model -======================================= - - -.. meta:: - :description: Learn how to convert a RetinaNet model - from TensorFlow to the OpenVINO Intermediate Representation. - - -.. danger:: - - The code described here has been **deprecated!** Do not use it to avoid working with a legacy solution. It will be kept for some time to ensure backwards compatibility, but **you should not use** it in contemporary applications. - - This guide describes a deprecated conversion method. The guide on the new and recommended method can be found in the :doc:`Python ../../../../../../learn-openvino/interactive-tutorials-python <../../../../../../learn-openvino/interactive-tutorials-python>`. - -This tutorial explains how to convert a RetinaNet model to the Intermediate Representation (IR). - -`Public RetinaNet model `__ does not contain pretrained TensorFlow weights. -To convert this model to the TensorFlow format, follow the `Reproduce Keras to TensorFlow Conversion tutorial `__. - -After converting the model to TensorFlow format, run the following command: - -.. 
code-block:: sh - - mo --input "input_1[1,1333,1333,3]" --input_model retinanet_resnet50_coco_best_v2.1.0.pb --transformations_config front/tf/retinanet.json - - -Where ``transformations_config`` command-line parameter specifies the configuration json file containing model conversion hints for model conversion API. -The json file contains some parameters that need to be changed if you train the model yourself. It also contains information on how to match endpoints -to replace the subgraph nodes. After the model is converted to the OpenVINO IR format, the output nodes will be replaced with DetectionOutput layer. - diff --git a/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-conversion-api/[legacy]-supported-model-formats/[legacy]-conversion-tutorials/convert-tensorflow-slim-library.rst b/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-conversion-api/[legacy]-supported-model-formats/[legacy]-conversion-tutorials/convert-tensorflow-slim-library.rst deleted file mode 100644 index 847d44fce813b1..00000000000000 --- a/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-conversion-api/[legacy]-supported-model-formats/[legacy]-conversion-tutorials/convert-tensorflow-slim-library.rst +++ /dev/null @@ -1,117 +0,0 @@ -Converting TensorFlow Slim Image Classification Model Library Models -==================================================================== - - -.. meta:: - :description: Learn how to convert a Slim Image - Classification model from TensorFlow to the OpenVINO - Intermediate Representation. - -.. danger:: - - The code described here has been **deprecated!** Do not use it to avoid working with a legacy solution. It will be kept for some time to ensure backwards compatibility, but **you should not use** it in contemporary applications. - - This guide describes a deprecated conversion method. The guide on the new and recommended method can be found in the :doc:`Python tutorials <../../../../../../learn-openvino/interactive-tutorials-python>`. - -`TensorFlow-Slim Image Classification Model Library `__ is a library to define, train and evaluate classification models in TensorFlow. The library contains Python scripts defining the classification topologies together with checkpoint files for several pre-trained classification topologies. To convert a TensorFlow-Slim library model, complete the following steps: - -1. Download the TensorFlow-Slim models `git repository `__. -2. Download the pre-trained model `checkpoint `__. -3. Export the inference graph. -4. Convert the model using model conversion API. - -The `Example of an Inception V1 Model Conversion <#example_of_an_inception_v1_model_conversion>`__ below illustrates the process of converting an Inception V1 Model. - -Example of an Inception V1 Model Conversion -########################################### - -This example demonstrates how to convert the model on Linux OSes, but it could be easily adopted for the Windows OSes. - -**Step 1**. Create a new directory to clone the TensorFlow-Slim git repository to: - -.. code-block:: sh - - mkdir tf_models - -.. code-block:: sh - - git clone https://github.com/tensorflow/models.git tf_models - - -**Step 2**. Download and unpack the `Inception V1 model checkpoint file `__: - -.. code-block:: sh - - wget http://download.tensorflow.org/models/inception_v1_2016_08_28.tar.gz - -.. code-block:: sh - - tar xzvf inception_v1_2016_08_28.tar.gz - -**Step 3**. 
Export the inference graph --- the protobuf file (``.pb``) containing the architecture of the topology. This file *does not* contain the neural network weights and cannot be used for inference. - -.. code-block:: sh - - python3 tf_models/research/slim/export_inference_graph.py \ - --model_name inception_v1 \ - --output_file inception_v1_inference_graph.pb - - -Model conversion API comes with the summarize graph utility, which identifies graph input and output nodes. Run the utility to determine input/output nodes of the Inception V1 model: - -.. code-block:: sh - - python3 /openvino/tools/mo/utils/summarize_graph.py --input_model ./inception_v1_inference_graph.pb - -The output looks as follows: - -.. code-block:: sh - - 1 input(s) detected: - Name: input, type: float32, shape: (-1,224,224,3) - 1 output(s) detected: - InceptionV1/Logits/Predictions/Reshape_1 - -The tool finds one input node with name ``input``, type ``float32``, fixed image size ``(224,224,3)`` and undefined batch size ``-1``. The output node name is ``InceptionV1/Logits/Predictions/Reshape_1``. - -**Step 4**. Convert the model with the model conversion API: - -.. code-block:: sh - - mo --input_model ./inception_v1_inference_graph.pb --input_checkpoint ./inception_v1.ckpt -b 1 --mean_value [127.5,127.5,127.5] --scale 127.5 - - -The ``-b`` command line parameter is required because model conversion API cannot convert a model with undefined input size. - -For the information on why ``--mean_values`` and ``--scale`` command-line parameters are used, refer to the `Mean and Scale Values for TensorFlow-Slim Models <#Mean-and-Scale-Values-for-TensorFlow-Slim-Models>`__. - -Mean and Scale Values for TensorFlow-Slim Models -################################################# - -The TensorFlow-Slim Models were trained with normalized input data. There are several different normalization algorithms used in the Slim library. OpenVINO classification sample does not perform image pre-processing except resizing to the input layer size. It is necessary to pass mean and scale values to model conversion API so they are embedded into the generated IR in order to get correct classification results. - -The file `preprocessing_factory.py `__ contains a dictionary variable ``preprocessing_fn_map`` defining mapping between the model type and pre-processing function to be used. The function code should be analyzed to figure out the mean/scale values. - -The `inception_preprocessing.py `__ file defines the pre-processing function for the Inception models. The ``preprocess_for_eval`` function contains the following code: - -.. code-block:: py - :force: - - ... - import tensorflow as tf - if image.dtype != tf.float32: - image = tf.image.convert_image_dtype(image, dtype=tf.float32) - ... - image = tf.subtract(image, 0.5) - image = tf.multiply(image, 2.0) - return image - - -Firstly, the ``image`` is converted to data type `tf.float32` and the values in the tensor are scaled to the ``[0, 1]`` range using the `tf.image.convert_image_dtype `__ function. Then the ``0.5`` is subtracted from the image values and values multiplied by ``2.0``. The final image range of values is ``[-1, 1]``. - -OpenVINO classification sample reads an input image as a three-dimensional array of integer values from the range ``[0, 255]``. In order to scale them to ``[-1, 1]`` range, the mean value ``127.5`` for each image channel should be specified as well as a scale factor ``127.5``. - -Similarly, the mean/scale values can be determined for other Slim models. 
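As a quick sanity check (this snippet is illustrative and not part of the original Slim repository), the following NumPy sketch confirms that the ``mean_value``/``scale`` pair of ``127.5`` embedded into the IR reproduces the ``(x / 255 - 0.5) * 2`` normalization performed by ``preprocess_for_eval`` for pixel values in the ``[0, 255]`` range:

.. code-block:: py
   :force:

   # Illustrative check: the mean/scale pair (127.5, 127.5) is equivalent to the
   # Slim Inception normalization (x / 255 - 0.5) * 2 applied to [0, 255] pixels.
   import numpy as np

   pixels = np.arange(0, 256, dtype=np.float32)          # raw pixel values read by the sample
   slim_normalized = (pixels / 255.0 - 0.5) * 2.0        # preprocess_for_eval: [0, 1] -> [-1, 1]
   ir_normalized = (pixels - 127.5) / 127.5              # effect of --mean_values 127.5 --scale 127.5

   assert np.allclose(slim_normalized, ir_normalized)    # both map [0, 255] to [-1, 1]

The same approach can be used to derive the mean/scale pair for any other Slim preprocessing function: express its normalization as ``(x - mean) / scale`` over the ``[0, 255]`` input range and pass the resulting values to model conversion.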
- -The exact mean/scale values are defined in the table with list of supported TensorFlow-Slim models at the :doc:`Converting a TensorFlow Model <../[legacy]-convert-tensorflow>` guide. - diff --git a/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-conversion-api/[legacy]-supported-model-formats/[legacy]-conversion-tutorials/convert-tensorflow-wide-and-deep-family.rst b/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-conversion-api/[legacy]-supported-model-formats/[legacy]-conversion-tutorials/convert-tensorflow-wide-and-deep-family.rst deleted file mode 100644 index d2f83fa12d8e67..00000000000000 --- a/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-conversion-api/[legacy]-supported-model-formats/[legacy]-conversion-tutorials/convert-tensorflow-wide-and-deep-family.rst +++ /dev/null @@ -1,166 +0,0 @@ -Converting TensorFlow Wide and Deep Family Models -================================================= - - -.. meta:: - :description: Learn how to convert Wide and Deep Family - models from TensorFlow to the OpenVINO Intermediate Representation. - - -.. danger:: - - The code described here has been **deprecated!** Do not use it to avoid working with a legacy solution. It will be kept for some time to ensure backwards compatibility, but **you should not use** it in contemporary applications. - - This guide describes a deprecated conversion method. The guide on the new and recommended method can be found in the :doc:`Python tutorials <../../../../../../learn-openvino/interactive-tutorials-python>`. - -The Wide and Deep models is a combination of wide and deep parts for memorization and generalization of object features respectively. -These models can contain different types of object features such as numerical, categorical, sparse and sequential features. These feature types are specified -through Tensorflow tf.feature_column API. Table below presents what feature types are supported by the OpenVINO toolkit. - -.. list-table:: - :header-rows: 1 - - * - numeric - - (weighted) categorical - - categorical with hash - - bucketized - - sequential - - crossed - * - yes - - yes - - no - - yes - - yes - - no - - -.. note:: The categorical with hash and crossed features are currently unsupported since OpenVINO does not cover tensors of the `string` type and operations with them. - -Preparing an Example of Wide and Deep Model -########################################### - -**Step 1**. Clone the GitHub repository with TensorFlow models and move to the directory with an example of Wide and Deep model: - -.. code-block:: sh - - git clone https://github.com/tensorflow/models.git --branch r2.2.0; - cd official/r1/wide_deep - - -The Wide and Deep model is no longer in the master branch of the repository but is still available in the r2.2.0 branch. - - -**Step 2**. Train the model - -As the OpenVINO™ toolkit does not support the categorical with hash and crossed features, such feature types must be switched off in the model -by changing the ``build_model_columns()`` function in `census_dataset.py` as follows: - -.. 
code-block:: py - :force: - - def build_model_columns(): - """Builds a set of wide and deep feature columns.""" - # Continuous variable columns - age = tf.feature_column.numeric_column('age') - education_num = tf.feature_column.numeric_column('education_num') - capital_gain = tf.feature_column.numeric_column('capital_gain') - capital_loss = tf.feature_column.numeric_column('capital_loss') - hours_per_week = tf.feature_column.numeric_column('hours_per_week') - education = tf.feature_column.categorical_column_with_vocabulary_list( - 'education', [ - 'Bachelors', 'HS-grad', '11th', 'Masters', '9th', 'Some-college', - 'Assoc-acdm', 'Assoc-voc', '7th-8th', 'Doctorate', 'Prof-school', - '5th-6th', '10th', '1st-4th', 'Preschool', '12th']) - marital_status = tf.feature_column.categorical_column_with_vocabulary_list( - 'marital_status', [ - 'Married-civ-spouse', 'Divorced', 'Married-spouse-absent', - 'Never-married', 'Separated', 'Married-AF-spouse', 'Widowed']) - relationship = tf.feature_column.categorical_column_with_vocabulary_list( - 'relationship', [ - 'Husband', 'Not-in-family', 'Wife', 'Own-child', 'Unmarried', - 'Other-relative']) - workclass = tf.feature_column.categorical_column_with_vocabulary_list( - 'workclass', [ - 'Self-emp-not-inc', 'Private', 'State-gov', 'Federal-gov', - 'Local-gov', '?', 'Self-emp-inc', 'Without-pay', 'Never-worked']) - # To show an example of hashing: - #occupation = tf.feature_column.categorical_column_with_hash_bucket( - # 'occupation', hash_bucket_size=_HASH_BUCKET_SIZE) - # Transformations. - age_buckets = tf.feature_column.bucketized_column( - age, boundaries=[18, 25, 30, 35, 40, 45, 50, 55, 60, 65]) - # Wide columns and deep columns. - base_columns = [ - education, marital_status, relationship, workclass, - age_buckets, - ] - crossed_columns = [] - wide_columns = base_columns + crossed_columns - deep_columns = [ - age, - education_num, - capital_gain, - capital_loss, - hours_per_week, - tf.feature_column.indicator_column(workclass), - tf.feature_column.indicator_column(education), - tf.feature_column.indicator_column(marital_status), - tf.feature_column.indicator_column(relationship), - # To show an example of embedding - ] - return wide_columns, deep_columns - -After that, start training with the following command: - -.. code-block:: sh - - python census_main.py - - -Converting the Wide and Deep Model to IR -######################################## - -Use the following command line to convert the saved model file with the checkpoint: - -.. 
code-block:: sh - - mo - --input_checkpoint checkpoint --input_meta_graph model.ckpt.meta - --input "IteratorGetNext:0[2], - IteratorGetNext:1[2], - IteratorGetNext:2[2], - IteratorGetNext:4[2], - IteratorGetNext:7[2], - linear/linear_model/linear_model/linear_model/education/to_sparse_input/indices:0[10,2]{i64}, - linear/linear_model/linear_model/linear_model/education/hash_table_Lookup/LookupTableFindV2:0[10]{i64}, - linear/linear_model/linear_model/linear_model/education/to_sparse_input/dense_shape:0[2]{i64}->[2,50], - linear/linear_model/linear_model/linear_model/marital_status/to_sparse_input/indices:0[10,2]{i64}, - linear/linear_model/linear_model/linear_model/marital_status/hash_table_Lookup/LookupTableFindV2:0[10]{i64}, - linear/linear_model/linear_model/linear_model/marital_status/to_sparse_input/dense_shape:0[2]{i64}->[2,50], - linear/linear_model/linear_model/linear_model/relationship/to_sparse_input/indices:0[10,2]{i64}, - linear/linear_model/linear_model/linear_model/relationship/hash_table_Lookup/LookupTableFindV2:0[10]{i64}, - linear/linear_model/linear_model/linear_model/relationship/to_sparse_input/dense_shape:0[2]{i64}->[2,50], - linear/linear_model/linear_model/linear_model/workclass/to_sparse_input/indices:0[10,2]{i64}, - linear/linear_model/linear_model/linear_model/workclass/hash_table_Lookup/LookupTableFindV2:0[10]{i64}, - linear/linear_model/linear_model/linear_model/workclass/to_sparse_input/dense_shape:0[2]{i64}->[2,50], - dnn/input_from_feature_columns/input_layer/education_indicator/to_sparse_input/indices:0[10,2]{i64}, - dnn/input_from_feature_columns/input_layer/education_indicator/hash_table_Lookup/LookupTableFindV2:0[10]{i64}, - dnn/input_from_feature_columns/input_layer/education_indicator/to_sparse_input/dense_shape:0[2]{i64}->[2,50], - dnn/input_from_feature_columns/input_layer/marital_status_indicator/to_sparse_input/indices:0[10,2]{i64}, - dnn/input_from_feature_columns/input_layer/marital_status_indicator/hash_table_Lookup/LookupTableFindV2:0[10]{i64}, - dnn/input_from_feature_columns/input_layer/marital_status_indicator/to_sparse_input/dense_shape:0[2]{i64}->[2,50], - dnn/input_from_feature_columns/input_layer/relationship_indicator/to_sparse_input/indices:0[10,2]{i64}, - dnn/input_from_feature_columns/input_layer/relationship_indicator/hash_table_Lookup/LookupTableFindV2:0[10]{i64}, - dnn/input_from_feature_columns/input_layer/relationship_indicator/to_sparse_input/dense_shape:0[2]{i64}->[2,50], - dnn/input_from_feature_columns/input_layer/workclass_indicator/to_sparse_input/indices:0[10,2]{i64}, - dnn/input_from_feature_columns/input_layer/workclass_indicator/hash_table_Lookup/LookupTableFindV2:0[10]{i64}, - dnn/input_from_feature_columns/input_layer/workclass_indicator/to_sparse_input/dense_shape:0[2]{i64}->[2,50]" - --output head/predictions/probabilities - - -The model contains operations unsupported by the OpenVINO™ toolkit such as ``IteratorGetNext`` and ``LookupTableFindV2``, so the Model Optimizer must prune these nodes. -The pruning is specified through `--input` option. The prunings for ``IteratorGetNext:*`` nodes correspond to numeric features. -The pruning for each categorical feature consists of three prunings for the following nodes: ``*/to_sparse_input/indices:0``, ``*/hash_table_Lookup/LookupTableFindV2:0``, and ``*/to_sparse_input/dense_shape:0``. 
- -The above command line generates an OpenVINO model for a batch of two objects, with the total number of actual categorical feature values equal to 10 and maximum size of a sparse categorical feature for one object equal to 50. - diff --git a/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-conversion-api/[legacy]-supported-model-formats/[legacy]-conversion-tutorials/convert-tensorflow-xlnet.rst b/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-conversion-api/[legacy]-supported-model-formats/[legacy]-conversion-tutorials/convert-tensorflow-xlnet.rst deleted file mode 100644 index 853614de85feed..00000000000000 --- a/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-conversion-api/[legacy]-supported-model-formats/[legacy]-conversion-tutorials/convert-tensorflow-xlnet.rst +++ /dev/null @@ -1,208 +0,0 @@ -Converting a TensorFlow XLNet Model -=================================== - - -.. meta:: - :description: Learn how to convert an XLNet model from - TensorFlow to the OpenVINO Intermediate Representation. - -.. danger:: - - The code described here has been **deprecated!** Do not use it to avoid working with a legacy solution. It will be kept for some time to ensure backwards compatibility, but **you should not use** it in contemporary applications. - - This guide describes a deprecated conversion method. The guide on the new and recommended method can be found in the :doc:`Python tutorials <../../../../../../learn-openvino/interactive-tutorials-python>`. - -Pretrained models for XLNet (Bidirectional Encoder Representations from Transformers) are -`publicly available `__. - -Supported Models -################ - -The following models from the pretrained `XLNet model list `__ are currently supported: - -* `XLNet-Large, Cased `__ -* `XLNet-Base, Cased `__ - -Downloading the Pretrained Base XLNet Model -########################################### - -Download and unzip an archive with the `XLNet-Base, Cased `__. - -After the archive is unzipped, the directory ``cased_L-12_H-768_A-12`` is created and contains the following files: - -* TensorFlow checkpoint (``xlnet_model.ckpt``), containing the pretrained weights (which is actually 3 files) -* sentence piece model (``spiece.model``) used for (de)tokenization -* config file (``xlnet_config.json``), which specifies the hyperparameters of the model - -To get pb-file from the archive contents, you need to do the following. - -1. Run commands - - .. code-block:: sh - - cd ~ - mkdir XLNet-Base - cd XLNet-Base - git clone https://github.com/zihangdai/xlnet - wget https://storage.googleapis.com/xlnet/released_models/cased_L-12_H-768_A-12.zip - unzip cased_L-12_H-768_A-12.zip - mkdir try_save - - -2. Save and run the following Python script in `~/XLNet-Base/xlnet`: - - .. note:: The original model repository has been tested with TensorFlow 1.13.1 under Python2. - - .. 
code-block:: py - :force: - - from collections import namedtuple - - import tensorflow as tf - from tensorflow.python.framework import graph_io - - import model_utils - import xlnet - - LENGTHS = 50 - BATCH = 1 - OUTPUT_DIR = '~/XLNet-Base/try_save/' - INIT_CKPT_PATH = '~/XLNet-Base/xlnet_cased_L-12_H-768_A-12/xlnet_model.ckpt' - XLNET_CONFIG_PATH = '~/XLNet-Base/xlnet_cased_L-12_H-768_A-12/xlnet_config.json' - - FLags = namedtuple('FLags', 'use_tpu init_checkpoint') - FLAGS = FLags(use_tpu=False, init_checkpoint=INIT_CKPT_PATH) - - xlnet_config = xlnet.XLNetConfig(json_path=XLNET_CONFIG_PATH) - run_config = xlnet.RunConfig(is_training=False, use_tpu=False, use_bfloat16=False, dropout=0.1, dropatt=0.1,) - - - sentence_features_input_idx = tf.compat.v1.placeholder(tf.int32, shape=[LENGTHS, BATCH], name='input_ids') - sentence_features_segment_ids = tf.compat.v1.placeholder(tf.int32, shape=[LENGTHS, BATCH], name='seg_ids') - sentence_features_input_mask = tf.compat.v1.placeholder(tf.float32, shape=[LENGTHS, BATCH], name='input_mask') - - with tf.compat.v1.Session() as sess: - xlnet_model = xlnet.XLNetModel(xlnet_config=xlnet_config, run_config=run_config, - input_ids=sentence_features_input_idx, - seg_ids=sentence_features_segment_ids, - input_mask=sentence_features_input_mask) - - sess.run(tf.compat.v1.global_variables_initializer()) - model_utils.init_from_checkpoint(FLAGS, True) - - # Save the variables to disk. - saver = tf.compat.v1.train.Saver() - - # Saving checkpoint - save_path = saver.save(sess, OUTPUT_DIR + "model.ckpt") - - # Freezing model - outputs = ['model/transformer/dropout_2/Identity'] - graph_def_freezed = tf.compat.v1.graph_util.convert_variables_to_constants(sess, sess.graph.as_graph_def(), outputs) - - # Saving non-frozen and frozen model to pb - graph_io.write_graph(sess.graph.as_graph_def(), OUTPUT_DIR, 'model.pb', as_text=False) - graph_io.write_graph(graph_def_freezed,OUTPUT_DIR, 'model_frozen.pb', - as_text=False) - - # Write to tensorboard - with tf.compat.v1.summary.FileWriter(logdir=OUTPUT_DIR, graph_def=graph_def_freezed) as writer: - writer.flush() - -Downloading the Pretrained Large XLNet Model -############################################ - -Download and unzip an archive with the `XLNet-Base, Cased `__. - -After unzipping the archive, the directory ``cased_L-12_H-1024_A-16`` is created and contains the following files: - -* TensorFlow checkpoint (``xlnet_model.ckpt``) containing the pretrained weights (which is actually 3 files) -* sentence piece model (``spiece.model``) used for (de)tokenization -* config file (``xlnet_config.json``) which specifies the hyperparameters of the model - -To get ``pb-file`` from the archive contents, follow the instructions below: - -1. Run commands - - .. code-block:: sh - - cd ~ - mkdir XLNet-Large - cd XLNet-Large - git clone https://github.com/zihangdai/xlnet - wget https://storage.googleapis.com/xlnet/released_models/cased_L-24_H-1024_A-16.zip - unzip cased_L-24_H-1024_A-16.zip - mkdir try_save - - -2. Save and run the following Python script in ``~/XLNet-Large/xlnet``: - - .. 
code-block:: py - :force: - - from collections import namedtuple - - import tensorflow as tf - from tensorflow.python.framework import graph_io - - import model_utils - import xlnet - - LENGTHS = 50 - BATCH = 1 - OUTPUT_DIR = '~/XLNet-Large/try_save' - INIT_CKPT_PATH = '~/XLNet-Large/cased_L-24_H-1024_A-16/xlnet_model.ckpt' - XLNET_CONFIG_PATH = '~/XLNet-Large/cased_L-24_H-1024_A-16/xlnet_config.json' - - FLags = namedtuple('FLags', 'use_tpu init_checkpoint') - FLAGS = FLags(use_tpu=False, init_checkpoint=INIT_CKPT_PATH) - - xlnet_config = xlnet.XLNetConfig(json_path=XLNET_CONFIG_PATH) - run_config = xlnet.RunConfig(is_training=False, use_tpu=False, use_bfloat16=False, dropout=0.1, dropatt=0.1,) - - - sentence_features_input_idx = tf.compat.v1.placeholder(tf.int32, shape=[LENGTHS, BATCH], name='input_ids') - sentence_features_segment_ids = tf.compat.v1.placeholder(tf.int32, shape=[LENGTHS, BATCH], name='seg_ids') - sentence_features_input_mask = tf.compat.v1.placeholder(tf.float32, shape=[LENGTHS, BATCH], name='input_mask') - - with tf.compat.v1.Session() as sess: - xlnet_model = xlnet.XLNetModel(xlnet_config=xlnet_config, run_config=run_config, - input_ids=sentence_features_input_idx, - seg_ids=sentence_features_segment_ids, - input_mask=sentence_features_input_mask) - - sess.run(tf.compat.v1.global_variables_initializer()) - model_utils.init_from_checkpoint(FLAGS, True) - - # Save the variables to disk. - saver = tf.compat.v1.train.Saver() - - # Saving checkpoint - save_path = saver.save(sess, OUTPUT_DIR + "model.ckpt") - - # Freezing model - outputs = ['model/transformer/dropout_2/Identity'] - graph_def_freezed = tf.compat.v1.graph_util.convert_variables_to_constants(sess, sess.graph.as_graph_def(), outputs) - - # Saving non-frozen and frozen model to pb - graph_io.write_graph(sess.graph.as_graph_def(), OUTPUT_DIR, 'model.pb', as_text=False) - graph_io.write_graph(graph_def_freezed,OUTPUT_DIR, 'model_frozen.pb', - as_text=False) - - # Write to tensorboard - with tf.compat.v1.summary.FileWriter(logdir=OUTPUT_DIR, graph_def=graph_def_freezed) as writer: - writer.flush() - - -The script should save into ``~/XLNet-Large/xlnet``. - -Converting a frozen TensorFlow XLNet Model to IR -################################################# - -To generate the XLNet Intermediate Representation (IR) of the model, run model conversion with the following parameters: - -.. code-block:: sh - - mo --input_model path-to-model/model_frozen.pb \ - --input "input_mask[50,1],input_ids[50,1],seg_ids[50,1]" - diff --git a/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-conversion-api/[legacy]-supported-model-formats/[legacy]-conversion-tutorials/convert-tensorflow-yolo.rst b/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-conversion-api/[legacy]-supported-model-formats/[legacy]-conversion-tutorials/convert-tensorflow-yolo.rst deleted file mode 100644 index e7e8072b1bda05..00000000000000 --- a/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-conversion-api/[legacy]-supported-model-formats/[legacy]-conversion-tutorials/convert-tensorflow-yolo.rst +++ /dev/null @@ -1,322 +0,0 @@ -Converting TensorFlow YOLO Models -================================= - - -.. meta:: - :description: Learn how to convert YOLO models from - TensorFlow to the OpenVINO Intermediate Representation. - -.. danger:: - - The code described here has been **deprecated!** Do not use it to avoid working with a legacy solution. 
It will be kept for some time to ensure backwards compatibility, but **you should not use** it in contemporary applications. - - This guide describes a deprecated conversion method. The guide on the new and recommended method can be found in the :doc:`Python tutorials <../../../../../../learn-openvino/interactive-tutorials-python>`. - -This document explains how to convert real-time object detection YOLOv1, YOLOv2, YOLOv3 and YOLOv4 public models to the Intermediate Representation (IR). All YOLO models are originally implemented in the DarkNet framework and consist of two files: - -* The ``.cfg`` file with model configurations -* The ``.weights`` file with model weights - -Depending on a YOLO model version, the ``convert_model()`` method converts it differently: - -- YOLOv4 must be first converted from Keras to TensorFlow 2. -- YOLOv3 has several implementations. This tutorial uses a TensorFlow implementation of YOLOv3 model, which can be directly converted to an IR. -- YOLOv1 and YOLOv2 models must be first converted to TensorFlow using DarkFlow. - -Converting a YOLOv4 Model to IR -############################### - -This section explains how to convert the YOLOv4 Keras model from the `repository `__ to an IR. To convert the YOLOv4 model, follow the instructions below: - -1. Download YOLOv4 weights and associated with it cfg file: - - - for YOLOv4 ( `weights `__ / `config file `__ ) - - for YOLOv4-tiny ( `weights `__ / `config file `__ ) - -2. Clone the repository with the YOLOv4 model: - - .. code-block:: sh - - git clone https://github.com/david8862/keras-YOLOv3-model-set - - -3. Convert the model to the TensorFlow 2 format: - - - for YOLOv4: - - .. code-block:: sh - - python keras-YOLOv3-model-set/tools/model_converter/convert.py /yolov4.cfg /yolov4.weights - - - - for YOLOv4-tiny: - - .. code-block:: sh - - python keras-YOLOv3-model-set/tools/model_converter/convert.py /yolov4-tiny.cfg /yolov4-tiny.weights - - -4. Run model conversion from the TensorFlow 2 to an IR format: - - .. note:: - - Before you run the conversion, make sure you have installed all the model conversion API dependencies for TensorFlow 2. - - If you get errors, you may need to add the additional step to divide the input by 255: - - .. code-block:: sh - - --scale_values=image_input[255] - - - .. code-block:: sh - - mo --saved_model_dir yolov4 --output_dir models/IRs --input_shape [1,608,608,3] --model_name yolov4 - - -Converting YOLOv3 Model to the OpenVINO format -############################################## - -There are several public versions of TensorFlow YOLOv3 model implementation available on GitHub. This section explains how to convert YOLOv3 model from -the `repository `__ (commit ed60b90) to an IR , but the process is similar for other versions of TensorFlow YOLOv3 model. - -Overview of YOLOv3 Model Architecture -+++++++++++++++++++++++++++++++++++++ - -Originally, YOLOv3 model includes feature extractor called ``Darknet-53`` with three branches at the end that make detections at three different scales. These branches must end with the YOLO ``Region`` layer. - -``Region`` layer was first introduced in the DarkNet framework. Other frameworks, including TensorFlow, do not have the ``Region`` implemented as a single layer, so every author of public YOLOv3 model creates it using simple layers. This badly affects performance. 
For this reason, the main idea of YOLOv3 model conversion to IR is to cut off these custom ``Region`` -like parts of the model and complete the model with the ``Region`` layers where required. - -Dumping a YOLOv3 TensorFlow Model -+++++++++++++++++++++++++++++++++ - -To dump TensorFlow model out of `GitHub repository `__ (commit ed60b90), follow the instructions below: - -1. Clone the repository: - - .. code-block:: sh - - git clone https://github.com/mystic123/tensorflow-yolo-v3.git - cd tensorflow-yolo-v3 - - -2. (Optional) Checkout to the commit that the conversion was tested on: - - .. code-block:: sh - - git checkout ed60b90 - - -3. Download `coco.names `__ file from the DarkNet website **OR** use labels that fit your task. -4. Download the `yolov3.weights `__ (for the YOLOv3 model) or `yolov3-tiny.weights `__ (for the YOLOv3-tiny model) file **OR** use your pre-trained weights with the same structure. -5. Install PIL, which is used by the conversion script in the repo: - - .. code-block:: sh - - pip install pillow - - -6. Run a converter: - - .. note:: This converter works with TensorFlow 1.x and numpy 1.19 or lower. - - - - For YOLO-v3: - - .. code-block:: sh - - python3 convert_weights_pb.py --class_names coco.names --data_format NHWC --weights_file yolov3.weights - - - - For YOLOv3-tiny: - - .. code-block:: sh - - python3 convert_weights_pb.py --class_names coco.names --data_format NHWC --weights_file yolov3-tiny.weights --tiny - - - At this step, you may receive a warning like ``WARNING:tensorflow:Entity <...> could not be transformed and will be executed as-is.``. To work around this issue, switch to gast 0.2.2 with the following command: - - .. code-block:: sh - - pip3 install --user gast==0.2.2 - - -If you have YOLOv3 weights trained for an input image with the size different from 416 (320, 608 or your own), provide the ``--size`` key with the size of your image specified while running the converter. For example, run the following command for an image with size 608: - -.. code-block:: sh - - python3 convert_weights_pb.py --class_names coco.names --data_format NHWC --weights_file yolov3_608.weights --size 608 - - -Converting a YOLOv3 TensorFlow Model to the OpenVINO format -+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ - -To solve the problems explained in the `YOLOv3 architecture overview <#overview-of-yolov3-model-architecture>`__ section, use the ``yolo_v3.json`` or ``yolo_v3_tiny.json`` (depending on a model) configuration file with custom operations located in the ``/tools/model_optimizer/extensions/front/tf`` repository. - -It consists of several attributes: - -.. code-block:: sh - - [ - { - "id": "TFYOLOV3", - "match_kind": "general", - "custom_attributes": { - "classes": 80, - "anchors": [10, 13, 16, 30, 33, 23, 30, 61, 62, 45, 59, 119, 116, 90, 156, 198, 373, 326], - "coords": 4, - "num": 9, - "masks":[[6, 7, 8], [3, 4, 5], [0, 1, 2]], - "entry_points": ["detector/yolo-v3/Reshape", "detector/yolo-v3/Reshape_4", "detector/yolo-v3/Reshape_8"] - } - } - ] - - -where: - -- ``id`` and ``match_kind`` are parameters that you cannot change. -- ``custom_attributes`` is a parameter that stores all the YOLOv3 specific attributes: - - - ``classes``, ``coords``, ``num``, and ``masks`` are attributes that you should copy from the configuration file that was used for model training. If you used DarkNet officially shared weights, you can use ``yolov3.cfg`` or ``yolov3-tiny.cfg`` configuration file from `GitHub repository `__. 
Replace the default values in ``custom_attributes`` with the parameters that follow the ``[yolo]`` titles in the configuration file. - - ``anchors`` is an optional parameter that is not used while inference of the model, but it used in a demo to parse ``Region`` layer output - - ``entry_points`` is a node name list to cut off the model and append the ``Region`` layer with custom attributes specified above. - - -To generate an IR of the YOLOv3 TensorFlow model, run: - -.. code-block:: sh - - mo \ - --input_model /path/to/yolo_v3.pb \ - --transformations_config front/tf/yolo_v3.json \ - --batch 1 \ - --output_dir - - -To generate an IR of the YOLOv3-tiny TensorFlow model, run: - -.. code-block:: sh - - mo \ - --input_model /path/to/yolo_v3_tiny.pb \ - --transformations_config front/tf/yolo_v3_tiny.json \ - --batch 1 \ - --output_dir - - -where: - -* ``batch`` defines shape of model input. In the example, ``batch`` is equal to 1, but you can also specify other integers larger than 1. -* ``transformations_config`` adds missing ``Region`` layers to the model. In the IR, the ``Region`` layer has name ``RegionYolo``. - -.. note:: - - The color channel order (RGB or BGR) of an input data should match the channel order of the model training dataset. If they are different, perform the ``RGB<->BGR`` conversion specifying the command-line parameter: ``reverse_input_channels``. Otherwise, inference results may be incorrect. For more information about the parameter, refer to the **When to Reverse Input Channels** section of the :doc:`Converting a Model to Intermediate Representation (IR) <../../[legacy]-setting-input-shapes>` guide. - - -OpenVINO toolkit provides a demo that uses YOLOv3 model. Refer to the `Object Detection C++ Demo `__ for more information. - -Converting YOLOv1 and YOLOv2 Models to the IR -############################################# - -Before converting, choose a YOLOv1 or YOLOv2 model version that best suits your task. Download model configuration file and corresponding weight file: - -* From `DarkFlow repository `__ : configuration files are stored in the ``cfg`` directory, links to weight files are given in the ``README.md`` file. The files from this repository are adapted for conversion to TensorFlow using DarkFlow. -* From DarkNet website and repository: configuration files are stored in the ``cfg`` directory of the `repository `__, links to weight files are given on the `YOLOv1 `__ and `YOLOv2 `__ websites. - -To convert DarkNet YOLOv1 and YOLOv2 models to the OpenVINO format, follow these steps: - -1. `Install DarkFlow <#installing-darkflow>`__ -2. `Convert DarkNet YOLOv1 or YOLOv2 model to TensorFlow <#converting-a-darknet-yolov1-or-yolov2-model-to-tensorflow>`__ using DarkFlow -3. `Convert TensorFlow YOLOv1 or YOLOv2 model to IR <#converting-a-tensorflow-yolov1-or-yolov2-model-to-the-ir>`__ - - -Installing DarkFlow -+++++++++++++++++++++ - -You need DarkFlow to convert YOLOv1 and YOLOv2 models to TensorFlow. To install DarkFlow: - -1. Install DarkFlow `required dependencies `__. -2. Clone DarkFlow git repository: - - .. code-block:: sh - - git clone https://github.com/thtrieu/darkflow.git - - -3. Go to the root directory of the cloned repository: - - .. code-block:: sh - - cd darkflow - - -4. Install DarkFlow, using the instructions from the ``README.md`` file in the `DarkFlow repository `__. 
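For step 4, the exact installation commands live in the DarkFlow ``README.md`` and may change over time. As an illustration only, a local development install run from the repository root (the directory you entered in step 3) usually looks like this:

.. code-block:: sh

   # build the Cython extensions and install DarkFlow in editable (development) mode
   pip3 install -e .

A plain ``pip3 install .`` from the same directory is another common option; check the README for the variant recommended for your setup.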

Converting a DarkNet YOLOv1 or YOLOv2 Model to TensorFlow
+++++++++++++++++++++++++++++++++++++++++++++++++++++++++

To convert a YOLOv1 or YOLOv2 model to TensorFlow, go to the root directory of the cloned DarkFlow repository, place the previously downloaded \*.cfg and \*.weights files in the current directory, and run the following command:

- For YOLOv1:

  .. code-block:: sh

     python3 flow --model yolov1.cfg --load yolov1.weights --savepb


- For YOLOv2 with the VOC dataset, the ``--labels`` argument must be specified and an additional change to the original exporting script is required. In the `file `__, change line 121 from ``self.offset = 16`` to ``self.offset = 20``. Then run:

  .. code-block:: sh

     python3 flow --model yolov2-voc.cfg --load yolov2-voc.weights --labels voc-labels.txt --savepb


VOC labels can be found via the following `link `__.

The general conversion command is:

.. code-block:: sh

   python3 flow --model /.cfg --load /.weights --labels --savepb


For YOLOv1, the ``--labels`` argument can be skipped. If the model was converted successfully, you can find the ``.meta`` and ``.pb`` files in the ``built_graph`` subdirectory of the cloned DarkFlow repository.

The ``.pb`` file is a TensorFlow representation of the YOLO model.

Converting a TensorFlow YOLOv1 or YOLOv2 Model to the IR
++++++++++++++++++++++++++++++++++++++++++++++++++++++++

The converted TensorFlow YOLO model is missing the ``Region`` layer and its parameters. The original YOLO ``Region`` layer parameters are stored in the configuration ``/.cfg`` file under the ``[region]`` title.

To recreate the original model structure, use the corresponding yolo ``.json`` configuration file with custom operations and ``Region`` layer parameters when converting the model to the IR. This file is located in the ``/tools/model_optimizer/extensions/front/tf`` directory.

If the chosen model has specific values of these parameters, create another configuration file with custom operations and use it for conversion.

To generate the IR of the YOLOv1 or YOLOv2 model, provide the TensorFlow model to model conversion API with the following parameters:

.. code-block:: sh

   mo \
      --input_model /.pb \
      --batch 1 \
      --scale 255 \
      --transformations_config front/tf/.json


where:

* ``batch`` defines the shape of the model input. In the example, ``batch`` is equal to 1, but you can also specify other integers larger than 1.
* ``scale`` specifies the scale factor by which input values will be divided. The model was trained with input values in the range ``[0,1]``. OpenVINO toolkit samples read input images as values in the ``[0,255]`` range, so the scale 255 must be applied.
* ``transformations_config`` adds missing ``Region`` layers to the model. In the IR, the ``Region`` layer has the name ``RegionYolo``. For other applicable parameters, refer to the :doc:`Convert Model from TensorFlow <../[legacy]-convert-tensorflow>` guide.

.. note::

   The color channel order (RGB or BGR) of the input data should match the channel order of the model training dataset. If they are different, perform the ``RGB<->BGR`` conversion by specifying the command-line parameter ``reverse_input_channels``. Otherwise, inference results may be incorrect. For more information about the parameter, refer to the **When to Reverse Input Channels** section of the :doc:`Converting a Model to Intermediate Representation (IR) <../../[legacy]-setting-input-shapes>` guide.
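As a purely illustrative sketch (the graph file ``built_graph/yolov2-voc.pb`` and the configuration file ``yolo_v2_voc.json`` are assumed names here; substitute the files that match your model), a complete conversion command for a DarkFlow-exported YOLOv2 VOC model that also applies the channel-order fix could look like this:

.. code-block:: sh

   # convert the DarkFlow-exported graph and swap RGB<->BGR at the model input
   mo \
      --input_model built_graph/yolov2-voc.pb \
      --batch 1 \
      --scale 255 \
      --reverse_input_channels \
      --transformations_config front/tf/yolo_v2_voc.json

The ``--reverse_input_channels`` flag is only needed when the channel order of your input pipeline differs from the one used during training, as described in the note above.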
- - diff --git a/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-conversion-api/[legacy]-supported-model-formats/[legacy]-convert-onnx.rst b/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-conversion-api/[legacy]-supported-model-formats/[legacy]-convert-onnx.rst deleted file mode 100644 index a864a037d488b7..00000000000000 --- a/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-conversion-api/[legacy]-supported-model-formats/[legacy]-convert-onnx.rst +++ /dev/null @@ -1,70 +0,0 @@ -[LEGACY] Converting an ONNX Model -============================================= - -.. meta:: - :description: Learn how to convert a model from the - ONNX format to the OpenVINO Intermediate Representation. - - -.. danger:: - - The code described here has been **deprecated!** Do not use it to avoid working with a legacy solution. It will be kept for some time to ensure backwards compatibility, but **you should not use** it in contemporary applications. - - This guide describes a deprecated conversion method. The guide on the new and recommended method can be found in the :doc:`Converting an ONNX Model <../../../../../openvino-workflow/model-preparation/convert-model-onnx>` article. - - -.. note:: ONNX models are supported via FrontEnd API. You may skip conversion to IR and read models directly by OpenVINO runtime API. Refer to the :doc:`inference example <../../../../../openvino-workflow/running-inference/integrate-openvino-with-your-application>` for more details. Using ``convert_model`` is still necessary in more complex cases, such as new custom inputs/outputs in model pruning, adding pre-processing, or using Python conversion extensions. - -Converting an ONNX Model -######################## - -The model conversion process assumes you have an ONNX model that was directly downloaded from a public repository or converted from any framework that supports exporting to the ONNX format. - -.. tab-set:: - - .. tab-item:: Python - :sync: py - - To convert an ONNX model, run ``convert_model()`` method with the path to the ``.onnx`` file: - - .. code-block:: py - :force: - - import openvino - from openvino.tools.mo import convert_model - - core = openvino.Core() - ov_model = convert_model(".onnx") - compiled_model = core.compile_model(ov_model, "AUTO") - - .. important:: - - The ``convert_model()`` method returns ``ov.Model`` that you can optimize, compile, or save to a file for subsequent use. - - .. tab-item:: CLI - :sync: cli - - You can use ``mo`` command-line tool to convert a model to IR. The obtained IR can then be read by ``read_model()`` and inferred. - - .. code-block:: sh - - mo --input_model .onnx - - -There are no ONNX-specific parameters, so only framework-agnostic parameters are available to convert your model. For details, see the *General Conversion Parameters* section in the :doc:`Converting a Model to Intermediate Representation (IR) <../[legacy]-setting-input-shapes>` guide. - -Supported ONNX Layers -##################### - -For the list of supported standard layers, refer to the :doc:`Supported Operations <../../../../../about-openvino/compatibility-and-support/supported-operations>` page. - -Additional Resources -#################### - -See the :doc:`Model Conversion Tutorials <[legacy]-conversion-tutorials>` page for a set of tutorials providing step-by-step instructions for converting specific ONNX models. 
Here are some examples: - -* :doc:`Convert ONNX Faster R-CNN Model <[legacy]-conversion-tutorials/convert-onnx-faster-r-cnn>` -* :doc:`Convert ONNX GPT-2 Model <[legacy]-conversion-tutorials/convert-onnx-gpt-2>` -* :doc:`Convert ONNX Mask R-CNN Model <[legacy]-conversion-tutorials/convert-onnx-mask-r-cnn>` - - diff --git a/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-conversion-api/[legacy]-supported-model-formats/[legacy]-convert-paddle.rst b/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-conversion-api/[legacy]-supported-model-formats/[legacy]-convert-paddle.rst deleted file mode 100644 index 041a14f93547b6..00000000000000 --- a/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-conversion-api/[legacy]-supported-model-formats/[legacy]-convert-paddle.rst +++ /dev/null @@ -1,139 +0,0 @@ -[LEGACY] Converting a PaddlePaddle Model -====================================================== - - -.. meta:: - :description: Learn how to convert a model from the - PaddlePaddle format to the OpenVINO Intermediate Representation. - - -.. danger:: - - The code described here has been **deprecated!** Do not use it to avoid working with a legacy solution. It will be kept for some time to ensure backwards compatibility, but **you should not use** it in contemporary applications. - - This guide describes a deprecated conversion method. The guide on the new and recommended method can be found in the :doc:`Converting a PaddlePaddle Model <../../../../../openvino-workflow/model-preparation/convert-model-paddle>` article. - - -This page provides general instructions on how to convert a model from a PaddlePaddle format to the OpenVINO IR format using Model Optimizer. The instructions are different depending on PaddlePaddle model format. - -.. note:: PaddlePaddle models are supported via FrontEnd API. You may skip conversion to IR and read models directly by OpenVINO runtime API. Refer to the :doc:`inference example <../../../../../openvino-workflow/running-inference/integrate-openvino-with-your-application>` for more details. Using ``convert_model`` is still necessary in more complex cases, such as new custom inputs/outputs in model pruning, adding pre-processing, or using Python conversion extensions. - -Converting PaddlePaddle Model Inference Format -############################################## - -PaddlePaddle inference model includes ``.pdmodel`` (storing model structure) and ``.pdiparams`` (storing model weight). For how to export PaddlePaddle inference model, please refer to the `Exporting PaddlePaddle Inference Model `__ Chinese guide. - - -To convert a PaddlePaddle model, use the ``mo`` script and specify the path to the input ``.pdmodel`` model file: - -.. code-block:: sh - - mo --input_model .pdmodel - -**For example**, this command converts a yolo v3 PaddlePaddle network to OpenVINO IR network: - -.. 
code-block:: sh - - mo --input_model=yolov3.pdmodel --input=image,im_shape,scale_factor --input_shape=[1,3,608,608],[1,2],[1,2] --reverse_input_channels --output=save_infer_model/scale_0.tmp_1,save_infer_model/scale_1.tmp_1 - -Converting PaddlePaddle Model From Memory Using Python API -########################################################## - -Model conversion API supports passing the following PaddlePaddle models directly from memory: - -* ``paddle.hapi.model.Model`` -* ``paddle.fluid.dygraph.layers.Layer`` -* ``paddle.fluid.executor.Executor`` - -When you convert certain PaddlePaddle models, you may need to set the ``example_input`` or ``example_output`` parameters first. Below you will find examples that show how to convert aforementioned model formats using the parameters. - -* ``paddle.hapi.model.Model`` - - .. code-block:: py - :force: - - import paddle - from openvino.tools.mo import convert_model - - # create a paddle.hapi.model.Model format model - resnet50 = paddle.vision.models.resnet50() - x = paddle.static.InputSpec([1,3,224,224], 'float32', 'x') - y = paddle.static.InputSpec([1,1000], 'float32', 'y') - - model = paddle.Model(resnet50, x, y) - - # convert to OpenVINO IR format - ov_model = convert_model(model) - - # optional: serialize OpenVINO IR to *.xml & *.bin - from openvino.runtime import serialize - serialize(ov_model, "ov_model.xml", "ov_model.bin") - -* ``paddle.fluid.dygraph.layers.Layer`` - - ``example_input`` is required while ``example_output`` is optional, and accept the following formats: - - ``list`` with tensor(``paddle.Tensor``) or InputSpec(``paddle.static.input.InputSpec``) - - .. code-block:: py - :force: - - import paddle - from openvino.tools.mo import convert_model - - # create a paddle.fluid.dygraph.layers.Layer format model - model = paddle.vision.models.resnet50() - x = paddle.rand([1,3,224,224]) - - # convert to OpenVINO IR format - ov_model = convert_model(model, example_input=[x]) - -* ``paddle.fluid.executor.Executor`` - - ``example_input`` and ``example_output`` are required, and accept the following formats: - - ``list`` or ``tuple`` with variable(``paddle.static.data``) - - .. code-block:: py - :force: - - import paddle - from openvino.tools.mo import convert_model - - paddle.enable_static() - - # create a paddle.fluid.executor.Executor format model - x = paddle.static.data(name="x", shape=[1,3,224]) - y = paddle.static.data(name="y", shape=[1,3,224]) - relu = paddle.nn.ReLU() - sigmoid = paddle.nn.Sigmoid() - y = sigmoid(relu(x)) - - exe = paddle.static.Executor(paddle.CPUPlace()) - exe.run(paddle.static.default_startup_program()) - - # convert to OpenVINO IR format - ov_model = convert_model(exe, example_input=[x], example_output=[y]) - - -.. important:: - - The ``convert_model()`` method returns ``ov.Model`` that you can optimize, compile, or save to a file for subsequent use. - - -Supported PaddlePaddle Layers -############################# - -For the list of supported standard layers, refer to the :doc:`Supported Operations <../../../../../about-openvino/compatibility-and-support/supported-operations>` page. - -Frequently Asked Questions (FAQ) -################################ - -The model conversion API displays explanatory messages for typographical errors, incorrectly used options, or other issues. They describe the potential cause of the problem and give a link to the :doc:`Model Optimizer FAQ <../[legacy]-model-optimizer-faq>`, which provides instructions on how to resolve most issues. 
The FAQ also includes links to relevant sections in :doc:`Convert a Model <../../legacy-conversion-api>` to help you understand what went wrong. - -Additional Resources -#################### - -See the :doc:`Model Conversion Tutorials <[legacy]-conversion-tutorials>` page for a set of tutorials providing step-by-step instructions for converting specific PaddlePaddle models. - - diff --git a/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-conversion-api/[legacy]-supported-model-formats/[legacy]-convert-pytorch.rst b/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-conversion-api/[legacy]-supported-model-formats/[legacy]-convert-pytorch.rst deleted file mode 100644 index 2ab66a49cd3546..00000000000000 --- a/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-conversion-api/[legacy]-supported-model-formats/[legacy]-convert-pytorch.rst +++ /dev/null @@ -1,111 +0,0 @@ -[LEGACY] Converting a PyTorch Model -============================================ - - -.. meta:: - :description: Learn how to convert a model from the - PyTorch format to the OpenVINO Intermediate Representation. - - -.. danger:: - - The code described here has been **deprecated!** Do not use it to avoid working with a legacy solution. It will be kept for some time to ensure backwards compatibility, but **you should not use** it in contemporary applications. - - This guide describes a deprecated conversion method. The guide on the new and recommended method can be found in the :doc:`Converting a PyTorch Model <../../../../../openvino-workflow/model-preparation/convert-model-pytorch>` article. - -This page provides instructions on how to convert a model from the PyTorch format to the OpenVINO IR format. - -The conversion is a required step to run inference using OpenVINO API. -It is not required if you choose to work with OpenVINO under the PyTorch framework, -using its :doc:`torch.compile feature <../../../../../openvino-workflow/torch-compile>`. - -Converting a PyTorch model with PyTorch Frontend -############################################################### - -To convert a PyTorch model to the OpenVINO IR format, use the OVC API (superseding the previously used tool, MO). To do so, use the ``convert_model()`` method, like so: - - -.. code-block:: py - :force: - - import torchvision - import torch - from openvino.tools.mo import convert_model - - model = torchvision.models.resnet50(weights='DEFAULT') - ov_model = convert_model(model) - -Following PyTorch model formats are supported: - -* ``torch.nn.Module`` -* ``torch.jit.ScriptModule`` -* ``torch.jit.ScriptFunction`` - -Converting certain PyTorch models may require model tracing, which needs the ``example_input`` -parameter to be set, for example: - -.. code-block:: py - :force: - - import torchvision - import torch - from openvino.tools.mo import convert_model - - model = torchvision.models.resnet50(weights='DEFAULT') - ov_model = convert_model(model, example_input=torch.randn(1, 3, 100, 100)) - -``example_input`` accepts the following formats: - -* ``openvino.runtime.Tensor`` -* ``torch.Tensor`` -* ``np.ndarray`` -* ``list`` or ``tuple`` with tensors (``openvino.runtime.Tensor`` / ``torch.Tensor`` / ``np.ndarray``) -* ``dictionary`` where key is the input name, value is the tensor (``openvino.runtime.Tensor`` / ``torch.Tensor`` / ``np.ndarray``) - -Sometimes ``convert_model`` will produce inputs of the model with dynamic rank or dynamic type. 
-Such model may not be supported by the hardware chosen for inference. To avoid this issue, -use the ``input`` argument of ``convert_model``. For more information, refer to :doc:`Convert Models Represented as Python Objects <../[legacy]-convert-models-as-python-objects>`. - -.. important:: - - The ``convert_model()`` method returns ``ov.Model`` that you can optimize, compile, or save to a file for subsequent use. - -Exporting a PyTorch Model to ONNX Format -######################################## - -It is also possible to export a PyTorch model to ONNX and then convert it to OpenVINO IR. To convert and deploy a PyTorch model this way, follow these steps: - -1. `Export a PyTorch model to ONNX <#exporting-a-pytorch-model-to-onnx-format>`__. -2. :doc:`Convert an ONNX model <[legacy]-convert-onnx>` to produce an optimized :doc:`Intermediate Representation <../../../../openvino-ir-format/operation-sets>` of the model based on the trained network topology, weights, and biases values. - -PyTorch models are defined in Python. To export them, use the ``torch.onnx.export()`` method. The code to -evaluate or test the model is usually provided with its code and can be used for its initialization and export. -The export to ONNX is crucial for this process, but it is covered by PyTorch framework, therefore, It will not be covered here in detail. -For more information, refer to the `Exporting PyTorch models to ONNX format `__ guide. - -To export a PyTorch model, you need to obtain the model as an instance of ``torch.nn.Module`` class and call the ``export`` function. - -.. code-block:: py - :force: - - import torch - - # Instantiate your model. This is just a regular PyTorch model that will be exported in the following steps. - model = SomeModel() - # Evaluate the model to switch some operations from training mode to inference. - model.eval() - # Create dummy input for the model. It will be used to run the model inside export function. - dummy_input = torch.randn(1, 3, 224, 224) - # Call the export function - torch.onnx.export(model, (dummy_input, ), 'model.onnx') - - -Additional Resources -#################### - -See the :doc:`Model Conversion Tutorials <[legacy]-conversion-tutorials>` page for a set of tutorials providing step-by-step instructions for converting specific PyTorch models. Here are some examples: - -* :doc:`Convert PyTorch BERT-NER Model <[legacy]-conversion-tutorials/convert-pytorch-bert-ner>` -* :doc:`Convert PyTorch RCAN Model <[legacy]-conversion-tutorials/convert-pytorch-rcan>` -* :doc:`Convert PyTorch YOLACT Model <[legacy]-conversion-tutorials/convert-pytorch-yolact>` - diff --git a/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-conversion-api/[legacy]-supported-model-formats/[legacy]-convert-tensorflow-lite.rst b/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-conversion-api/[legacy]-supported-model-formats/[legacy]-convert-tensorflow-lite.rst deleted file mode 100644 index 6d9256cdf09994..00000000000000 --- a/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-conversion-api/[legacy]-supported-model-formats/[legacy]-convert-tensorflow-lite.rst +++ /dev/null @@ -1,37 +0,0 @@ -[LEGACY] Converting a TensorFlow Lite Model -===================================================== - - -.. meta:: - :description: Learn how to convert a model from a - TensorFlow Lite format to the OpenVINO Intermediate Representation. - -.. 
danger:: - - The code described here has been **deprecated!** Do not use it to avoid working with a legacy solution. It will be kept for some time to ensure backwards compatibility, but **you should not use** it in contemporary applications. - - This guide describes a deprecated conversion method. The guide on the new and recommended method can be found in the :doc:`Converting a TensorFlow Lite Model <../../../../../openvino-workflow/model-preparation/convert-model-tensorflow-lite>` article. - -To convert a TensorFlow Lite model, use the ``mo`` script and specify the path to the input ``.tflite`` model file: - -.. code-block:: sh - - mo --input_model .tflite - -TensorFlow Lite models are supported via FrontEnd API. You may skip conversion to IR and read models directly by OpenVINO runtime API. Refer to the :doc:`inference example <../../../../../openvino-workflow/running-inference/integrate-openvino-with-your-application>` for more details. Using ``convert_model`` is still necessary in more complex cases, such as new custom inputs/outputs in model pruning, adding pre-processing, or using Python conversion extensions. - -.. important:: - - The ``convert_model()`` method returns ``ov.Model`` that you can optimize, compile, or save to a file for subsequent use. - -Supported TensorFlow Lite Layers -################################### - -For the list of supported standard layers, refer to the :doc:`Supported Operations <../../../../../about-openvino/compatibility-and-support/supported-operations>` page. - -Supported TensorFlow Lite Models -################################### - -More than eighty percent of public TensorFlow Lite models are supported from open sources `TensorFlow Hub `__ and `MediaPipe `__. -Unsupported models usually have custom TensorFlow Lite operations. - diff --git a/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-conversion-api/[legacy]-supported-model-formats/[legacy]-convert-tensorflow.rst b/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-conversion-api/[legacy]-supported-model-formats/[legacy]-convert-tensorflow.rst deleted file mode 100644 index 2bcb6fde9b833b..00000000000000 --- a/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-conversion-api/[legacy]-supported-model-formats/[legacy]-convert-tensorflow.rst +++ /dev/null @@ -1,359 +0,0 @@ -[LEGACY] Converting a TensorFlow Model -============================================ - -.. meta:: - :description: Learn how to convert a model from a - TensorFlow format to the OpenVINO Intermediate Representation. - -.. danger:: - - The code described here has been **deprecated!** Do not use it to avoid working with a legacy solution. It will be kept for some time to ensure backwards compatibility, but **you should not use** it in contemporary applications. - - This guide describes a deprecated conversion method. The guide on the new and recommended method can be found in the :doc:`Converting a TensorFlow Model <../../../../../openvino-workflow/model-preparation/convert-model-tensorflow>` article. - - -.. note:: TensorFlow models are supported via FrontEnd API. You may skip conversion to IR and read models directly by OpenVINO runtime API. Refer to the :doc:`inference example <../../../../../openvino-workflow/running-inference/integrate-openvino-with-your-application>` for more details. 
Using ``convert_model`` is still necessary in more complex cases, such as new custom inputs/outputs in model pruning, adding pre-processing, or using Python conversion extensions. - -The conversion instructions are different depending on whether your model was created with TensorFlow v1.X or TensorFlow v2.X. - -Converting TensorFlow 1 Models -############################### - -Converting Frozen Model Format -+++++++++++++++++++++++++++++++ - -To convert a TensorFlow model, use the ``*mo*`` script to simply convert a model with a path to the input model *.pb* file: - -.. code-block:: sh - - mo --input_model .pb - - -Converting Non-Frozen Model Formats -+++++++++++++++++++++++++++++++++++ - -There are three ways to store non-frozen TensorFlow models and convert them by model conversion API: - -1. **Checkpoint**. In this case, a model consists of two files: ``inference_graph.pb`` (or ``inference_graph.pbtxt``) and ``checkpoint_file.ckpt``. -If you do not have an inference graph file, refer to the `Freezing Custom Models in Python <#freezing-custom-models-in-python>`__ section. -To convert the model with the inference graph in ``.pb`` format, run the `mo` script with a path to the checkpoint file: - -.. code-block:: sh - - mo --input_model .pb --input_checkpoint - -To convert the model with the inference graph in ``.pbtxt`` format, run the ``mo`` script with a path to the checkpoint file: - -.. code-block:: sh - - mo --input_model .pbtxt --input_checkpoint --input_model_is_text - - -2. **MetaGraph**. In this case, a model consists of three or four files stored in the same directory: ``model_name.meta``, ``model_name.index``, -``model_name.data-00000-of-00001`` (the numbers may vary), and ``checkpoint`` (optional). -To convert such TensorFlow model, run the `mo` script with a path to the MetaGraph ``.meta`` file: - -.. code-block:: sh - - mo --input_meta_graph .meta - - -3. **SavedModel format**. In this case, a model consists of a special directory with a ``.pb`` file -and several subfolders: ``variables``, ``assets``, and ``assets.extra``. For more information about the SavedModel directory, refer to the `README `__ file in the TensorFlow repository. -To convert such TensorFlow model, run the ``mo`` script with a path to the SavedModel directory: - -.. code-block:: sh - - mo --saved_model_dir - - -You can convert TensorFlow 1.x SavedModel format in the environment that has a 1.x or 2.x version of TensorFlow. However, TensorFlow 2.x SavedModel format strictly requires the 2.x version of TensorFlow. -If a model contains operations currently unsupported by OpenVINO, prune these operations by explicit specification of input nodes using the ``--input`` option. -To determine custom input nodes, display a graph of the model in TensorBoard. To generate TensorBoard logs of the graph, use the ``--tensorboard_logs`` option. -TensorFlow 2.x SavedModel format has a specific graph due to eager execution. In case of pruning, find custom input nodes in the ``StatefulPartitionedCall/*`` subgraph of TensorFlow 2.x SavedModel format. - -Freezing Custom Models in Python -++++++++++++++++++++++++++++++++ - -When a network is defined in Python code, you have to create an inference graph file. Graphs are usually built in a form -that allows model training. That means all trainable parameters are represented as variables in the graph. -To be able to use such graph with model conversion API, it should be frozen and dumped to a file with the following code: - -.. 
code-block:: py - :force: - - import tensorflow as tf - from tensorflow.python.framework import graph_io - frozen = tf.compat.v1.graph_util.convert_variables_to_constants(sess, sess.graph_def, ["name_of_the_output_node"]) - graph_io.write_graph(frozen, './', 'inference_graph.pb', as_text=False) - -Where: - -* ``sess`` is the instance of the TensorFlow Session object where the network topology is defined. -* ``["name_of_the_output_node"]`` is the list of output node names in the graph; ``frozen`` graph will include only those nodes from the original ``sess.graph_def`` that are directly or indirectly used to compute given output nodes. The ``'name_of_the_output_node'`` is an example of a possible output node name. You should derive the names based on your own graph. -* ``./`` is the directory where the inference graph file should be generated. -* ``inference_graph.pb`` is the name of the generated inference graph file. -* ``as_text`` specifies whether the generated file should be in human readable text format or binary. - -Converting TensorFlow 2 Models -############################### - -To convert TensorFlow 2 models, ensure that `openvino-dev[tensorflow2]` is installed via `pip`. -TensorFlow 2.X officially supports two model formats: SavedModel and Keras H5 (or HDF5). -Below are the instructions on how to convert each of them. - -SavedModel Format -+++++++++++++++++ - -A model in the SavedModel format consists of a directory with a ``saved_model.pb`` file and two subfolders: ``variables`` and ``assets``. -To convert such a model, run the `mo` script with a path to the SavedModel directory: - -.. code-block:: sh - - mo --saved_model_dir - -TensorFlow 2 SavedModel format strictly requires the 2.x version of TensorFlow installed in the -environment for conversion to the Intermediate Representation (IR). - -If a model contains operations currently unsupported by OpenVINO™, -prune these operations by explicit specification of input nodes using the ``--input`` or ``--output`` -options. To determine custom input nodes, visualize a model graph in the TensorBoard. - -TensorFlow 2 SavedModel format has a specific graph structure due to eager execution. In case of -pruning, find custom input nodes in the ``StatefulPartitionedCall/*`` subgraph. - -Since the 2023.0 release, direct pruning of models in SavedModel format is not supported. -It is essential to freeze the model before pruning. Use the following code snippet for model freezing: - -.. code-block:: py - :force: - - import tensorflow as tf - from tensorflow.python.framework.convert_to_constants import convert_variables_to_constants_v2 - saved_model_dir = "./saved_model" - imported = tf.saved_model.load(saved_model_dir) - # retrieve the concrete function and freeze - concrete_func = imported.signatures[tf.saved_model.DEFAULT_SERVING_SIGNATURE_DEF_KEY] - frozen_func = convert_variables_to_constants_v2(concrete_func, - lower_control_flow=False, - aggressive_inlining=True) - # retrieve GraphDef and save it into .pb format - graph_def = frozen_func.graph.as_graph_def(add_shapes=True) - tf.io.write_graph(graph_def, '.', 'model.pb', as_text=False) - -Keras H5 -++++++++ - -If you have a model in HDF5 format, load the model using TensorFlow 2 and serialize it to -SavedModel format. Here is an example of how to do it: - -.. 
code-block:: py - :force: - - import tensorflow as tf - model = tf.keras.models.load_model('model.h5') - tf.saved_model.save(model,'model') - - -The Keras H5 model with a custom layer has specifics to be converted into SavedModel format. -For example, the model with a custom layer ``CustomLayer`` from ``custom_layer.py`` is converted as follows: - -.. code-block:: py - :force: - - import tensorflow as tf - from custom_layer import CustomLayer - model = tf.keras.models.load_model('model.h5', custom_objects={'CustomLayer': CustomLayer}) - tf.saved_model.save(model,'model') - - -Then follow the above instructions for the SavedModel format. - -.. note:: - - Do not use other hacks to resave TensorFlow 2 models into TensorFlow 1 formats. - -Command-Line Interface (CLI) Examples Using TensorFlow-Specific Parameters -########################################################################## - -* Launching model conversion for Inception V1 frozen model when model file is a plain text protobuf: - - .. code-block:: sh - - mo --input_model inception_v1.pbtxt --input_model_is_text -b 1 - - -* Launching model conversion for Inception V1 frozen model and dump information about the graph to TensorBoard log dir ``/tmp/log_dir`` - - .. code-block:: sh - - mo --input_model inception_v1.pb -b 1 --tensorboard_logdir /tmp/log_dir - - -* Launching model conversion for BERT model in the SavedModel format, with three inputs. Specify explicitly the input shapes where the batch size and the sequence length equal 2 and 30 respectively. - - .. code-block:: sh - - mo --saved_model_dir BERT --input mask,word_ids,type_ids --input_shape [2,30],[2,30],[2,30] - -Conversion of TensorFlow models from memory using Python API -############################################################ - -Model conversion API supports passing TensorFlow/TensorFlow2 models directly from memory. - -* ``tf.keras.Model`` - - .. code-block:: py - :force: - - import tensorflow as tf - from openvino.tools.mo import convert_model - - model = tf.keras.applications.ResNet50(weights="imagenet") - ov_model = convert_model(model) - - -* ``tf.keras.layers.Layer``. Requires setting the "input_shape". - - .. code-block:: py - :force: - - import tensorflow_hub as hub - from openvino.tools.mo import convert_model - - model = hub.KerasLayer("https://tfhub.dev/google/imagenet/mobilenet_v1_100_224/classification/5") - ov_model = convert_model(model, input_shape=[-1, 224, 224, 3]) - -* ``tf.Module``. Requires setting the "input_shape". - - .. code-block:: py - :force: - - import tensorflow as tf - from openvino.tools.mo import convert_model - - class MyModule(tf.Module): - def __init__(self, name=None): - super().__init__(name=name) - self.variable1 = tf.Variable(5.0, name="var1") - self.variable2 = tf.Variable(1.0, name="var2") - def __call__(self, x): - return self.variable1 * x + self.variable2 - - model = MyModule(name="simple_module") - ov_model = convert_model(model, input_shape=[-1]) - -* ``tf.compat.v1.Graph`` - - .. code-block:: py - :force: - - import tensorflow as tf - from openvino.tools.mo import convert_model - - with tf.compat.v1.Session() as sess: - inp1 = tf.compat.v1.placeholder(tf.float32, [100], 'Input1') - inp2 = tf.compat.v1.placeholder(tf.float32, [100], 'Input2') - output = tf.nn.relu(inp1 + inp2, name='Relu') - tf.compat.v1.global_variables_initializer() - model = sess.graph - - ov_model = convert_model(model) - -* ``tf.compat.v1.GraphDef`` - - .. 
code-block:: py - :force: - - import tensorflow as tf - from openvino.tools.mo import convert_model - - with tf.compat.v1.Session() as sess: - inp1 = tf.compat.v1.placeholder(tf.float32, [100], 'Input1') - inp2 = tf.compat.v1.placeholder(tf.float32, [100], 'Input2') - output = tf.nn.relu(inp1 + inp2, name='Relu') - tf.compat.v1.global_variables_initializer() - model = sess.graph_def - - ov_model = convert_model(model) - -* ``tf.function`` - - .. code-block:: py - :force: - - import tensorflow as tf - from openvino.tools.mo import convert_model - - @tf.function( - input_signature=[tf.TensorSpec(shape=[1, 2, 3], dtype=tf.float32), - tf.TensorSpec(shape=[1, 2, 3], dtype=tf.float32)]) - def func(x, y): - return tf.nn.sigmoid(tf.nn.relu(x + y)) - - ov_model = convert_model(func) - -* ``tf.compat.v1.session`` - - .. code-block:: py - :force: - - import tensorflow as tf - from openvino.tools.mo import convert_model - - with tf.compat.v1.Session() as sess: - inp1 = tf.compat.v1.placeholder(tf.float32, [100], 'Input1') - inp2 = tf.compat.v1.placeholder(tf.float32, [100], 'Input2') - output = tf.nn.relu(inp1 + inp2, name='Relu') - tf.compat.v1.global_variables_initializer() - - ov_model = convert_model(sess) - -* ``tf.train.checkpoint`` - - .. code-block:: py - :force: - - import tensorflow as tf - from openvino.tools.mo import convert_model - - model = tf.keras.Model(...) - checkpoint = tf.train.Checkpoint(model) - save_path = checkpoint.save(save_directory) - # ... - checkpoint.restore(save_path) - ov_model = convert_model(checkpoint) - -.. important:: - - The ``convert_model()`` method returns ``ov.Model`` that you can optimize, compile, or save to a file for subsequent use. - -Supported TensorFlow and TensorFlow 2 Keras Layers -################################################## - -For the list of supported standard layers, refer to the :doc:`Supported Operations <../../../../../about-openvino/compatibility-and-support/supported-operations>` page. - -Frequently Asked Questions (FAQ) -################################ - -The model conversion API provides explanatory messages if it is unable to run to completion due to typographical errors, incorrectly used options, or other issues. The message describes the potential cause of the problem and gives a link to the :doc:`Model Optimizer FAQ <../[legacy]-model-optimizer-faq>`. The FAQ provides instructions on how to resolve most issues. The FAQ also includes links to relevant sections in :doc:`Convert a Model <../../legacy-conversion-api>` to help you understand what went wrong. - -Summary -####### - -In this document, you learned: - -* Basic information about how the model conversion API works with TensorFlow models. -* Which TensorFlow models are supported. -* How to freeze a TensorFlow model. -* How to convert a trained TensorFlow model using model conversion API with both framework-agnostic and TensorFlow-specific command-line parameters. - -Additional Resources -#################### - -See the :doc:`Model Conversion Tutorials <[legacy]-conversion-tutorials>` page for a set of tutorials providing step-by-step instructions for converting specific TensorFlow models. 
Here are some examples: - -* :doc:`Convert TensorFlow EfficientDet Models <[legacy]-conversion-tutorials/convert-tensorflow-efficient-det>` -* :doc:`Convert TensorFlow FaceNet Models <[legacy]-conversion-tutorials/convert-tensorflow-face-net>` -* :doc:`Convert TensorFlow Object Detection API Models <[legacy]-conversion-tutorials/convert-tensorflow-object-detection>` - diff --git a/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-conversion-api/[legacy]-troubleshooting-reshape-errors.rst b/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-conversion-api/[legacy]-troubleshooting-reshape-errors.rst deleted file mode 100644 index 4d5c282a947d1b..00000000000000 --- a/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-conversion-api/[legacy]-troubleshooting-reshape-errors.rst +++ /dev/null @@ -1,54 +0,0 @@ -[LEGACY] Troubleshooting Reshape Errors -======================================= - - -.. meta:: - :description: In OpenVINO™, you can use several methods to address the issues - of non-reshape-able models and shape collision, which prevent - normal shape propagation. - - -.. danger:: - - The code described here has been **deprecated!** Do not use it to avoid working with a legacy solution. It will be kept for some time to ensure backwards compatibility, but **you should not use** it in contemporary applications. - -How To Avoid Shape Collision -############################ - -Operation semantics may impose restrictions on input shapes of the operation. -Shape collision during shape propagation may be a sign that new shape does not satisfy the restrictions. -Changing the model input shape may result in intermediate operations shape collision. For example, in the following: - -* The :doc:`Reshape <../../../openvino-ir-format/operation-sets/operation-specs/shape/reshape-1>` operation with a hard-coded output shape value, -* The :doc:`MatMul <../../../openvino-ir-format/operation-sets/operation-specs/matrix/matmul-1>` operation with the ``Const`` second input and this input cannot be resized by spatial dimensions due to operation semantics. - -Model structure and logic should not change significantly after model reshaping. - -* The Global Pooling operation is commonly used to reduce output feature map of classification models output. Having the input of the shape *[N, C, H, W]*, Global Pooling returns the output of the shape *[N, C, 1, 1]*. Model architects usually express Global Pooling with the help of the ``Pooling`` operation with the fixed kernel size *[H, W]*. During spatial reshape, having the input of the shape *[N, C, H1, W1]*, ``Pooling`` with the fixed kernel size *[H, W]* returns the output of the shape *[N, C, H2, W2]*, where *H2* and *W2* are commonly not equal to *1*. It breaks the classification model structure. For example, the public `Inception family models from TensorFlow `__ have this issue. - -* Changing the model input shape may significantly affect its accuracy. For example, Object Detection models from TensorFlow have resizing restrictions by design. To keep the model valid after the reshape, choose a new input shape that satisfies conditions listed in the ``pipeline.config`` file. - -.. _how-to-fix-non-reshape-able-model: - -How To Fix Non-Reshape-able Model -################################# - -To fix some operators which prevent normal shape propagation: - -* see if the issue can be fixed via changing the values of some operators' input. 
For example, the most common problem of non-reshape-able models is a ``Reshape`` operator with a hard-coded output shape. You can cut-off the hard-coded second input of ``Reshape`` and fill it in with relaxed values. For the following example in the diagram below, the model conversion API command line should read: - - .. code-block:: sh - - mo --input_model path/to/model --input data[8,3,224,224],1:reshaped[2]->[0,-1]` - - - With ``1:reshaped[2]``, it is required to cut the second input (counting from zero, so ``1:`` means the second input) of the operation named ``reshaped`` and replace it with a ``Parameter`` with shape ``[2]``. - With ``->[0 -1]``, this new ``Parameter`` is replaced by a ``Constant`` operator which has the ``[0, -1]`` value. - Since the ``Reshape`` operator has ``0`` and ``-1`` as specific values, it allows propagating shapes freely without losing the intended meaning of ``Reshape``. For more information, see :doc:`the specification <../../../openvino-ir-format/operation-sets/operation-specs/shape/reshape-1>`. - - .. image:: ../../../../assets/images/batch_relaxation.png - -* transform the model conversion on the back phase. For more information, see the :doc:`How to Convert a Model <../legacy-model-optimizer-extensibility>`, -* transform OpenVINO Model during the runtime. For more information, see :doc:`OpenVINO Runtime Transformations <../../../openvino-extensibility/transformation-api>`, -* modify the original model with the help of the original framework. - diff --git a/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-model-optimizer-extensibility.rst b/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-model-optimizer-extensibility.rst deleted file mode 100644 index 3d2365f45ffe3b..00000000000000 --- a/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-model-optimizer-extensibility.rst +++ /dev/null @@ -1,326 +0,0 @@ -Legacy Model Optimizer Extensibility -==================================== - - - -.. toctree:: - :maxdepth: 1 - :hidden: - - legacy-model-optimizer-extensibility/[legacy]-graph-traversal-and-modification - legacy-model-optimizer-extensibility/[legacy]-model-optimizer-extensions - legacy-model-optimizer-extensibility/[legacy]-extending-model-optimizer-with-caffe-python-layers - -.. danger:: - - The code described here has been **deprecated!** Do not use it to avoid working with a legacy solution. It will be kept for some time to ensure backwards compatibility, but **you should not use** it in contemporary applications. - - This guide describes a deprecated TensorFlow conversion method. The guide on the new and recommended method, using a new frontend, can be found in the :doc:`Frontend Extensions <../../openvino-extensibility/frontend-extensions>` article. - -This article describes Model Optimizer internals. Altering them may result in application instability, and in case of future changes to the API, lack of backward compatibility. - -.. note:: - If you want to add support for ONNX, TensorFlow Lite, PaddlePaddle or TensorFlow operations, or you are not familiar with other extension alternatives in OpenVINO, read :doc:`this guide <../../openvino-extensibility>` instead. - -.. _model-optimizer-extensibility: - -Model Optimizer extensibility mechanism enables support of new operations and custom transformations to generate the optimized intermediate representation (IR) as described :doc:`here <../../openvino-ir-format/operation-sets>`. 
-This mechanism is a core part of Model Optimizer, as a huge set of examples showing how to add custom logic to support your model. - -There are several cases when the customization is needed: - -* A model contains operation(s) not known for the Model Optimizer, but these operation(s) could be expressed as a combination of supported operations. In this case, a custom transformation should be implemented to replace unsupported operation(s) with supported ones. -* A model contains a sub-graph of operations that can be replaced with a smaller number of operations to get better performance. This example corresponds to so-called *fusing transformations* (e.g., replacing a sub-graph performing the calculation :math:`x/(1.0+e^{-(beta*x)})` with a single operation of type :doc:`Swish <../../openvino-ir-format/operation-sets/operation-specs/activation/swish-4>`. -* A model contains a custom framework operation (the operation that is not a part of an official operation set of the framework) that was developed using the framework extensibility mechanism. In this case, Model Optimizer should know how to handle the operation and generate a corresponding section in an IR for it. - -It is necessary to figure out how Model Optimizer represents a model in a memory and converts it to an IR before -going into details of the Model Optimizer extensibility mechanism. - -.. note:: - All paths in this article are provided relatively to the Model Optimizer installation directory if not stated otherwise. - -.. _mo_model_representation_in_memory: - -============================== -Model Representation in Memory -============================== - -The model can be represented as a directed graph, where nodes are operations and edges correspond to data passing from a -producer operation (node) to a consumer operation (node). - -Model Optimizer uses Python class ``mo.graph.graph.Graph`` instance to represent the computation graph in memory during -the model conversion. This class is inherited from the ``networkx.MultiDiGraph`` class of the standard ``networkx`` Python -library. It provides many convenient methods to traverse and modify the graph. Refer to the ``mo/graph/graph.py`` file for examples. - -Model Optimizer keeps all necessary information about the operation in node attributes. Model Optimizer uses the ``mo.graph.graph.Node`` class defined in the ``mo/graph/graph.py`` file, which is a wrapper on top of a ``networkx`` node attributes -dictionary, and provides many convenient methods to work with the node. For example, the node ``my_node`` attribute with a -name ``my_attr`` can be retrieved from the node with the following code ``my_node.my_attr``, which is equivalent to obtaining -attribute with name ``my_attr`` in the ``graph.node[my_node]`` dictionary. For the class implementation details, refer to the ``mo/graph/graph.py`` file. - -An operation may have several inputs and outputs. For example, operation :doc:`Split <../../openvino-ir-format/operation-sets/operation-specs/movement/split-1>` has -two inputs: data to split and axis to split along, and variable number of outputs depending on a value of attribute -``num_splits``. Each input data to the operation is passed to a specific operation **input port**. An operation produces -the output data from an **output port**. Input and output ports are numbered from 0 independently. 
Model Optimizer uses -classes ``mo.graph.port.Port`` and ``mo.graph.connection.Connection``, which are useful abstraction to perform graph -modifications like nodes connecting/re-connecting and graph traversing. These classes are widely used in the Model -Optimizer code so it is easy to find a lot of usage examples. - -There is no dedicated class corresponding to an edge, so low-level graph manipulation is needed to get access to -edge attributes if needed. Meanwhile, most manipulations with nodes connections should be done with help of the -``mo.graph.connection.Connection`` and ``mo.graph.port.Port`` classes. Thus, low-level graph manipulation is error prone and -is strongly not recommended. - -Further details and examples related to a model representation in memory are provided in the sections below, in a context -for a better explanation. For more information on how to use ports and connections, refer to the :doc:`Graph Traversal and Modification Using Ports and Connections ` article. - -.. _mo_model_conversion_pipeline: - -========================= -Model Conversion Pipeline -========================= - -A model conversion pipeline can be represented with the following diagram: - -.. image:: ../../../assets/images/MO_conversion_pipeline.svg - -Each conversion step is reviewed in details below. - -Model Loading -############# - -Model Optimizer gets a trained model file as an input. The model loader component of Model Optimizer reads a model file -using Python bindings provided with the framework and builds an in-memory representation of a computation graph. There -is a separate loader for each supported framework. These loaders are implemented in the -``extensions/load//loader.py`` files of Model Optimizer. - -.. note:: - Model Optimizer uses a special parser for Caffe models built on top of the ``caffe.proto`` file. In the case of a model loading failure, Model Optimizer throws an error and requests preparation of the parser that can read the model. For more information on how to prepare the custom Caffe parser, refer to the :ref:`question #1 ` in the :doc:`Model Optimizer FAQ `. - -The result of a model loading step is a ``Graph`` object, which can be depicted like in the following example: - -.. image:: ../../../assets/images/MO_graph_after_loader.svg - -Model Optimizer loader saves an operation instance framework description (usually it is a Protobuf message) into a node -attribute usually with a name ``pb`` for each operation of an input model. It is important that this is a -**framework-specific** description of an operation. This means that an operation (e.g. -:doc:`Convolution <../../openvino-ir-format/operation-sets/operation-specs/convolution/convolution-1>` may be represented differently in, for example, Caffe and -TensorFlow frameworks but performs the same calculations from a mathematical point of view. - -In the image above, the **Operation 2** has one input and two outputs. The tensor produced from the output **port 0** is -consumed with the **Operation 5** (the input **port 0**) and **Operation 3** (the input **port 1**). The tensor produced from the -output **port 1** is consumed with the **Operation 4** (the input **port 0**). - -Each edge has two attributes: ``in`` and ``out``. They contain the input port number of the consumer node and the output port -number of the producer node. These attributes describe the fact that nodes are operations consuming some input tensors -and producing some output tensors. 
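To make the ``in``/``out`` edge attributes concrete, the following low-level sketch dumps them for a freshly loaded (front-phase) graph. It uses the raw ``networkx`` API only for inspection, which, as noted above, is discouraged for actual transformations; the node id is hypothetical and ``graph`` is assumed to be an already loaded ``Graph`` instance:

.. code-block:: py
   :force:

   # 'op2' is a hypothetical operation node id; data=True yields the edge attribute dictionaries
   for producer, consumer, attrs in graph.out_edges('op2', data=True):
       print(f"{producer} (out port {attrs['out']}) -> {consumer} (in port {attrs['in']})")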
From the perspective of Model Optimizer, nodes themselves are **black boxes** because -they do not contain required information about the operation they perform. - -Operations Attributes Extracting -################################ - -The next step is to parse framework-dependent operation representation saved in a node attribute and update the node -attributes with the operation specific attributes. There are three options to do this. - -1. The extractor extension approach (recommended way to extract attributes for an operation). Explained in details in the :doc:`Operation Extractor ` article. -2. The legacy approach with a built-in extractor. The ``mo/front//extractor.py`` file (for example, the one for Caffe) defines a dictionary with extractors for specific operation types. A key in the dictionary is a type of an operation to trigger the extracting function for and the value is the function. The function has one parameter – a node to extract attributes from. This is a legacy and non-extensible approach so it should be avoided. This mechanism will be removed in future versions of Model Optimizer. - -The extractors execution order is the following: - -* ``CustomLayersMapping.xml`` (for Caffe models only). -* Model Optimizer extension. -* Built-in Model Optimizer extractor. - -The result of operations attributes extracting step can be depicted like in the following example: - -.. image:: ../../../assets/images/MO_graph_after_extractors.svg - -The only difference in the graph from the previous step is that nodes contain dictionary with extracted attributes and -operation-specific attributes needed for Model Optimizer. However, from this step, Model Optimizer does not -need the original representation of the operation/model and just uses Model Optimizer representation (there are some -peculiar cases in which Model Optimizer still uses the ``pb`` attribute, covered in this -article partially). A detailed list of common node attributes and their values is provided in the -:doc:`Model Optimizer Operation ` article. - -Front Phase -########### - -For legacy reasons, you must specify shapes for all not fully-defined inputs of the model. In contrast, other -machine learning frameworks, like TensorFlow, let you create a model with undefined or partially defined input shapes. -As an example, undefined dimension is marked with an integer value ``-1`` in a TensorFlow model or has some string name -in an ONNX model. - -During the front phase, Model Optimizer knows shape of the model inputs and constants only and does not know shapes -(and even ranks) of the intermediate tensors. But information about shapes may not be needed to implement particular -transformation. For example, the transformation ``extensions/front/TopKNormalize.py`` removes an attribute ``k`` from a -``TopK`` node and adds an input constant with the value ``k``. The transformation is needed to convert a ``TopK`` operation. -It comes from frameworks, where a number of output elements is defined as an attribute of the operation to the -OpenVINO :doc:`TopK <../../openvino-ir-format/operation-sets/operation-specs/sort/top-k-3>` operation semantic, which requires this value to be a separate input. - -It is important to mention that sometimes it seems like transformation cannot be implemented during the front phase -because the actual values of inputs or shapes are needed. In fact, manipulations of shapes or values can be implemented -using operations that are added to the graph. 
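Returning to the ``TopKNormalize`` example above, a schematic sketch of turning an attribute into a ``Const`` input could look as follows. This is not the actual ``extensions/front/TopKNormalize.py`` code; the class name is made up, and the API usage mirrors the ``SqueezeNormalize`` example from the graph transformation extensions article below:

.. code-block:: py
   :force:

   from openvino.tools.mo.front.common.partial_infer.utils import int64_array
   from openvino.tools.mo.front.common.replacement import FrontReplacementPattern
   from openvino.tools.mo.graph.graph import Graph
   from openvino.tools.mo.ops.const import Const


   class TopKAttrToInputSketch(FrontReplacementPattern):
       """Illustrative sketch: move a hard-coded 'k' attribute into a Const input of TopK."""
       enabled = True

       def find_and_replace_pattern(self, graph: Graph):
           for topk in graph.get_op_nodes(op='TopK'):
               if topk.has_valid('k') and len(topk.in_nodes()) == 1:
                   k_const = Const(graph, {'name': topk.id + '/k',
                                           'value': int64_array(topk.k)}).create_node()
                   topk.in_port(1).connect(k_const.out_port(0))  # 'k' becomes the second input
                   del topk['k']                                 # the attribute is no longer needed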
Consider the -``extensions/front/onnx/flattenONNX_to_reshape.py`` transformation, which replaces an ONNX -`Flatten `__ operation with a sub-graph of operations performing -the following (when ``axis`` is not equal to 0 and 1): - -1. Calculate a shape of the ``Flatten`` input tensor, using the :doc:`ShapeOf <../../openvino-ir-format/operation-sets/operation-specs/shape/shape-of-3>` operation. -2. Get the first ``axis`` elements from the output of ``Shape`` operation and calculate their product, using the :doc:`ReduceProd <../../openvino-ir-format/operation-sets/operation-specs/reduction/reduce-prod-1>` operation. -3. Concatenate output of the ``ReduceProd`` and constant with the value of ``-1`` (for an explanation of this value refer to the :doc:`Reshape <../../openvino-ir-format/operation-sets/operation-specs/shape/reshape-1>` specification page). -4. Use the concatenated value as the second input to the ``Reshape`` operation. - -It is highly recommended to write shape-agnostic transformations to avoid model reshape-ability issues. For more information related to the reshaping of a model, refer to the :doc:`Using Shape Inference <../../../openvino-workflow/running-inference/changing-input-shape>` guide. - -More information on how to develop front phase transformations and dedicated API description is provided in the -:ref:`Front Phase Transformations `. - -.. _mo_partial_inference: - -Partial Inference -################# - -Model Optimizer performs a partial inference of a model during model conversion. This procedure includes output shapes -calculation of all operations in a model and constant folding (value calculation for constant sub-graphs). The constant -folding is needed for the shape inference because in some cases evaluation of constant sub-graph is needed to calculate -output shapes. For example, the output shape for the :doc:`Reshape <../../openvino-ir-format/operation-sets/operation-specs/shape/reshape-1>` operation may be -defined as a mathematical expression using the :doc:`ShapeOf <../../openvino-ir-format/operation-sets/operation-specs/shape/shape-of-3>` operation output. - -.. note:: - Model Optimizer does not fold sub-graphs starting from the :doc:`ShapeOf <../../openvino-ir-format/operation-sets/operation-specs/shape/shape-of-3>` operation by default because this leads to a model non-reshape-ability (the command-line parameter ``--static_shape`` can override this behavior). For more information related to reshaping of a model, refer to the :doc:`Using Shape Inference <../../../openvino-workflow/running-inference/changing-input-shape>` guide. - -Model Optimizer calculates output shapes for all operations in a model to write them to Intermediate Representation files. - -.. note:: - This is a legacy requirement. Starting with IR version 10, OpenVINO Runtime needs to know shapes of the :doc:`Const <../../openvino-ir-format/operation-sets/operation-specs/infrastructure/constant-1>` and the :doc:`Parameter <../../openvino-ir-format/operation-sets/operation-specs/infrastructure/parameter-1>` operations only. The OpenVINO Runtime calculates output shapes for all operations in a model, using shapes of :doc:`Parameter <../../openvino-ir-format/operation-sets/operation-specs/infrastructure/parameter-1>` and :doc:`Const <../../openvino-ir-format/operation-sets/operation-specs/infrastructure/constant-1>` operations defined with respective operation attributes. - -Model Optimizer inserts **data** nodes to the computation graph before starting the partial inference phase. 
The data node -corresponds to the specific tensor produced with the operation. Each data node contains two attributes: ``shape``, -containing the shape of the tensor, and ``value``, which may contain the actual value of the tensor. The value for a ``value`` -attribute is equal to ``None`` if this tensor value cannot be calculated. This happens in two cases: when a tensor value -depends on a values passed to the :doc:`Parameter <../../openvino-ir-format/operation-sets/operation-specs/infrastructure/parameter-1>` operation of a model or -Model Optimizer does not have value propagation implementation for the operation. - -Before running partial inference, the graph can be depicted like in the following example: - -.. image:: ../../../assets/images/MO_graph_before_partial_inference.svg - -The difference in a graph structure with a graph during the front phase is not only in the data nodes, but also in the -edge attributes. Note that an ``out`` attribute is specified for edges **from operation** nodes only, while an ``in`` -attribute is specified for edges **from data** nodes only. This corresponds to the fact that a tensor (data node) is -produced from a specific output port of an operation and is consumed with a specific input port of an operation. Also, -a unique data node is created for each output port of an operation. The node may be used as an input node for several -operation nodes. Similarly to the data node **data2_0**, which is consumed with the input **port 1** of the **Operation 3** and -input **port 0** of the **Operation 5**. - -Now, consider how Model Optimizer performs shape and value propagation. Model Optimizer performs graph nodes -topological sort. An error message is thrown if a graph contains a cycle. Then, shape inference functions are called for -each node in the graph, according to the topological order. Each node of the graph must have an attribute called ``infer`` -with a shape inference function, which is a function with one parameter – an instance of the ``Node`` class. The ``infer`` -attribute is usually set in the operation extractor or when a node is added in some transformation using the Model -Optimizer operation class inherited from the ``mo.pos.Op`` class. For more information on how to specify a shape inference function, -refer to the :doc:`Model Optimizer Operation ` and :doc:`Operation Extractor ` articles. - -A shape inference function should calculate an operation (node) output shape(s) based on input shape(s) and operation -(node) attribute(s) and update ``shape`` and optionally ``value`` attributes of the corresponding data node(s). A simplified -example of the shape infer function for the :doc:`Reshape <../../openvino-ir-format/operation-sets/operation-specs/shape/reshape-1>` operation (the full version is -available in the ``mo/ops/reshape.py`` file): - -.. code-block:: py - :force: - - @staticmethod - def infer(node: Node): - name = node.soft_get('name', node.id) - - input_shape = node.in_port(0).data.get_shape() # get the input tensor shape - new_shape = node.in_port(1).data.get_value() # get the value defining the output tensor shape. This tensor may - # have special values like 0 and -1 - - output_shape = ... 
# calculate output shape without special values like 0 and -1 - - if node.in_port(0).data.get_value() is not None: # if the input value is defined then calculate output value; - # shape will be updated automatically with the value shape - node.out_port(0).data.set_value(node.in_port(0).data.get_value().reshape(output_shape)) - else: # in the opposite case calculate the output shape only - node.out_port(0).data.set_shape(output_shape) - -Methods ``in_port()`` and ``output_port()`` of the ``Node`` class are used to get and set data node attributes. For more information on -how to use them, refer to the :doc:`Graph Traversal and Modification Using Ports and Connections ` article. - -.. note:: - A shape inference function should perform output shape calculation in the original model layout. For example, OpenVINO™ supports Convolution operations in NCHW layout only but TensorFlow supports NHWC layout as well. Model Optimizer shape inference function calculates output shapes for NHWC Convolutions in NHWC layout and only during the layout change phase the shape is converted to NCHW. - -.. note:: - There is a legacy approach to read data node attribute, like ``input_shape = op_node.in_node(0).shape`` and modify data nodes attributes, like ``op_node.out_node(0).shape = some_value``. This approach is still used in the Model Optimizer code but is not recommended. Instead, use the approach described in the :ref:`Ports `. - -Middle Phase -############ - -The middle phase starts after partial inference. At this phase, a graph contains data nodes and output shapes of all -operations in the graph have been calculated. Any transformation implemented at this stage must update the ``shape`` -attribute for all newly added operations. It is highly recommended to use API described in the -:doc:`Graph Traversal and Modification Using Ports and Connections ` because modification of a graph using this API causes automatic re-inference of affected nodes as well as necessary data nodes creation. - -More information on how to develop middle transformations and dedicated API description is provided in the -:ref:`Middle Phase Transformations `. - -NHWC to NCHW Layout Change -########################## - -There are several middle transformations responsible for changing model layout from NHWC to NCHW. These transformations are triggered by default for TensorFlow models as TensorFlow supports Convolution operations in the NHWC layout. - -This layout change is disabled automatically if the model does not have operations that OpenVINO™ needs to execute in the NCHW layout, for example, Convolutions in NHWC layout. - -For more details on how it works, refer to the source code of the transformations mentioned in the below summary of the process: - -1. Model Optimizer changes output shapes of most of operations producing 4D and 5D (four dimensional and five dimensional) tensors as if they were in NHWC layout to NCHW layout: ``nchw_shape = np.array(nhwc_shape)[0, 3, 1, 2]`` for 4D and ``nchw_shape = np.array(nhwc_shape)[0, 4, 1, 2, 3]`` for 5D. This permutation does not happen for some operations with specific conditions identified during a model conversion. -2. Model Optimizer inserts :doc:`Gather <../../openvino-ir-format/operation-sets/operation-specs/movement/gather-1>` operations to the sub-graph relates to shapes calculation in order to perform shape calculation in a correct layout. -3. 
Model Optimizer inserts :doc:`Transpose <../../openvino-ir-format/operation-sets/operation-specs/movement/transpose-1>` operations for some operations with specific conditions, identified during a model conversion, to produce correct inference results. - -The main transformations responsible for a layout change are: - -* ``extensions/middle/ApplyPermutations.py`` -* ``extensions/middle/InsertLayoutPropagationTransposes.py`` -* ``extensions/middle/MarkSubgraphsWithCorrectLayout.py`` -* ``extensions/middle/ApplyNHWCtoNCHWpermutation.py`` -* ``extensions/middle/LayoutChangeForConstantShapePaths.py`` - -Back Phase -########## - -The back phase starts after the layout change to NCHW. This phase contains mostly the following transformations: - -1. Transformations that should work with a graph in the NCHW layout and thus cannot be implemented in the middle phase. -2. Transformations that replace nodes corresponding to internal Model Optimizer operations with nodes corresponding to the :doc:`opset <../../openvino-ir-format/operation-sets/available-opsets>` operations. -3. Transformations that normalize operations inputs according to the specification. -4. Final optimization transformations. - -A graph structure during the back phase is the same as during the middle phase. There is no difference in writing middle -and back transformations. - -More information on how to develop back transformations and dedicated API description is provided in the -:ref:`Back Phase Transformations `. - -Intermediate Representation Emitting -#################################### - -The last phase of a model conversion is the Intermediate Representation emitting. Model Optimizer performs the following -steps: - -1. Iterates over all operation nodes in the graph and checks that all nodes have the ``type`` attribute set. This attribute defines the operation type and is used in the OpenVINO to instantiate proper operation from the :doc:`opset <../../openvino-ir-format/operation-sets/available-opsets>` specified in the ``version`` attribute of the node. If a node does not have attribute ``type`` or its value is equal to ``None``, Model Optimizer exits with an error. -2. Performs type inference of graph operations similar to the shape inference. Inferred data types are saved to a port attributes in the IR. -3. Performs topological sort of the graph and changes ``id`` attribute of all operation nodes to be sequential integer values starting from 0. -4. Saves all Constants values to the ``.bin`` file. Constants with the same value are shared among different operations. -5. Generates an ``.xml`` file defining a graph structure. The information about operation inputs and outputs are prepared uniformly for all operations regardless of their type. A list of attributes to be saved to the ``.xml`` file is defined with the ``backend_attrs()`` or ``supported_attrs()`` of the ``Op`` class used for a graph node instantiation. For more information on how the operation attributes are saved to XML, refer to the function ``prepare_emit_ir()`` in the ``mo/pipeline/common.py`` file and :doc:`Model Optimizer Operation ` article. 
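To illustrate step 5, a hypothetical operation class could declare which attributes get serialized as follows. The ``Op`` base class and the ``backend_attrs()`` hook are the ones referenced above; the operation, its attributes, and the exact set of mandatory properties are made up for this sketch:

.. code-block:: py
   :force:

   from openvino.tools.mo.graph.graph import Graph
   from openvino.tools.mo.ops.op import Op


   class DummyOpSketch(Op):
       op = 'DummyOp'

       def __init__(self, graph: Graph, attrs: dict):
           # mandatory properties differ per operation; 'type' selects the opset operation to emit
           super().__init__(graph, {'op': self.op, 'type': self.op, 'infer': None}, attrs)

       def backend_attrs(self):
           # only the attributes listed here are written to the <data> element of the .xml entry
           return ['alpha', 'beta']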
- -==================== -Additional Resources -==================== - -* :doc:`Deep Learning Network Intermediate Representation and Operation Sets in OpenVINO™ <../../openvino-ir-format/operation-sets>` -* :doc:`Converting a Model to Intermediate Representation (IR) ` -* :doc:`OpenVINO Model Representation <../../../openvino-workflow/running-inference/integrate-openvino-with-your-application/model-representation>` -* :doc:`OpenVINO™ Extensibility Mechanism <../../openvino-extensibility>` -* :doc:`Graph Traversal and Modification Using Ports and Connections ` -* :doc:`Model Optimizer Extensions ` -* :doc:`Extending Model Optimizer with Caffe Python Layers ` - diff --git a/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-model-optimizer-extensibility/[legacy]-extending-model-optimizer-with-caffe-python-layers.rst b/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-model-optimizer-extensibility/[legacy]-extending-model-optimizer-with-caffe-python-layers.rst deleted file mode 100644 index 4277f68139845b..00000000000000 --- a/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-model-optimizer-extensibility/[legacy]-extending-model-optimizer-with-caffe-python-layers.rst +++ /dev/null @@ -1,110 +0,0 @@ -[LEGACY] Extending Model Optimizer with Caffe Python Layers -============================================================ - -.. meta:: - :description: Learn how to extract operator attributes in Model Optimizer to - support a custom Caffe operation written only in Python. - -.. danger:: - - The code described here has been **deprecated!** Do not use it to avoid working with a legacy solution. It will be kept for some time to ensure backwards compatibility, but **you should not use** it in contemporary applications. - - This guide describes a deprecated TensorFlow conversion method. The guide on the new and recommended method, using a new frontend, can be found in the :doc:`Frontend Extensions <../../../openvino-extensibility/frontend-extensions>` article. - -This article provides instructions on how to support a custom Caffe operation written only in Python. For example, the -`Faster-R-CNN model `__ implemented in -Caffe contains a custom proposal layer written in Python. The layer is described in the -`Faster-R-CNN prototxt `__ in the following way: - -.. code-block:: sh - - layer { - name: 'proposal' - type: 'Python' - bottom: 'rpn_cls_prob_reshape' - bottom: 'rpn_bbox_pred' - bottom: 'im_info' - top: 'rois' - python_param { - module: 'rpn.proposal_layer' - layer: 'ProposalLayer' - param_str: "'feat_stride': 16" - } - } - - -This article describes only a procedure on how to extract operator attributes in Model Optimizer. The rest of the -operation enabling pipeline and information on how to support other Caffe operations (written in C++) is described in -the :doc:`Customize Model Optimizer <../legacy-model-optimizer-extensibility>` guide. - -======================================== -Writing Extractor for Caffe Python Layer -======================================== - -Custom Caffe Python layers have an attribute ``type`` (defining the type of the operation) equal to ``Python`` and two -mandatory attributes ``module`` and ``layer`` in the ``python_param`` dictionary. The ``module`` defines the Python module name -with the layer implementation, while ``layer`` value is an operation type defined by a user. 
In order to extract -attributes for such an operation it is necessary to implement extractor class inherited from the -``CaffePythonFrontExtractorOp`` class instead of ``FrontExtractorOp`` class, used for standard framework layers. The ``op`` -class attribute value should be set to the ``module + "." + layer`` value so the extractor is triggered for this kind of -operation. - -Below is a simplified example of the extractor for the custom operation Proposal from the mentioned Faster-R-CNN model. -The full code with additional checks can be found `here `__. - -The sample code uses operation ``ProposalOp`` which corresponds to ``Proposal`` operation described in the :doc:`Available Operations Sets <../../../openvino-ir-format/operation-sets/available-opsets>` -page. For a detailed explanation of the extractor, refer to the source code below. - -.. code-block:: py - :force: - - from openvino.tools.mo.ops.proposal import ProposalOp - from openvino.tools.mo.front.extractor import CaffePythonFrontExtractorOp - - - class ProposalPythonFrontExtractor(CaffePythonFrontExtractorOp): - op = 'rpn.proposal_layer.ProposalLayer' # module + "." + layer - enabled = True # extractor is enabled - - @staticmethod - def extract_proposal_params(node, defaults): - param = node.pb.python_param # get the protobuf message representation of the layer attributes - # parse attributes from the layer protobuf message to a Python dictionary - attrs = CaffePythonFrontExtractorOp.parse_param_str(param.param_str) - update_attrs = defaults - - # the operation expects ratio and scale values to be called "ratio" and "scale" while Caffe uses different names - if 'ratios' in attrs: - attrs['ratio'] = attrs['ratios'] - del attrs['ratios'] - if 'scales' in attrs: - attrs['scale'] = attrs['scales'] - del attrs['scales'] - - update_attrs.update(attrs) - ProposalOp.update_node_stat(node, update_attrs) # update the node attributes - - @classmethod - def extract(cls, node): - # define default values for the Proposal layer attributes - defaults = { - 'feat_stride': 16, - 'base_size': 16, - 'min_size': 16, - 'ratio': [0.5, 1, 2], - 'scale': [8, 16, 32], - 'pre_nms_topn': 6000, - 'post_nms_topn': 300, - 'nms_thresh': 0.7 - } - cls.extract_proposal_params(node, defaults) - return cls.enabled - -==================== -Additional Resources -==================== - -* :doc:`Model Optimizer Extensibility <../legacy-model-optimizer-extensibility>` -* :doc:`Graph Traversal and Modification Using Ports and Connections <[legacy]-graph-traversal-and-modification>` -* :doc:`Model Optimizer Extensions <[legacy]-model-optimizer-extensions>` - diff --git a/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-model-optimizer-extensibility/[legacy]-graph-traversal-and-modification.rst b/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-model-optimizer-extensibility/[legacy]-graph-traversal-and-modification.rst deleted file mode 100644 index 55b55a77335f2b..00000000000000 --- a/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-model-optimizer-extensibility/[legacy]-graph-traversal-and-modification.rst +++ /dev/null @@ -1,186 +0,0 @@ -[LEGACY] Graph Traversal and Modification -=========================================== - -.. meta:: - :description: Learn about deprecated APIs and the Port and Connection classes - in Model Optimizer used for graph traversal and transformation. - -.. 
danger:: - - The code described here has been **deprecated!** Do not use it to avoid working with a legacy solution. It will be kept for some time to ensure backwards compatibility, but **you should not use** it in contemporary applications. - - This guide describes a deprecated TensorFlow conversion method. The guide on the new and recommended method, using a new frontend, can be found in the :doc:`Frontend Extensions <../../../openvino-extensibility/frontend-extensions>` article. - -There are three APIs for a graph traversal and transformation used in the Model Optimizer: - -1. The API provided with the ``networkx`` Python library for the ``networkx.MultiDiGraph`` class, which is the base class for -the ``mo.graph.graph.Graph`` object. For example, the following methods belong to this API level: - -* ``graph.add_edges_from([list])``, -* ``graph.add_node(x, attrs)``, -* ``graph.out_edges(node_id)`` -* other methods where ``graph`` is a an instance of the ``networkx.MultiDiGraph`` class. - -**This is the lowest-level API. Avoid using it in the Model Optimizer transformations**. For more details, refer to the :ref:`Model Representation in Memory ` section. - -2. The API built around the ``mo.graph.graph.Node`` class. The ``Node`` class is the primary class to work with graph nodes -and their attributes. Examples of such methods and functions are: - -* ``node.in_node(y)``, -* ``node.out_node(x)``, -* ``node.get_outputs()``, -* ``node.insert_node_after(n1, y)``, -* ``create_edge(n1, n2)`` - -**There are some "Node" class methods not recommended for use and some functions defined in the mo.graph.graph have been deprecated**. For more details, refer to the ``mo/graph/graph.py`` file. - -3. The high-level API called Model Optimizer Graph API, which uses ``mo.graph.graph.Graph``, ``mo.graph.port.Port`` and -``mo.graph.connection.Connection`` classes. For example, the following methods belong to this API level: - -* ``node.in_port(x)``, -* ``node.out_port(y)``, -* ``port.get_connection()``, -* ``connection.get_source()``, -* ``connection.set_destination(dest_port)`` - -**This is the recommended API for the Model Optimizer transformations and operations implementation**. - -The main benefit of using the Model Optimizer Graph API is that it hides some internal implementation details (the fact that -the graph contains data nodes), provides API to perform safe and predictable graph manipulations, and adds operation -semantic to the graph. This is achieved with introduction of concepts of ports and connections. - -.. note:: - This article is dedicated to the Model Optimizer Graph API only and does not cover other two non-recommended APIs. - -.. _mo_intro_ports: - -===== -Ports -===== - -An operation semantic describes how many inputs and outputs the operation has. For example, -:doc:`Parameter <../../../openvino-ir-format/operation-sets/operation-specs/infrastructure/parameter-1>` and :doc:`Const <../../../openvino-ir-format/operation-sets/operation-specs/infrastructure/constant-1>` operations have no -inputs and have one output, :doc:`ReLU <../../../openvino-ir-format/operation-sets/operation-specs/activation/relu-1>` operation has one input and one output, -:doc:`Split <../../../openvino-ir-format/operation-sets/operation-specs/movement/split-1>` operation has 2 inputs and a variable number of outputs depending on the value of the -attribute ``num_splits``. 
- -Each operation node in the graph (an instance of the ``Node`` class) has 0 or more input and output ports (instances of -the ``mo.graph.port.Port`` class). The ``Port`` object has several attributes: - -* ``node`` - the instance of the ``Node`` object the port belongs to. -* ``idx`` - the port number. Input and output ports are numbered independently, starting from ``0``. Thus, - :doc:`ReLU <../../../openvino-ir-format/operation-sets/operation-specs/activation/relu-1>` operation has one input port (with index ``0``) and one output port (with index ``0``). -* ``type`` - the type of the port. Could be equal to either ``"in"`` or ``"out"``. -* ``data`` - the object that should be used to get attributes of the corresponding data node. This object has methods ``get_shape()`` / ``set_shape()`` and ``get_value()`` / ``set_value()`` to get/set shape/value of the corresponding data node. For example, ``in_port.data.get_shape()`` returns an input shape of a tensor connected to input port ``in_port`` (``in_port.type == 'in'``), ``out_port.data.get_value()`` returns a value of a tensor produced from output port ``out_port`` (``out_port.type == 'out'``). - -.. note:: - Functions ``get_shape()`` and ``get_value()`` return ``None`` until the partial inference phase. For more information about model conversion phases, refer to the :ref:`Model Conversion Pipeline `. For information about partial inference phase, see the :ref:`Partial Inference `. - -There are several methods of the ``Node`` class to get the instance of a corresponding port: - -* ``in_port(x)`` and ``out_port(x)`` to get the input/output port with number ``x``. -* ``in_ports()`` and ``out_ports()`` to get a dictionary, where key is a port number and the value is the corresponding input/output port. - -Attributes ``in_ports_count`` and ``out_ports_count`` of the ``Op`` class instance define default number of input and output -ports to be created for the ``Node``. However, additional input/output ports can be added using methods -``add_input_port()`` and ``add_output_port()``. Port also can be removed, using the ``delete_input_port()`` and -``delete_output_port()`` methods. - -The ``Port`` class is just an abstraction that works with edges incoming/outgoing to/from a specific ``Node`` instance. For -example, output port with ``idx = 1`` corresponds to the outgoing edge of a node with an attribute ``out = 1``, the input -port with ``idx = 2`` corresponds to the incoming edge of a node with an attribute ``in = 2``. - -Consider the example of a graph part with 4 operation nodes "Op1", "Op2", "Op3", and "Op4" and a number of data nodes -depicted with light green boxes. - -.. image:: ../../../../assets/images/MO_ports_example_1.svg - :scale: 80 % - :align: center - -Operation nodes have input ports (yellow squares) and output ports (light purple squares). Input port may not be -connected. For example, the input **port 2** of node **Op1** does not have incoming edge, while output port always has an -associated data node (after the partial inference when the data nodes are added to the graph), which may have no -consumers. - -Ports can be used to traverse a graph. The method ``get_source()`` of an input port returns an output port producing the -tensor consumed by the input port. It is important that the method works the same during front, middle and back phases of a -model conversion even though the graph structure changes (there are no data nodes in the graph during the front phase). 
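A minimal usage sketch of these ``Port`` facilities follows (the node id is hypothetical, ``graph`` is an already loaded ``Graph`` instance, and the ``data`` calls return meaningful results only after the partial inference phase, as noted above):

.. code-block:: py
   :force:

   from openvino.tools.mo.graph.graph import Node

   relu = Node(graph, 'relu_1')                        # wrap a hypothetical node of a loaded graph

   input_shape = relu.in_port(0).data.get_shape()      # shape of the tensor feeding input port 0
   output_value = relu.out_port(0).data.get_value()    # produced tensor value, or None if unknown
   producer_port = relu.in_port(0).get_source()        # output Port of the producing operation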
- -Let's assume that there are 4 instances of ``Node`` object ``op1, op2, op3``, and ``op4`` corresponding to nodes **Op1**, **Op2**, -**Op3**, and **Op4**, respectively. The result of ``op2.in_port(0).get_source()`` and ``op4.in_port(1).get_source()`` is the -same object ``op1.out_port(1)`` of type ``Port``. - -The method ``get_destination()`` of an output port returns the input port of the node consuming this tensor. If there are -multiple consumers of this tensor, the error is raised. The method ``get_destinations()`` of an output port returns a -list of input ports consuming the tensor. - -The method ``disconnect()`` removes a node incoming edge corresponding to the specific input port. The method removes -several edges if it is applied during the front phase for a node output port connected with multiple nodes. - -The method ``port.connect(another_port)`` connects output port ``port`` and input port ``another_port``. The method handles -situations when the graph contains data nodes (middle and back phases) and does not create an edge between two nodes -but also automatically creates data node or reuses existing data node. If the method is used during the front phase and -data nodes do not exist, the method creates edge and properly sets ``in`` and ``out`` edge attributes. - -For example, applying the following two methods to the graph above will result in the graph depicted below: - -.. code-block:: py - :force: - - op4.in_port(1).disconnect() - op3.out_port(0).connect(op4.in_port(1)) - -.. image:: ../../../../assets/images/MO_ports_example_2.svg - :scale: 80 % - :align: center - -.. note:: - For a full list of available methods, refer to the ``Node`` class implementation in the ``mo/graph/graph.py`` and ``Port`` class implementation in the ``mo/graph/port.py`` files. - -=========== -Connections -=========== - -Connection is a concept introduced to easily and reliably perform graph modifications. Connection corresponds to a -link between a source output port with one or more destination input ports or a link between a destination input port -and source output port producing data. So each port is connected with one or more ports with help of a connection. -Model Optimizer uses the ``mo.graph.connection.Connection`` class to represent a connection. - -There is only one ``get_connection()`` method of the ``Port`` class to get the instance of the corresponding ``Connection`` -object. If the port is not connected, the returned value is ``None``. - -For example, the ``op3.out_port(0).get_connection()`` method returns a ``Connection`` object encapsulating edges from node -**Op3** to data node **data_3_0** and two edges from data node **data_3_0** to two ports of the node **Op4**. - -The ``Connection`` class provides methods to get source and destination(s) ports the connection corresponds to: - -* ``connection.get_source()`` - returns an output ``Port`` object producing the tensor. -* ``connection.get_destinations()`` - returns a list of input ``Port`` consuming the data. -* ``connection.get_destination()`` - returns a single input ``Port`` consuming the data. If there are multiple consumers, the exception is raised. - -The ``Connection`` class provides methods to modify a graph by changing a source or destination(s) of a connection. For -example, the function call ``op3.out_port(0).get_connection().set_source(op1.out_port(0))`` changes source port of edges -consuming data from port ``op3.out_port(0)`` to ``op1.out_port(0)``. 
The transformed graph from the sample above is depicted -below: - -.. image:: ../../../../assets/images/MO_connection_example_1.svg - :scale: 80 % - :align: center - -Another example is the ``connection.set_destination(dest_port)`` method. It disconnects ``dest_port`` and all input ports to which -the connection is currently connected and connects the connection source port to ``dest_port``. - -Note that connection works seamlessly during front, middle, and back phases and hides the fact that the graph structure is -different. - -.. note:: - For a full list of available methods, refer to the ``Connection`` class implementation in the ``mo/graph/connection.py`` file. - -==================== -Additional Resources -==================== - -* :doc:`Model Optimizer Extensibility <../legacy-model-optimizer-extensibility>` -* :doc:`Model Optimizer Extensions <[legacy]-model-optimizer-extensions>` -* :doc:`Extending Model Optimizer with Caffe Python Layers <[legacy]-extending-model-optimizer-with-caffe-python-layers>` - diff --git a/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-model-optimizer-extensibility/[legacy]-model-optimizer-extensions.rst b/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-model-optimizer-extensibility/[legacy]-model-optimizer-extensions.rst deleted file mode 100644 index db252965cb84e9..00000000000000 --- a/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-model-optimizer-extensibility/[legacy]-model-optimizer-extensions.rst +++ /dev/null @@ -1,60 +0,0 @@ -[LEGACY] Model Optimizer Extensions -===================================== - -.. meta:: - :description: Learn about deprecated extensions, which enable injecting logic - to the model conversion pipeline without changing the Model - Optimizer core code. - -.. toctree:: - :maxdepth: 1 - :hidden: - - [legacy]-model-optimizer-extensions/[legacy]-model-optimizer-operation - [legacy]-model-optimizer-extensions/[legacy]-optimizer-extractor - [legacy]-model-optimizer-extensions/[legacy]-graph-transformation-extensions - -.. danger:: - - The code described here has been **deprecated!** Do not use it to avoid working with a legacy solution. It will be kept for some time to ensure backwards compatibility, but **you should not use** it in contemporary applications. - - This guide describes a deprecated TensorFlow conversion method. The guide on the new and recommended method, using a new frontend, can be found in the :doc:`Frontend Extensions <../../../openvino-extensibility/frontend-extensions>` article. - -Model Optimizer extensions enable you to inject some logic to the model conversion pipeline without changing the Model -Optimizer core code. There are three types of the Model Optimizer extensions: - -1. :doc:`Model Optimizer operation <[legacy]-model-optimizer-extensions/[legacy]-model-optimizer-operation>`. -2. A :doc:`framework operation extractor <[legacy]-model-optimizer-extensions/[legacy]-optimizer-extractor>`. -3. A :doc:`model transformation <[legacy]-model-optimizer-extensions/[legacy]-graph-transformation-extensions>`, which can be executed during front, middle or back phase of the model conversion. - -An extension is just a plain text file with a Python code. The file should contain a class (or classes) inherited from -one of extension base classes. Extension files should be saved to a directory with the following structure: - -.. 
code-block:: sh - - .// - ops/ - custom operations - front/ - framework independent front transformations - / - front transformations for models only and extractors for operations - / - front transformations for models only and extractors for operations - ... - middle/ - middle transformations - back/ - back transformations - -Model Optimizer uses the same layout internally to keep built-in extensions. The only exception is that the -``mo/ops/`` directory is also used as a source of the Model Optimizer operations due to historical reasons. - -.. note:: - The name of a root directory with extensions should not be equal to "extensions" because it will result in a name conflict with the built-in Model Optimizer extensions. - -.. note:: - Model Optimizer itself is built by using these extensions, so there is a huge number of examples of their usage in the Model Optimizer code. - -==================== -Additional Resources -==================== - -* :doc:`Model Optimizer Extensibility <../legacy-model-optimizer-extensibility>` -* :doc:`Graph Traversal and Modification Using Ports and Connections <[legacy]-graph-traversal-and-modification>` -* :doc:`Extending Model Optimizer with Caffe Python Layers <[legacy]-extending-model-optimizer-with-caffe-python-layers>` - diff --git a/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-model-optimizer-extensibility/[legacy]-model-optimizer-extensions/[legacy]-graph-transformation-extensions.rst b/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-model-optimizer-extensibility/[legacy]-model-optimizer-extensions/[legacy]-graph-transformation-extensions.rst deleted file mode 100644 index 95f722ee063443..00000000000000 --- a/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-model-optimizer-extensibility/[legacy]-model-optimizer-extensions/[legacy]-graph-transformation-extensions.rst +++ /dev/null @@ -1,605 +0,0 @@ -[LEGACY] Graph Transformation Extensions -========================================== - -.. meta:: - :description: Learn about various base classes for front, middle and back phase - transformations applied during model conversion with Model Optimizer. - -.. danger:: - - The code described here has been **deprecated!** Do not use it to avoid working with a legacy solution. It will be kept for some time to ensure backwards compatibility, but **you should not use** it in contemporary applications. - - This guide describes a deprecated TensorFlow conversion method. The guide on the new and recommended method, using a new frontend, can be found in the :doc:`Frontend Extensions <../../../../openvino-extensibility/frontend-extensions>` article. - -Model Optimizer provides various base classes to implement :ref:`Front Phase Transformations `, -:ref:`Middle Phase Transformations `, and :ref:`Back Phase Transformations `. -All classes have the following common class attributes and methods: - -1. The ``enabled`` attribute specifies whether the transformation is enabled or not. The value can be changed during runtime to enable or disable execution of the transformation during a model conversion. Default value is ``True``. -2. The ``id`` attribute specifies a unique transformation string identifier. This transformation identifier can be used to enable (disable) the transformation by setting environment variable ``MO_ENABLED_TRANSFORMS`` (``MO_DISABLED_TRANSFORMS``) with a comma separated list of ``ids``. 
The environment variables override the value of the ``enabled`` attribute of the transformation. Instead of using ``id`` attribute value you can add fully defined class name to ``MO_ENABLED_TRANSFORMS`` (``MO_DISABLED_TRANSFORMS``) variable, ``extensions.back.NonmalizeToNormalizeL2.NormalizeToNormalizeL2`` for example. It is an optional attribute. -3. The ``run_not_recursively`` attribute specifies whether the transformation should be executed in the sub-graphs, for example, body of the :doc:`TensorIterator <../../../../openvino-ir-format/operation-sets/operation-specs/infrastructure/tensor-iterator-1>` and the :doc:`Loop <../../../../openvino-ir-format/operation-sets/operation-specs/infrastructure/loop-5>`. Default value is ``True``. -4. The ``force_clean_up`` attribute specifies whether the graph clean up should be executed after the transformation. The graph cleanup removes nodes of the graph not reachable from the model inputs. Default value is ``False``. -5. The ``force_shape_inference`` attribute specifies whether the nodes marked with ``need_shape_inference`` attribute equal to ``True`` should be re-inferred after the transformation. Model Optimizer sets this attribute automatically for nodes, input(s) of which were changed during the transformation, or you can set this attribute manually in the transformation for the specific nodes. Default value is ``False``. -6. Attribute ``graph_condition`` specifies a list of functions with one parameter -- ``Graph`` object. The transformation is executed if and only if all functions return ``True``. If the attribute is not set, no check is performed. -7. Method ``run_before()`` returns a list of transformation classes which this transformation should be executed before. -8. Method ``run_after()`` returns a list of transformation classes which this transformation should be executed after. - -.. note:: - Some of the transformation types have specific class attributes and methods, which are explained in the corresponding sections of this document. - -Model Optimizer builds a graph of dependencies between registered transformations and executes them in the topological -order. To execute the transformation during a proper model conversion phase, Model Optimizer defines several -anchor transformations that do nothing. All transformations are ordered with respect to these anchor transformations. -The diagram below shows anchor transformations, some of built-in transformations and dependencies between them: - -.. image:: ../../../../../assets/images/MO_transformations_graph.svg - -User-defined transformations are executed after the corresponding ``Start`` and before the corresponding ``Finish`` anchor -transformations by default (if ``run_before()`` and ``run_after()`` methods have not been overridden). - -.. note:: - The ``PreMiddleStart`` and ``PostMiddleStart`` anchors were introduced due to historical reasons to refactor the Model Optimizer pipeline, which initially had a hardcoded order of transformations. - -.. _mo_front_phase_transformations: - -=========================== -Front Phase Transformations -=========================== - -There are several types of a front phase transformation: - -1. :ref:`Pattern-Defined Front Phase Transformations ` triggered for each sub-graph of the original graph isomorphic to the specified pattern. -2. :ref:`Specific Operation Front Phase Transformations ` triggered for the node with a specific ``op`` attribute value. -3. :ref:`Generic Front Phase Transformations `. -4. 
Manually enabled transformation, defined with a JSON configuration file (for TensorFlow, ONNX, and PaddlePaddle models), specified using the ``--transformations_config`` command-line parameter: - - 1. :ref:`Node Name Pattern Front Phase Transformations `. - 2. :ref:`Front Phase Transformations Using Start and End Points `. - 3. :ref:`Generic Front Phase Transformations Enabled with Transformations Configuration File `. - -.. _pattern_defined_front_phase_transformations: - -Pattern-Defined Front Phase Transformations -########################################### - -This type of transformation is implemented using ``mo.front.common.replacement.FrontReplacementSubgraph`` and -``mo.front.common.replacement.FrontReplacementPattern`` as base classes and works as follows: - -1. Define a sub-graph to be matched, using a list of nodes with attributes and edges connecting them (edges may also have attributes). -2. Model Optimizer searches for all sub-graphs of the original graph, isomorphic to the specified sub-graph (pattern). -3. Model Optimizer executes the defined function performing graph transformation for each instance of a matched sub-graph. You can override different functions in the base transformation class so the Model Optimizer works differently: - - 1. The ``replace_sub_graph(self, graph, match)`` override the method. In this case Model Optimizer only executes the overridden function, pass the ``graph`` object and a dictionary describing the matched sub-graph. You are required to write the transformation and connect the newly created nodes to the rest of the graph. - 2. The ``generate_sub_graph(self, graph, match)`` override the method. This case is not recommended for use because it is the most complicated approach. It can be effectively replaced with one of two previous approaches. - -The sub-graph pattern is defined in the ``pattern()`` function. This function should return a dictionary with two keys: -``nodes`` and ``edges``: - -* The value for the ``nodes`` key is a list of tuples with two elements. - - * The first element is an alias name for a node that will be used to define edges between nodes and in the transformation function. - * The second element is a dictionary with attributes. The key is a name of an attribute that should exist in the node. The value for the attribute can be some specific value to match or a function that gets a single parameter - the attribute value from the node. The function should return the result of attribute comparison with a dedicated value. - -* The value for the ``edges`` key is a list of tuples with two or three elements. - - * The first element is the alias name of the node producing a tensor. - * The second element is the alias name of the node consuming the tensor. - * The third element (optional) is the dictionary with expected edge attributes. This dictionary usually contains attributes like ``in`` and ``out``, defining input and output ports. - -Consider the example of a front transformation implemented in the ``extensions/front/Mish_fusion.py`` file performing -fusing of the sub-graph defining the :doc:`Mish <../../../../openvino-ir-format/operation-sets/operation-specs/activation/mish-4>` activation function into a single -operation: - -.. 
code-block:: py - :force: - - from openvino.tools.mo.front.Softplus_fusion import SoftplusFusion - from openvino.tools.mo.ops.activation_ops import Mish - from openvino.tools.mo.front.common.replacement import FrontReplacementSubgraph - from openvino.tools.mo.front.subgraph_matcher import SubgraphMatch - from openvino.tools.mo.graph.graph import Graph, rename_nodes - - - class MishFusion(FrontReplacementSubgraph): - """ - The transformation looks for the pattern with Softplus defining the Mish function: Mish(x) = x * tanh(SoftPlus(x)). - """ - enabled = True # Transformation is enabled. - - def run_after(self): # Run this transformation after "SoftplusFusion" transformation. - return [SoftplusFusion] - - def pattern(self): # Define pattern according to formulae x * tanh(SoftPlus(x)). - return dict( - nodes=[ - ('mul', dict(op='Mul')), - ('tanh', dict(op='Tanh')), - ('softplus', dict(op='SoftPlus')), - ], - edges=[ - ('softplus', 'tanh'), - ('tanh', 'mul'), - ]) - - def replace_sub_graph(self, graph: Graph, match: [dict, SubgraphMatch]): # Entry point for the transformation. - mul = match['mul'] # Get the Node corresponding to matched "mul" node. - mul_name = mul.soft_get('name', mul.id) - softplus = match['softplus'] # Get the Node corresponding to the matched "softplus" node. - - # Determine the input port of Mul which gets the 'input' node output. - input_port_idx = int(mul.in_port(0).get_connection().get_source().node.soft_get('op') == 'Tanh') - - # Check that the same tensor is provided as input to Mul and SoftPlus. - if mul.in_port(input_port_idx).get_source() != softplus.in_port(0).get_source(): - return - - mish = Mish(graph, {}).create_node() # Create Mish operation. - mish.in_port(0).connect(mul.in_port(input_port_idx).get_source()) # Connect input to the Mish. - mul.out_port(0).get_connection().set_source(mish.out_port(0)) # Reconnect outgoing edge from "mul" to Mish. - - # Rename the created Mish operation to have the name of the "mul" node, which produced the value equal to the - # Mish output. - rename_nodes([(mul, mul_name + '/TBR'), (mish, mul_name)]) - -.. _specific_operation_front_phase_transformations: - -Specific Operation Front Phase Transformations -############################################## - -This type of transformation is implemented using ``mo.front.common.replacement.FrontReplacementOp`` as base class and -works as follows: - -1. Define an operation type to trigger the transformation. -2. Model Optimizer searches for all nodes in the graph with the attribute ``op`` equal to the specified value. -3. Model Optimizer executes the defined function performing graph transformation for each instance of a matched node. You can override different functions in the base transformation class and Model Optimizer works differently: - - 1. The ``replace_sub_graph(self, graph, match)`` override method. In this case, Model Optimizer only executes the overridden function. Pass the ``graph`` object and a dictionary with a single key ``op`` with the matched node as value. You are required to write the transformation and connect the newly created nodes to the rest of the graph. - 2. The ``replace_op(self, graph, node)`` override method. In this case, Model Optimizer executes the overridden function. Pass the ``graph`` object and the matched node as ``node`` parameter. If the function returns an ``id`` of some node, then the ``Node`` with this ``id`` is connected to the consumers of the matched node. After applying the transformation, the matched node is removed from the graph. 
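The following sketch illustrates the first option (``replace_sub_graph``). The operation type and class name are hypothetical, and only methods described in these articles are used:

.. code-block:: py
   :force:

   from openvino.tools.mo.front.common.replacement import FrontReplacementOp
   from openvino.tools.mo.graph.graph import Graph


   class BypassIdentitySketch(FrontReplacementOp):
       """Illustrative sketch of option 1: override replace_sub_graph() for a specific 'op' value."""
       op = 'Identity'   # hypothetical trigger value of the 'op' attribute
       enabled = True

       def replace_sub_graph(self, graph: Graph, match: dict):
           node = match['op']                      # the matched node is passed under the 'op' key
           source = node.in_port(0).get_source()   # output port producing the node's input tensor
           node.out_port(0).get_connection().set_source(source)  # consumers now read from the producer
           node.in_port(0).disconnect()            # detach the bypassed node from its producer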
- -The ``FrontReplacementOp`` class provides a simpler mechanism to match a single operation with specific value of the ``op`` -(write the ``op`` attribute in the class instead of defining a ``pattern()`` function) attribute and perform the -transformation. - -Consider an example transformation from the ``extensions/front/Pack.py`` file, which replaces ``Pack`` operation from -the TensorFlow: - -.. code-block:: py - :force: - - from openvino.tools.mo.front.common.partial_infer.utils import int64_array - from openvino.tools.mo.front.common.replacement import FrontReplacementOp - from openvino.tools.mo.front.tf.graph_utils import create_op_with_const_inputs - from openvino.tools.mo.graph.graph import Node, Graph, rename_nodes - from openvino.tools.mo.ops.concat import Concat - from openvino.tools.mo.ops.unsqueeze import Unsqueeze - - - class Pack(FrontReplacementOp): - op = "Pack" # Trigger transformation for all nodes in the graph with the op = "Pack" attribute - enabled = True # Transformation is enabled. - - def replace_op(self, graph: Graph, node: Node): # Entry point for the transformation. - # Create a Concat operation with a number of inputs equal to a number of inputs to Pack. - out_node = Concat(graph, {'axis': node.axis, 'in_ports_count': len(node.in_ports())}).create_node() - pack_name = node.soft_get('name', node.id) - - for ind in node.in_ports(): - # Add dimension of size 1 to all inputs of the Pack operation and add them as Concat inputs. - unsqueeze_node = create_op_with_const_inputs(graph, Unsqueeze, {1: int64_array([node.axis])}, - {'name': node.soft_get('name', node.id) + '/Unsqueeze'}) - node.in_port(ind).get_connection().set_destination(unsqueeze_node.in_port(0)) - unsqueeze_node.out_port(0).connect(out_node.in_port(ind)) - - # Rename the created Concat operation to have the name of the "pack" node, which produced the value equal to the - # Concat output. - rename_nodes([(node, pack_name + '/TBR'), (out_node, pack_name)]) - return [out_node.id] # Reconnect the Pack operation consumers to get input from Concat instead. - - -.. _generic_front_phase_transformations: - -Generic Front Phase Transformations -################################### - -Model Optimizer provides a mechanism to implement generic front phase transformation. This type of transformation is -implemented using ``mo.front.common.replacement.FrontReplacementSubgraph`` or -``mo.front.common.replacement.FrontReplacementPattern`` as base classes. Make sure the transformation is enabled before trying to execute it. -Then, Model Optimizer executes the ``find_and_replace_pattern(self, graph)`` method and -provides a ``Graph`` object as an input. - -Consider the example of a generic front transformation from the ``extensions/front/SqueezeNormalize.py`` file performing -normalization of the :doc:`Squeeze <../../../../openvino-ir-format/operation-sets/operation-specs/shape/squeeze-1>` operation. Older version of the operation had a list of -axes to squeeze as an attribute, but now it is a separate input. For backward compatibility, the Model Optimizer -operation supports both semantics. Before IR generation, however, the operation should be normalized according to the -specification. - -.. 
code-block:: py - :force: - - import logging as log - - from openvino.tools.mo.front.common.partial_infer.utils import int64_array - from openvino.tools.mo.front.common.replacement import FrontReplacementPattern - from openvino.tools.mo.graph.graph import Graph - from openvino.tools.mo.ops.const import Const - from openvino.tools.mo.utils.error import Error - - - class SqueezeNormalize(FrontReplacementPattern): - """ - Normalizes inputs of the Squeeze layers. The layers should have two inputs: the input with data and input with the - dimensions to squeeze. If the second input is omitted then all dimensions of size 1 should be removed. - """ - enabled = True # The transformation is enabled. - - def find_and_replace_pattern(self, graph: Graph): # The function is called unconditionally. - for squeeze_node in graph.get_op_nodes(op='Squeeze'): # Iterate over all nodes with op='Squeeze'. - # If the operation has only 1 input node and no 'squeeze_dims' Node attribute, then convert the attribute to - # the operation input. - if len(squeeze_node.in_nodes()) == 1 and squeeze_node.has_valid('squeeze_dims'): - dims_node = Const(graph, {'name': squeeze_node.id + '/Dims', - 'value': int64_array(squeeze_node.squeeze_dims)}).create_node() - squeeze_node.in_port(1).connect(dims_node.out_port(0)) - del squeeze_node['squeeze_dims'] - # If two inputs already exist, that means the operation is already normalized. - elif len(squeeze_node.in_nodes()) == 2: - log.debug('The Squeeze node "{}" is already normalized'.format(squeeze_node.name)) - # In all other cases, raise an error. - else: - raise Error('The Squeeze layer "{}" should either have 2 inputs or one input and an "squeeze_dims" ' - 'attribute'.format(squeeze_node.soft_get('name'))) - -For the details on implementation and how these front phase transformations work, refer to the ``mo/front/common/replacement.py`` -file. - -.. _node_name_pattern_front_phase_transformations: - -Node Name Pattern Front Phase Transformations -############################################# - -TensorFlow uses a mechanism of scope to group related operation nodes. It is a good practice to put nodes performing -particular task into the same scope. This approach divides a graph into logical blocks that are easier to review in the -TensorBoard. The scope, in fact, just defines a common name prefix for the nodes belonging to it. - -For example, Inception topologies contain several types of so-called **Inception blocks**. Some of them are equal to each -other, but located in different places of the network. For example, Inception V4 from the -`TensorFlow-Slim image classification model library `__ has -``Mixed_5b``, ``Mixed_5c`` and ``Mixed_5d`` inception blocks with exactly the same nodes, with the same set of attributes. - -Consider a situation when these Inception blocks are implemented extremely efficiently using a single Inference -Engine operation called ``InceptionBlock`` and these blocks in the model need to be replaced with instances of this operation. -Model Optimizer provides mechanism to trigger the transformation for a sub-graph of operations defined by the node name -regular expressions (scope). In this particular case, some of the patterns are: ``.*InceptionV4/Mixed_5b``, -``.*InceptionV4/Mixed_5c`` and ``.*InceptionV4/Mixed_5d``. Each pattern starts with ``.*``, because the ``InceptionV4`` prefix -is added to all nodes names during a model freeze. 
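To see why the leading ``.*`` is required, consider the following standalone snippet (illustrative only; the node name is hypothetical), which checks a frozen-graph node name against one of these scope patterns:

.. code-block:: py

   import re

   # Hypothetical node name from a frozen graph: the scope prefix precedes the block name.
   node_name = 'InceptionV4/InceptionV4/Mixed_5b/Branch_0/Conv2d_0a_1x1/Conv2D'

   # The leading '.*' lets the pattern match regardless of the prefix added during freezing.
   scope_pattern = r'.*InceptionV4/Mixed_5b'
   print(bool(re.match(scope_pattern, node_name)))  # True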
- -This type of transformation is implemented using ``mo.front.tf.replacement.FrontReplacementFromConfigFileSubGraph`` as a -base class and works as follows: - -1. Prepare a JSON configuration file template defining node names patterns. -2. Run Model Optimizer with the ``--tensorflow_custom_operations_config_update`` command-line parameter, and Model Optimizer adds information about input and output nodes of the specified sub-graphs. -3. Model Optimizer executes the defined transformation **only** when you specify the path to the configuration file updated in step 2 using the ``--transformations_config`` command-line parameter. - -Consider the following possible configuration file template for the Inception Block transformation: - -.. code-block:: json - - [ - { - "custom_attributes": { - "attr1_key": "attr1_value", - "attr2_key": 123456 - }, - "id": "InceptionBlockTransformation", - "instances": [ - ".*InceptionV4/Mixed_5b", - ".*InceptionV4/Mixed_5c", - ".*InceptionV4/Mixed_5d" - ], - "match_kind": "scope" - } - ] - -The configuration file contains a list of dictionaries. Each dictionary defines one transformation. Each transformation -is defined with several parameters: - -* ``id`` - **(Mandatory)** — is a unique identifier of the transformation. It is used in the Python code that implements the transformation to link the class and the transformation description from the configuration file. -* ``match_kind`` - **(Mandatory)** — is a string that specifies the matching algorithm. For the node name pattern case, the value should be equal to ``scope``. Another possible values are described in the dedicated sections below. -* ``instances`` - **(Mandatory)** — specifies instances of the sub-graph to be matched. It contains a list of node names prefixes patterns for the match kind of the ``scope`` type. -* ``custom_attributes`` - **(Optional)** — is a dictionary with attributes that can be used in the transformation code. - -After running Model Optimizer with additional ``--tensorflow_custom_operations_config_update`` parameter pointing to -the template configuration file, the content of the file should be updated with two new sections ``inputs`` and ``outputs``. -The file content after the update is as follows: - -.. code-block:: json - - [ - { - "id": "InceptionBlockTransformation", - "custom_attributes": { - "attr1_key": "attr1_value", - "attr2_key": 123456 - }, - "instances": [ - ".*InceptionV4/Mixed_5b", - ".*InceptionV4/Mixed_5c", - ".*InceptionV4/Mixed_5d" - ], - "match_kind": "scope", - "inputs": [ - [ - { - "node": "Branch_2/Conv2d_0a_1x1/Conv2D$", - "port": 0 - }, - { - "node": "Branch_3/AvgPool_0a_3x3/AvgPool$", - "port": 0 - }, - { - "node": "Branch_1/Conv2d_0a_1x1/Conv2D$", - "port": 0 - }, - { - "node": "Branch_0/Conv2d_0a_1x1/Conv2D$", - "port": 0 - } - ] - ], - "outputs": [ - { - "node": "concat$", - "port": 0 - } - ] - } - ] - -The value for ``inputs`` key is a list of lists describing input tensors of the sub-graph. Each element of the top-level -list corresponds to one unique input tensor of the sub-graph. Each internal list describes a list of nodes consuming -this tensor and port numbers, where the tensor is consumed. Model Optimizer generates regular expressions for the input -nodes names to uniquely identify them in each instance of the sub-graph, defined by the ``instances``. Denote these nodes -as input nodes of the sub-graph. 
- -In the InceptionV4 topology, the ``InceptionV4/Mixed_5b`` block has four input tensors from outside of the sub-graph, -but all of them are produced by the ``InceptionV4/Mixed_5a/concat`` node. Therefore, the top-level list of the ``inputs`` -contains one list corresponding to this tensor. Four input nodes of the sub-graph consume the tensor produced by -``InceptionV4/Mixed_5a/concat`` node. In this case, all four input nodes consume input tensor into "port 0". - -The order of items in the internal list describing nodes does not matter, but the order of elements in the top-level -list is important. This order defines how Model Optimizer attaches input tensors to a new generated -node if the sub-graph is replaced with a single node. The ``i``-th input node of the sub-graph is obtained using -``match.single_input_node(i)`` call in the sub-graph transformation code. More information about API is given below. If it is -necessary to change the order of input tensors, the configuration file can be edited in the text editor. - -The value for the ``outputs`` key is a list describing nodes of the sub-graph producing tensor, that goes outside of the -sub-graph or does not have child nodes. Denote these nodes as output nodes of the sub-graph. The order of elements in -the list is important. The ``i``-th element of the list describes the ``i``-th output tensor of the sub-graph, which could be -obtained using ``match.output_node(i)`` call. The order of elements can be manually changed in the configuration file. -Model Optimizer uses this order to connect output edges if the sub-graph is replaced with a single node. - -For more examples of this type of transformation, refer to the :doc:`Converting TensorFlow Object Detection API Models <../../legacy-conversion-api/[legacy]-supported-model-formats/[legacy]-conversion-tutorials/convert-tensorflow-object-detection>` guide. - -.. _start_end_points_front_phase_transformations: - -Front Phase Transformations Using Start and End Points -###################################################### - -This type of transformation is implemented using ``mo.front.tf.replacement.FrontReplacementFromConfigFileSubGraph`` as a -base class and works as follows: - -1. Prepare a JSON configuration file that defines the sub-graph to match, using two lists of node names: "start" and "end" nodes. -2. Model Optimizer executes the defined transformation **only** when you specify the path to the configuration file using the ``--transformations_config`` command-line parameter . Model Optimizer performs the following steps to match the sub-graph: - - 1. Starts a graph traversal from every start node following the direction of the graph edges. The search stops in an end node or in the case of a node without consumers. All visited nodes are added to the matched sub-graph. - 2. Starts another graph traversal from each non-start node of the sub-graph, i.e. every node except nodes from the "start" list. In this step, the edges are traversed in the opposite edge direction. All newly visited nodes are added to the matched sub-graph. This step is needed to add nodes required for calculation values of internal nodes of the matched sub-graph. - 3. Checks that all "end" nodes were reached from "start" nodes. If not, it exits with an error. - 4. Checks that there are no :doc:`Parameter <../../../../openvino-ir-format/operation-sets/operation-specs/infrastructure/parameter-1>` operations among added nodes. If they exist, the sub-graph depends on the inputs of the model. 
Such configuration is considered incorrect so Model Optimizer exits with an error. - -This algorithm finds all nodes "between" start and end nodes and nodes needed for calculation of non-input nodes of the -matched sub-graph. - -The example of a JSON configuration file for a transformation with start and end points is -``extensions/front/tf/ssd_support_api_v1.15.json``: - -.. code-block:: json - - [ - { - "custom_attributes": { - "code_type": "caffe.PriorBoxParameter.CENTER_SIZE", - "pad_mode": "caffe.ResizeParameter.CONSTANT", - "resize_mode": "caffe.ResizeParameter.WARP", - "clip_before_nms": false, - "clip_after_nms": true - }, - "id": "ObjectDetectionAPISSDPostprocessorReplacement", - "include_inputs_to_sub_graph": true, - "include_outputs_to_sub_graph": true, - "instances": { - "end_points": [ - "detection_boxes", - "detection_scores", - "num_detections" - ], - "start_points": [ - "Postprocessor/Shape", - "Postprocessor/scale_logits", - "Postprocessor/Tile", - "Postprocessor/Reshape_1", - "Postprocessor/Cast_1" - ] - }, - "match_kind": "points" - } - ] - -The format of the file is similar to the one provided as an example in the -:ref:`Node Name Pattern Front Phase Transformations ` section. The difference is in -the value of the ``match_kind`` parameter, which should be equal to the ``points`` and the format of the ``instances`` parameter, -which should be a dictionary with two keys ``start_points`` and ``end_points``, defining start and end node names -respectively. - -.. note:: - The ``include_inputs_to_sub_graph`` and ``include_outputs_to_sub_graph`` parameters are redundant and should be always equal to ``true``. - -.. note:: - This sub-graph match algorithm has a limitation that each start node must have only one input. Therefore, it is not possible to specify, for example, the :doc:`Convolution <../../../../openvino-ir-format/operation-sets/operation-specs/convolution/convolution-1>` node as input because it has two inputs: data tensor and tensor with weights. - -For other examples of transformations with points, refer to the -:doc:`Converting TensorFlow Object Detection API Models <../../legacy-conversion-api/[legacy]-supported-model-formats/[legacy]-conversion-tutorials/convert-tensorflow-object-detection>` guide. - -.. _generic_transformations_config_front_phase_transformations: - -Generic Front Phase Transformations Enabled with Transformations Configuration File -################################################################################### - -This type of transformation works similarly to the :ref:`Generic Front Phase Transformations ` -but require a JSON configuration file to enable it similarly to -:ref:`Node Name Pattern Front Phase Transformations ` and -:ref:`Front Phase Transformations Using Start and End Points `. - -The base class for this type of transformation is -``mo.front.common.replacement.FrontReplacementFromConfigFileGeneral``. Model Optimizer executes the -``transform_graph(self, graph, replacement_descriptions)`` method and provides the ``Graph`` object and dictionary with values -parsed from the `custom_attributes` attribute of the provided JSON configuration file. - -The example of the configuration file for this type of transformation is ``extensions/front/tf/yolo_v1_tiny.json``: - -.. code-block:: json - - [ - { - "id": "TFYOLO", - "match_kind": "general", - "custom_attributes": { - "classes": 20, - "coords": 4, - "num": 2, - "do_softmax": 0 - } - } - ] - -and the corresponding transformation file is ``./extensions/front/YOLO.py``: - -.. 
code-block:: py - :force: - - from openvino.tools.mo.front.no_op_eraser import NoOpEraser - from openvino.tools.mo.front.standalone_const_eraser import StandaloneConstEraser - from openvino.tools.mo.ops.regionyolo import RegionYoloOp - from openvino.tools.mo.front.tf.replacement import FrontReplacementFromConfigFileGeneral - from openvino.tools.mo.graph.graph import Node, Graph - from openvino.tools.mo.ops.result import Result - from openvino.tools.mo.utils.error import Error - - - class YoloRegionAddon(FrontReplacementFromConfigFileGeneral): - """ - Replaces all Result nodes in graph with YoloRegion->Result nodes chain. - YoloRegion node attributes are taken from configuration file - """ - replacement_id = 'TFYOLO' # The identifier matching the "id" attribute in the JSON file. - - def run_after(self): - return [NoOpEraser, StandaloneConstEraser] - - def transform_graph(self, graph: Graph, replacement_descriptions): - op_outputs = [n for n, d in graph.nodes(data=True) if 'op' in d and d['op'] == 'Result'] - for op_output in op_outputs: - last_node = Node(graph, op_output).in_node(0) - op_params = dict(name=last_node.id + '/YoloRegion', axis=1, end_axis=-1) - op_params.update(replacement_descriptions) - region_layer = RegionYoloOp(graph, op_params) - region_layer_node = region_layer.create_node([last_node]) - # In here, 'axis' from 'dim_attrs' can be removed to avoid permutation from axis = 1 to axis = 2. - region_layer_node.dim_attrs.remove('axis') - Result(graph).create_node([region_layer_node]) - graph.remove_node(op_output) - -The configuration file has only 3 parameters: ``id`` identifier of the transformation , ``match_kind`` (which should be equal -to ``general``) and the ``custom_attributes`` dictionary with custom attributes accessible in the transformation. - -.. _mo_middle_phase_transformations: - -============================ -Middle Phase Transformations -============================ - -There are two types of middle phase transformations: - -1. :ref:`Pattern-Defined Middle Phase Transformations ` triggered for each sub-graph of the original graph, isomorphic to the specified pattern. -2. :ref:`Generic Middle Phase Transformations `. - -.. _pattern_defined_middle_phase_transformations: - -Pattern-Defined Middle Phase Transformations -############################################ - -This type of transformation is implemented using ``mo.middle.replacement.MiddleReplacementPattern`` as a base class and -works similarly to the :ref:`Pattern-Defined Middle Phase Transformations ` -The are two differences: - -1. The transformation entry function name is ``replace_pattern(self, graph, match)``. -2. The pattern defining the graph should contain data nodes because the structure of the graph is different between front and middle phases. For more information about the graph structure changes, refer to the :ref:`Partial Inference `. - -For the example of a pattern-defined middle transformation, refer to the ``extensions/middle/L2NormToNorm.py`` file. - -.. _generic_middle_phase_transformations: - -Generic Middle Phase Transformations -#################################### - -Model Optimizer provides a mechanism to implement generic middle phase transformations. This type of transformation is -implemented using ``mo.middle.replacement.MiddleReplacementPattern`` as a base class and works similarly to the -:ref:`Generic Front Phase Transformations `. The only difference is that the -transformation entry function name is ``find_and_replace_pattern(self, graph: Graph)``. 
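As a minimal hedged sketch (not an existing Model Optimizer transformation), a generic middle phase transformation could look as follows. Note that data nodes are present in the middle-phase graph, so the inferred shape is read from the data node that follows the operation node:

.. code-block:: py
   :force:

   import logging as log

   from openvino.tools.mo.graph.graph import Graph
   from openvino.tools.mo.middle.replacement import MiddleReplacementPattern


   class LogSqueezeOutputShapes(MiddleReplacementPattern):
       """
       Illustrative transformation that logs the output shape of every Squeeze operation.
       The class is a documentation sketch and is not part of Model Optimizer.
       """
       enabled = True  # The transformation is enabled.

       def find_and_replace_pattern(self, graph: Graph):  # Entry point of a generic middle phase transformation.
           for squeeze_node in graph.get_op_nodes(op='Squeeze'):  # Iterate over all nodes with op='Squeeze'.
               # In the middle phase, operation nodes are separated by data nodes,
               # so the inferred shape is stored on the data node following the operation.
               data_node = squeeze_node.out_node(0)
               log.debug('Squeeze "{}" produces shape {}'.format(
                   squeeze_node.soft_get('name', squeeze_node.id), data_node.shape))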
- -For the example of this transformation, refer to the ``extensions/middle/CheckForCycle.py`` file. - -.. _mo_back_phase_transformations: - -========================== -Back Phase Transformations -========================== - -There are two types of back phase transformations: - -1. :ref:`Pattern-Defined Back Phase Transformations ` triggered for each sub-graph of the original graph, isomorphic to the specified pattern. -2. :ref:`Generic Back Phase Transformations `. - -.. note:: - The graph layout during the back phase is always NCHW. However, during the front and middle phases it could be NHWC if the original model was using it. For more details, refer to :ref:`Model Conversion Pipeline `. - -.. _pattern_defined_back_phase_transformations: - -Pattern-Defined Back Phase Transformations -########################################## - -This type of transformation is implemented using ``mo.back.replacement.MiddleReplacementPattern`` as a base class and -works the same way as :ref:`Pattern-Defined Middle Phase Transformations `. - -For the example of a pattern-defined back transformation, refer to the ``extensions/back/ShufflenetReLUReorder.py`` file. - -.. _generic_back_phase_transformations: - -Generic Back Phase Transformations -################################## - -Model Optimizer provides mechanism to implement generic back phase transformations. This type of transformation is -implemented using ``mo.back.replacement.BackReplacementPattern`` as a base class and works the same way as -:ref:`Generic Middle Phase Transformations `. - -For the example of this transformation, refer to the ``extensions/back/GatherNormalizer.py`` file. - -==================== -Additional Resources -==================== - -* :doc:`Model Optimizer Extensibility <../../legacy-model-optimizer-extensibility>` -* :doc:`Graph Traversal and Modification Using Ports and Connections <../../legacy-model-optimizer-extensibility/[legacy]-graph-traversal-and-modification>` -* :doc:`Model Optimizer Extensions <../[legacy]-model-optimizer-extensions>` -* :doc:`Extending Model Optimizer with Caffe Python Layers <../[legacy]-extending-model-optimizer-with-caffe-python-layers>` - diff --git a/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-model-optimizer-extensibility/[legacy]-model-optimizer-extensions/[legacy]-model-optimizer-operation.rst b/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-model-optimizer-extensibility/[legacy]-model-optimizer-extensions/[legacy]-model-optimizer-operation.rst deleted file mode 100644 index 61c43f72dfade9..00000000000000 --- a/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-model-optimizer-extensibility/[legacy]-model-optimizer-extensions/[legacy]-model-optimizer-operation.rst +++ /dev/null @@ -1,110 +0,0 @@ -[LEGACY] Model Optimizer Operation -=================================== - -.. meta:: - :description: Learn about the Op class, that contains operation attributes, - which are set to a node of the graph created during model - conversion with Model Optimizer. - -.. danger:: - - The code described here has been **deprecated!** Do not use it to avoid working with a legacy solution. It will be kept for some time to ensure backwards compatibility, but **you should not use** it in contemporary applications. - - This guide describes a deprecated TensorFlow conversion method. 
The guide on the new and recommended method, using a new frontend, can be found in the :doc:`Frontend Extensions <../../../../openvino-extensibility/frontend-extensions>` article. - -Model Optimizer defines a ``mo.ops.Op`` class (``Op`` will be used later in the document to be short), which is a base class -for an operation used in the Model Optimizer. The instance of the ``Op`` class serves several purposes: - -1. Stores the operation attributes. -2. Stores the operation shape/value and type inference functions. -3. Defines operation attributes to be saved to the corresponding IR section. -4. Contains convenient methods to create a graph node from an ``Op`` object instance and connect it with the existing graph. -5. Used in the extractors to store parsed attributes and operation specific attributes in the dedicated graph node. - -It is important to mention that there is no connection between the instance of the ``Op`` class and the ``Node`` object -created from it. The ``Op`` class is just a container for attributes describing the operation. Model Optimizer uses the ``Op`` -class during a model conversion to create a node of the graph with attributes copied from the ``Op`` class instance. Graph -manipulations are performed with graph ``Nodes`` and their attributes and does not involve ``Ops``. - -There are a number of common attributes used in the operations. Below is the list of these attributes with description. - -* ``id`` — **(Mandatory)** — unique identifier of a node in a graph. Generated automatically, equal to the number of nodes in the graph plus 1 if not specified. -* ``name`` — **(Mandatory)** — name of the operation. Generated automatically, equal to the ``id`` if not specified. -* ``type`` — **(Mandatory)** — type of the operation according to the :doc:`opset specification <../../../../openvino-ir-format/operation-sets/available-opsets>`. For the internal Model Optimizer operations, this attribute should be set to ``None``. The model conversion fails if an operation with ``type`` equal to ``None`` comes to the IR emitting phase. -* ``version`` — **(Mandatory)** — the operation set (opset) name the operation belongs to. If not specified, Model Optimizer sets it equal to ``experimental``. For more information about operation sets, refer to :doc:`OpenVINO Model Representation <../../../../../openvino-workflow/running-inference/integrate-openvino-with-your-application/model-representation>` section. -* ``op`` — Model Optimizer type of the operation. In many cases, the value of ``type`` is equal to the value of ``op``. However, when Model Optimizer cannot instantiate the opset operation during model loading, it creates an instance of an internal operation. Thus, the attribute ``op`` is used as a type of this internal operation. Later in the pipeline, the node created from an internal operation will be replaced during front, middle or back phase with node(s) created from the opset. -* ``infer`` — the attribute defines a function calculating output tensor(s) shape and optional value(s). The attribute may be set to ``None`` for the internal Model Optimizer operations used during the front phase only. For more information about the shape inference function, refer to the :ref:`Partial Inference `. -* ``type_infer`` — the attribute defines a function calculating output tensor(s) data type. If the attribute is not defined, the default function is used. The function checks if the ``data_type`` node attribute is set and then propagates this type to the output tensor from the **port 0**. 
Otherwise, it propagates the data type of the tensor coming into the input **port 0** to the output tensor from the **port 0**. -* ``in_ports_count`` — default number of input ports to be created for the operation. Additional ports can be created or redundant ports can be removed using dedicated ``Node`` class API methods. -* ``out_ports_count`` — default number of output ports to be created for the operation. Additional ports can be created or redundant ports can be removed using dedicated ``Node`` class API methods. - -Below is an example of the Model Optimizer class for the :doc:`SoftMax <../../../../openvino-ir-format/operation-sets/operation-specs/activation/softmax-1>` operation from -the ``mo/ops/softmax.py`` file with the comments in code. - -.. code-block:: py - - class Softmax(Op): - # The class attribute defines a name of the operation so the operation class can be obtained using the - # "Op.get_op_class_by_name()" static method - op = 'SoftMax' - - # The operation works as an extractor by default. This is a legacy behavior, currently not recommended for use, - # thus "enabled" class attribute is set to False. The recommended approach is to use dedicated extractor extension. - enabled = False - - def __init__(self, graph: Graph, attrs: dict): - super().__init__(graph, { # The constructor of the base class Op is called with additional default attributes. - 'type': __class__.op, # The operation is from the opset so the type is set to 'SoftMax'. - 'op': __class__.op, # Internal Model Optimizer operation has the same type. - 'version': 'opset1', # The operation corresponds to opset1. - 'infer': Softmax.infer, # Shape inference function is defined below. - 'axis': 1, # Default value for the "axis" attribute of the operation SoftMax. - 'in_ports_count': 1, # The operation has one input. - 'out_ports_count': 1, # The operation produces one output. - }, attrs) - - # The method returns operation specific attributes list. This method is important when implementing - # extractor inherited from CaffePythonFrontExtractorOp class to extract attribute for Caffe Python operation. - # However, it is currently used interchangeably with the "backend_attrs()" method. If the "backend_attrs()" is not used, - # then the "supported_attrs()" is used instead. In this particular case, the operation has just one attribute "axis". - def supported_attrs(self): - return ['axis'] - - @staticmethod - def infer(node: Node): - "some code calculating output shape and values" - -There is a dedicated method called ``backend_attrs()`` defining a list of attributes to be saved to the IR. Consider an -example from the ``mo/ops/pooling.py`` file: - -.. code-block:: py - - def backend_attrs(self): - return [ - ('strides', lambda node: ','.join(map(str, node['stride'][node.spatial_dims]))), - ('kernel', lambda node: ','.join(map(str, node['window'][node.spatial_dims]))), - - ('pads_begin', lambda node: ','.join(map(str, get_backend_pad(node.pad, node.spatial_dims, 0)))), - ('pads_end', lambda node: ','.join(map(str, get_backend_pad(node.pad, node.spatial_dims, 1)))), - - ('pool-method', 'pool_method'), - ('exclude-pad', 'exclude_pad'), - - 'rounding_type', - 'auto_pad', - ] - -The ``backend_attrs()`` function returns a list of records. A record can be of one of the following formats: -1. A string defining the attribute to be saved to the IR. If the value of the attribute is ``None``, the attribute is not saved. Examples of this case are ``rounding_type`` and ``auto_pad``. -2. 
A tuple, where the first element is a string defining the name of the attribute as it will appear in the IR and the second element is a function to produce the value for this attribute. The function gets an instance of the ``Node`` as the only parameter and returns a string with the value to be saved to the IR. Examples of this case are ``strides``, ``kernel``, ``pads_begin`` and ``pads_end``. -3. A tuple, where the first element is a string defining the name of the attribute as it will appear in the IR and the second element is the name of the ``Node`` attribute to get the value from. Examples of this case are ``pool-method`` and ``exclude-pad``. - -==================== -Additional Resources -==================== - -* :doc:`Model Optimizer Extensibility <../../legacy-model-optimizer-extensibility>` -* :doc:`Graph Traversal and Modification Using Ports and Connections <../../legacy-model-optimizer-extensibility/[legacy]-graph-traversal-and-modification>` -* :doc:`Model Optimizer Extensions <../[legacy]-model-optimizer-extensions>` -* :doc:`Extending Model Optimizer with Caffe Python Layers <../[legacy]-extending-model-optimizer-with-caffe-python-layers>` - diff --git a/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-model-optimizer-extensibility/[legacy]-model-optimizer-extensions/[legacy]-optimizer-extractor.rst b/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-model-optimizer-extensibility/[legacy]-model-optimizer-extensions/[legacy]-optimizer-extractor.rst deleted file mode 100644 index 5de7ae93f86a7c..00000000000000 --- a/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-model-optimizer-extensibility/[legacy]-model-optimizer-extensions/[legacy]-optimizer-extractor.rst +++ /dev/null @@ -1,113 +0,0 @@ -[LEGACY] Operation Extractor -============================= - -.. meta:: - :description: Learn about a deprecated generic extension in Model Optimizer, - which provides the operation extractor usable for all model - frameworks. - - -.. danger:: - - The code described here has been **deprecated!** Do not use it to avoid working with a legacy solution. It will be kept for some time to ensure backwards compatibility, but **you should not use** it in contemporary applications. - - This guide describes a deprecated TensorFlow conversion method. The guide on the new and recommended method, using a new frontend, can be found in the :doc:`Frontend Extensions <../../../../openvino-extensibility/frontend-extensions>` article. - -Model Optimizer runs specific extractor for each operation in the model during the model loading. - -There are several types of Model Optimizer extractor extensions: - -1. The generic one, which is described in this article. -2. The special extractor for Caffe models with Python layers. This kind of extractor is described in the :doc:`Extending Model Optimizer with Caffe Python Layers <../[legacy]-extending-model-optimizer-with-caffe-python-layers>` guide. - -Generic extension provides a generic mechanism for the operation extractor applicable for all frameworks. Model Optimizer provides the ``mo.front.extractor.FrontExtractorOp`` class as a base class to implement the extractor. It has the ``extract`` class method, which gets the only parameter ``Node``, which corresponds to the graph node to extract data from. The operation description in the original framework format is stored in the attribute ``pb`` of the node. 
The extractor goal is to parse this attribute and save necessary attributes to the corresponding node of the graph. Consider the extractor for the ``Const`` TensorFlow operation (refer to the ``extensions/front/tf/const_ext.py`` file): - -.. code-block:: py - :force: - - from openvino.tools.mo.front.extractor import FrontExtractorOp - from openvino.tools.mo.front.tf.extractors.utils import tf_dtype_extractor, tf_tensor_shape, tf_tensor_content - from openvino.tools.mo.ops.const import Const - - - class ConstExtractor(FrontExtractorOp): - # The "op" class attribute defines a type of the operation in the framework (in this case it is a TensorFlow), - # for which the extractor should be triggered. - op = 'Const' - enabled = True # The flag that indicates that this extractor is enabled. - - @classmethod - def extract(cls, node): # The entry point of the extractor. - # The `node.pb` attribute stores the TensorFlow representation of the operation, which is a Protobuf message of the - # specific format. In particular, the message contains the attribute called "value" containing the description of - # the constant. The string "pb.attr["value"].tensor" is just a Python binding for Protobuf message parsing. - pb_tensor = node.pb.attr["value"].tensor - # Get the shape of the tensor from the protobuf message, using the helper function "tf_tensor_shape". - shape = tf_tensor_shape(pb_tensor.tensor_shape) - # Create a dictionary with necessary attributes. - attrs = { - 'shape': shape, - # Get the tensor value, using "tf_tensor_content" helper function. - 'value': tf_tensor_content(pb_tensor.dtype, shape, pb_tensor), - # Get the tensor data type, using "tf_dtype_extractor" helper function. - 'data_type': tf_dtype_extractor(pb_tensor.dtype), - } - # Update the node attributes, using default attributes from the "Const" operation and attributes saved to the - # "attrs" dictionary. - Const.update_node_stat(node, attrs) - return cls.enabled - -Consider another example with an extractor of the ``Constant`` ONNX operation (refer to the ``extensions/front/onnx/const_ext.py`` file): - -.. code-block:: py - :force: - - from onnx import numpy_helper - from onnx.numpy_helper import to_array - - from openvino.tools.mo.front.extractor import FrontExtractorOp - from openvino.tools.mo.front.onnx.extractors.utils import onnx_attr - from openvino.tools.mo.ops.const import Const - - - class ConstantExtractor(FrontExtractorOp): - op = 'Constant' - enabled = True - - @classmethod - def extract(cls, node): - # Use "onnx_attr" helper method, which parses the Protobuf representation of the operation saved in the "node". - # Gets the value of the attribute with name "value" as "TensorProto" type (specified with a keyword "t"). - pb_value = onnx_attr(node, 'value', 't') - # Use "numpy_helper.to_array()" ONNX helper method to convert "TensorProto" object to a numpy array. - value = numpy_helper.to_array(pb_value) - - attrs = { - 'data_type': value.dtype, - 'value': value, - } - # Update the node attributes, using default attributes from the "Const" operation and attributes saved to the - # "attrs" dictionary. - Const.update_node_stat(node, attrs) - return cls.enabled - -The extractors for operations from different frameworks work similarly. The only difference is in the helper methods used to parse operation attributes encoded with a framework-specific representation. - -A common practice is to use ``update_node_stat()`` method of the dedicated ``Op`` class to update the node attributes. This method does the following: - -1. 
Sets values for common attributes like ``op``, ``type``, ``infer``, ``in_ports_count``, ``out_ports_count``, ``version`` to values specific to the dedicated operation (``Const`` operation in this case). -2. Uses ``supported_attrs()`` and ``backend_attrs()`` methods, defined in the ``Op`` class to update specific node attribute ``IE``. The IR emitter uses the value stored in the ``IE`` attribute to pre-process attribute values and save them to IR. -3. Optionally sets additional attributes provided to the ``update_node_stat()`` function as a second parameter. Usually these attributes are parsed from the particular instance of the operation. - -.. note:: - Model Optimizer uses numpy arrays to store values and numpy arrays of ``np.int64`` type to store shapes in the graph. - -==================== -Additional Resources -==================== - -* :doc:`Model Optimizer Extensibility <../../legacy-model-optimizer-extensibility>` -* :doc:`Graph Traversal and Modification Using Ports and Connections <../../legacy-model-optimizer-extensibility/[legacy]-graph-traversal-and-modification>` -* :doc:`Model Optimizer Extensions <../[legacy]-model-optimizer-extensions>` -* :doc:`Extending Model Optimizer with Caffe Python Layers <../[legacy]-extending-model-optimizer-with-caffe-python-layers>` - diff --git a/docs/articles_en/documentation/openvino-ecosystem.rst b/docs/articles_en/documentation/openvino-ecosystem.rst index 6735192e95f674..1975fe0a48a181 100644 --- a/docs/articles_en/documentation/openvino-ecosystem.rst +++ b/docs/articles_en/documentation/openvino-ecosystem.rst @@ -12,6 +12,7 @@ OpenVINO™ Ecosystem Overview :hidden: openvino-ecosystem/openvino-training-extensions + openvino-ecosystem/openvino-test-drive openvino-ecosystem/datumaro openvino-ecosystem/openvino-security-add-on @@ -102,20 +103,19 @@ development process, empowering teams to produce custom AI models at scale. |hr| -| **Tokenizers** -| :bdg-link-dark:`Github ` - :bdg-link-success:`User Guide ` +| **Intel® Test Drive** +| :bdg-link-dark:`Github ` -OpenVINO Tokenizers add text processing operations to OpenVINO. +OpenVINO™ Test Drive is cross-platform graphic user interface application that enables running +generative AI and vision models directly on your computer or edge device using OpenVINO™ Runtime. |hr| -| **OpenVINO's Open Model Zoo** -| :bdg-link-dark:`Github ` - :bdg-link-success:`User Guide ` +| **Tokenizers** +| :bdg-link-dark:`Github ` + :bdg-link-success:`User Guide ` -Open Model Zoo includes optimized deep learning models and a set of demos to -expedite development of high-performance deep learning inference applications. +OpenVINO Tokenizers add text processing operations to OpenVINO. OpenVINO-based AI projects ########################## diff --git a/docs/articles_en/documentation/openvino-ecosystem/openvino-security-add-on.rst b/docs/articles_en/documentation/openvino-ecosystem/openvino-security-add-on.rst index 3959ebefb09a4a..043f05a90e2342 100644 --- a/docs/articles_en/documentation/openvino-ecosystem/openvino-security-add-on.rst +++ b/docs/articles_en/documentation/openvino-ecosystem/openvino-security-add-on.rst @@ -735,7 +735,7 @@ How to Use the OpenVINO™ Security Add-on This section requires interactions between the Model Developer/Independent Software vendor and the User. All roles must complete all applicable :ref:`set up steps ` and :ref:`installation steps ` before beginning this section. -This document uses the `face-detection-retail-0004 `__ model as an example. 
+This document uses a face-detection model as an example. The following figure describes the interactions between the Model Developer, Independent Software Vendor, and User. @@ -793,15 +793,8 @@ Step 2: Create a key store and add a certificate to it Step 3: Create the model ------------------------ -This example uses ``curl`` to download the ``face-detection-retail-004`` model from the OpenVINO Model Zoo. If you are behind a firewall, check and set your proxy settings. - -Download a model from the Model Zoo: - -.. code-block:: sh - - curl --create-dirs https://download.01.org/opencv/2021/openvinotoolkit/2021.1/open_../legacy-features/model-zoo/models_bin/1/face-detection-retail-0004/FP32/face-detection-retail-0004.xml https://download.01.org/opencv/2021/openvinotoolkit/2021.1/open_../legacy-features/model-zoo/models_bin/1/face-detection-retail-0004/FP32/face-detection-retail-0004.bin -o model/face-detection-retail-0004.xml -o model/face-detection-retail-0004.bin - -The model is downloaded to the ``OVSA_DEV_ARTEFACTS/model`` directory +Download a `model `__ in OpenVINO IR format to +the ``OVSA_DEV_ARTEFACTS/model`` directory. Step 4: Define access control for the model and create a master license for it ------------------------------------------------------------------------------- @@ -811,9 +804,9 @@ Define and enable the model access control and master license: .. code-block:: sh uuid=$(uuidgen) - /opt/ovsa/bin/ovsatool controlAccess -i model/face-detection-retail-0004.xml model/face-detection-retail-0004.bin -n "face detection" -d "face detection retail" -v 0004 -p face_detection_model.dat -m face_detection_model.masterlic -k isv_keystore -g $uuid + /opt/ovsa/bin/ovsatool controlAccess -i model/.xml model/.bin -n "name of the model" -d "detailed name of the model" -p .dat -m .masterlic -k isv_keystore -g $uuid -The Intermediate Representation files for the ``face-detection-retail-0004`` model are encrypted as ``face_detection_model.dat`` and a master license is generated as ``face_detection_model.masterlic`` +The Intermediate Representation files for the model are encrypted as ``.dat`` and a master license is generated as ``.masterlic`` Step 5: Create a Runtime Reference TCB -------------------------------------- @@ -824,7 +817,7 @@ Generate the reference TCB for the runtime .. code-block:: sh - /opt/ovsa/bin/ovsaruntime gen-tcb-signature -n "Face Detect @ Runtime VM" -v "1.0" -f face_detect_runtime_vm.tcb -k isv_keystore + /opt/ovsa/bin/ovsaruntime gen-tcb-signature -n "Face Detect @ Runtime VM" -v "1.0" -f model_inference_runtime_vm.tcb -k isv_keystore Step 6: Publish the access controlled Model and Runtime Reference TCB @@ -856,7 +849,7 @@ Step 7: Receive a User Request .. code-block:: sh cd $OVSA_DEV_ARTEFACTS - /opt/ovsa/bin/ovsatool sale -m face_detection_model.masterlic -k isv_keystore -l 30daylicense.config -t face_detect_runtime_vm.tcb -p custkeystore.csr.crt -c face_detection_model.lic + /opt/ovsa/bin/ovsatool sale -m .masterlic -k isv_keystore -l 30daylicense.config -t detect_runtime_vm.tcb -p custkeystore.csr.crt -c .lic 4. Update the license server database with the license. @@ -864,13 +857,13 @@ Step 7: Receive a User Request .. code-block:: sh cd /opt/ovsa/DB - python3 ovsa_store_customer_lic_cert_db.py ovsa.db $OVSA_DEV_ARTEFACTS/face_detection_model.lic $OVSA_DEV_ARTEFACTS/custkeystore.csr.crt + python3 ovsa_store_customer_lic_cert_db.py ovsa.db $OVSA_DEV_ARTEFACTS/.lic $OVSA_DEV_ARTEFACTS/custkeystore.csr.crt 5. 
Provide these files to the User: - * ``face_detection_model.dat`` - * ``face_detection_model.lic`` + * ``.dat`` + * ``.lic`` Model User Instructions +++++++++++++++++++++++ @@ -930,14 +923,14 @@ Step 4: Receive and load the access controlled model into the OpenVINO™ Model 1. Receive the model as files named: - * face_detection_model.dat - * face_detection_model.lic + * .dat + * .lic .. code-block:: sh cd $OVSA_RUNTIME_ARTEFACTS - scp username@://OVSA/artefacts/face_detection_model.dat . - scp username@://OVSA/artefacts/face_detection_model.lic . + scp username@://OVSA/artefacts/.dat . + scp username@://OVSA/artefacts/.lic . 2. Prepare the environment: @@ -954,8 +947,8 @@ Step 4: Receive and load the access controlled model into the OpenVINO™ Model .. code-block:: sh cd $OVSA_RUNTIME_ARTEFACTS/../ovms - cp $OVSA_RUNTIME_ARTEFACTS/face_detection_model.dat model/fd/1/. - cp $OVSA_RUNTIME_ARTEFACTS/face_detection_model.lic model/fd/1/. + cp $OVSA_RUNTIME_ARTEFACTS/.dat model/fd/1/. + cp $OVSA_RUNTIME_ARTEFACTS/.lic model/fd/1/. cp $OVSA_RUNTIME_ARTEFACTS/custkeystore model/fd/1/. 4. Rename and edit ``sample.json`` to include the names of the access controlled model artefacts you received from the Model Developer. The file looks like this: @@ -976,7 +969,7 @@ Step 4: Receive and load the access controlled model into the OpenVINO™ Model "config":{ "name":"controlled-access-model", "base_path":"/sampleloader/model/fd", - "custom_loader_options": {"loader_name": "ovsa", "keystore": "custkeystore", "controlled_access_file": "face_detection_model"} + "custom_loader_options": {"loader_name": "ovsa", "keystore": "custkeystore", "controlled_access_file": ""} } } ] @@ -1010,7 +1003,7 @@ Step 6: Prepare to run Inference pip3 install futures==3.1.1 pip3 install tensorflow-serving-api==1.14.0 -3. Copy the ``face_detection.py`` from the example_client in ``/opt/ovsa/example_client`` +3. Copy the ``detection.py`` from the example_client in ``/opt/ovsa/example_client`` .. code-block:: sh @@ -1027,11 +1020,11 @@ Step 6: Prepare to run Inference Step 7: Run Inference --------------------- -Run the ``face_detection.py`` script: +Run the ``detection.py`` script: .. code-block:: sh - python3 face_detection.py --grpc_port 3335 --batch_size 1 --width 300 --height 300 --input_images_dir images --output_dir results --tls --server_cert /var/OVSA/Modelserver/server.pem --client_cert /var/OVSA/Modelserver/client.pem --client_key /var/OVSA/Modelserver/client.key --model_name controlled-access-model + python3 detection.py --grpc_port 3335 --batch_size 1 --width 300 --height 300 --input_images_dir images --output_dir results --tls --server_cert /var/OVSA/Modelserver/server.pem --client_cert /var/OVSA/Modelserver/client.pem --client_key /var/OVSA/Modelserver/client.key --model_name controlled-access-model Summary diff --git a/docs/articles_en/documentation/openvino-ecosystem/openvino-test-drive.rst b/docs/articles_en/documentation/openvino-ecosystem/openvino-test-drive.rst new file mode 100644 index 00000000000000..527a01bf38a6cf --- /dev/null +++ b/docs/articles_en/documentation/openvino-ecosystem/openvino-test-drive.rst @@ -0,0 +1,109 @@ +=============================================================================================== +OpenVINO™ Test Drive +=============================================================================================== + + +.. meta:: + :description: See how to test your models with OpenVINO, using a simple graphic interface of + Test Drive. 
+ + + +OpenVINO™ Test Drive is a cross-platform graphic user interface application for running and +testing AI models, both generative and vision based. +It can run directly on your computer or on edge devices using +`OpenVINO™ Runtime `__. + +OpenVINO™ Test Drive is developed under the `openvino_testdrive repository `__. + +Use OpenVINO™ Test Drive to: + +* **Chat with LLMs** and evaluate model performance on your computer or edge device; +* **Experiment with different text prompts** to generate images, using Stable + Diffusion and Stable DiffusionXL models (coming soon); +* **Transcribe speech from video**, using Whisper models, including generation + of timestamps (coming soon); +* **Run inference of models** trained by Intel® Geti™ and **visualize the results**. + + + +Installation (Windows) +############################################################################################### + +1. Download the latest archive from the + `release repository `__. + To verify the integrity of the downloaded package, use the SHA-256 file attached. + +2. Extract the zip file and run the *MSIX* installation package. Click the `Install` button to + proceed. + +3. Launch OpenVINO™ Test Drive, clicking the application name in the Windows app list. + + +Quick start +############################################################################################### + +When starting the application, you can import an LLM model from Hugging Face Hub +or upload an Intel® Geti™ model from a local drive. + +Inference of models from Hugging Face ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ + +1. Find a model on `Hugging Face `__ and import it. + +2. Chat with LLMs via the `Playground` tab. + +3. Use the `Performance metrics` tab to get model performance metrics on your + computer or an edge device. + + + +Inference of models trained with Intel® Geti™ ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ + +1. Download the deployment code for a model in the OpenVINO IR format trained + by Intel® Geti™ (refer to the `Intel® Geti™ documentation `__ + for more details). + +2. Import the deployment code into OpenVINO™ Test Drive, using the *Import model* and then + *Local disk* buttons. + +3. Use the *Live inference* tab to run and visualize results of inference of individual images. + +4. For batch inference, use the *Batch inference* tab and provide paths to the folder + with input images, as well as one for batch inference results. You can do so by filling out + the *Source folder* and *Destination folder* fields. Click *Start* to start batch inference. + + +Build the Application +############################################################################################### + +1. Make sure you `Install flutter SDK `__ + and all its platform-specific dependencies. +2. Build the bindings and place them in the **./bindings** folder. + + OpenVINO™ Test Drive uses bindings to `OpenVINO™ GenAI `__ + and `OpenVINO™ Model API `__, + which are located in the **./openvino_bindings** folder. Refer to the + `GitHub page `__ + for more details. + +3. Start the application, using the following command: + + .. code-block:: console + + flutter run + +Additional Resources +############################################################################################### + +- `OpenVINO™ `__ - a software toolkit + for optimizing and deploying deep learning models. 
+- `GenAI Repository `__ and + `OpenVINO Tokenizers `__ + - resources and tools for developing and optimizing Generative AI applications. +- `Intel® Geti™ `__ - software for building computer + vision models. +- `OpenVINO™ Model API `__ + - a set of wrapper classes for particular tasks and model architectures. + It simplifies routine procedures, preprocessing and postprocessing of data. diff --git a/docs/articles_en/documentation/openvino-extensibility.rst b/docs/articles_en/documentation/openvino-extensibility.rst index 216135009b1806..6b2d0878bb687c 100644 --- a/docs/articles_en/documentation/openvino-extensibility.rst +++ b/docs/articles_en/documentation/openvino-extensibility.rst @@ -32,7 +32,7 @@ Custom operations, which are not included in the list, are not recognized by Ope 1. A new or rarely used regular framework operation is not supported in OpenVINO yet. 2. A new user operation that was created for some specific model topology by the author of the model using framework extension capabilities. -Importing models with such operations requires additional steps. This guide illustrates the workflow for running inference on models featuring custom operations. This allows plugging in your own implementation for them. OpenVINO Extensibility API enables adding support for those custom operations and using one implementation for Model Optimizer and OpenVINO Runtime. +Importing models with such operations requires additional steps. This guide illustrates the workflow for running inference on models featuring custom operations. This allows plugging in your own implementation for them. OpenVINO Extensibility API enables adding support for those custom operations and using one implementation for model conversion API and OpenVINO Runtime. Defining a new custom operation basically consists of two parts: @@ -45,7 +45,7 @@ The first part is required for inference. The second part is required for succes Definition of Operation Semantics ################################# -If the custom operation can be mathematically represented as a combination of exiting OpenVINO operations and such decomposition gives desired performance, then low-level operation implementation is not required. Refer to the latest OpenVINO operation set, when deciding feasibility of such decomposition. You can use any valid combination of exiting operations. The next section of this document describes the way to map a custom operation. +If the custom operation can be mathematically represented as a combination of existing OpenVINO operations and such decomposition gives desired performance, then low-level operation implementation is not required. Refer to the latest OpenVINO operation set, when deciding feasibility of such decomposition. You can use any valid combination of existing operations. The next section of this document describes the way to map a custom operation. If such decomposition is not possible or appears too bulky with a large number of constituent operations that do not perform well, then a new class for the custom operation should be implemented, as described in the :doc:`Custom Operation Guide `. @@ -56,21 +56,9 @@ Mapping from Framework Operation Mapping of custom operation is implemented differently, depending on model format used for import. If a model is represented in the ONNX (including models exported from PyTorch in ONNX), TensorFlow Lite, PaddlePaddle or -TensorFlow formats, then one of the classes from :doc:`Frontend Extension API ` -should be used. 
It consists of several classes available in C++ which can be used with the ``--extensions`` option in Model Optimizer -or when a model is imported directly to OpenVINO runtime using the ``read_model`` method. -Python API is also available for runtime model import. +TensorFlow formats, then you should use one of the classes from :doc:`Frontend Extension API `, +the application of which is described below. -If you are implementing extensions for new ONNX, PaddlePaddle, TensorFlow Lite or TensorFlow frontends and plan to use the ``--extensions`` -option in Model Optimizer for model conversion, then the extensions should be: - -1. Implemented in C++ only. - -2. Compiled as a separate shared library (see details on how to do this further in this guide). - -Model Optimizer does not support new frontend extensions written in Python API. - -Remaining part of this guide describes application of Frontend Extension API for new frontends. Registering Extensions ###################### @@ -104,7 +92,7 @@ Extensions can be loaded from a code with the ``ov::Core::add_extension`` metho :fragment: [add_extension] -The ``Identity`` is a custom operation class defined in :doc:`Custom Operation Guide `. This is sufficient to enable reading OpenVINO IR which uses the ``Identity`` extension operation emitted by Model Optimizer. In order to load original model directly to the runtime, add a mapping extension: +The ``Identity`` is a custom operation class defined in :doc:`Custom Operation Guide `. This is sufficient to enable reading OpenVINO IR which uses the ``Identity`` extension operation. In order to load original model directly to the runtime, add a mapping extension: .. tab-set:: @@ -133,11 +121,11 @@ Create a Library with Extensions An extension library should be created in the following cases: -* Conversion of a model with custom operations in Model Optimizer. +* Conversion of a model with custom operations in model conversion API * Loading a model with custom operations in a Python application. This applies to both framework model and OpenVINO IR. * Loading models with custom operations in tools that support loading extensions from a library, for example the ``benchmark_app``. -To create an extension library, for example, to load the extensions into Model Optimizer, perform the following: +To create an extension library, perform the following: 1. Create an entry point for extension library. OpenVINO provides the ``OPENVINO_CREATE_EXTENSIONS()`` macro, which allows to define an entry point to a library with OpenVINO Extensions. This macro should have a vector of all OpenVINO Extensions as an argument. diff --git a/docs/articles_en/documentation/openvino-extensibility/custom-gpu-operations.rst b/docs/articles_en/documentation/openvino-extensibility/custom-gpu-operations.rst index 92914223ac123c..9717c6c8ac4e33 100644 --- a/docs/articles_en/documentation/openvino-extensibility/custom-gpu-operations.rst +++ b/docs/articles_en/documentation/openvino-extensibility/custom-gpu-operations.rst @@ -40,8 +40,8 @@ There are two options for using the custom operation configuration file: :fragment: [part0] -All OpenVINO samples, except the trivial ``hello_classification``, and most Open -Model Zoo demos feature a dedicated command-line option ``-c`` to load custom kernels. +All OpenVINO samples, except the trivial ``hello_classification``, +feature a dedicated command-line option ``-c`` to load custom kernels. For example, to load custom operations for the classification sample, run the command below: .. 
code-block:: cpp @@ -49,11 +49,6 @@ For example, to load custom operations for the classification sample, run the co $ ./classification_sample -m /bvlc_alexnet_fp16.xml -i ./validation_set/daily/227x227/apron.bmp -d GPU -c /custom_layer_example.xml -.. important:: - - Due to the deprecation of Open Model Zoo, models in the OpenVINO IR format are now - published on `Hugging Face `__. - .. _config-file-format: @@ -393,3 +388,7 @@ execution ends. For more information, refer to the `printf Function `__. +Additional Resources +#################### + +* Models in the OpenVINO IR format published on `Hugging Face `__. diff --git a/docs/articles_en/documentation/openvino-extensibility/frontend-extensions.rst b/docs/articles_en/documentation/openvino-extensibility/frontend-extensions.rst index 115f149657821c..08b7c6f6b98018 100644 --- a/docs/articles_en/documentation/openvino-extensibility/frontend-extensions.rst +++ b/docs/articles_en/documentation/openvino-extensibility/frontend-extensions.rst @@ -14,9 +14,6 @@ Refer to :doc:`Introduction to OpenVINO Extension <../openvino-extensibility>` t understand the entire flow. This API is applicable to new frontends only, which exist for ONNX, TensorFlow Lite, PaddlePaddle, and TensorFlow. -If a different model format is used, follow legacy -:doc:`Model Optimizer Extensions <../legacy-features/transition-legacy-conversion-api/legacy-model-optimizer-extensibility>` -guide. .. note:: diff --git a/docs/articles_en/documentation/openvino-extensibility/openvino-plugin-library/advanced-guides/low-precision-transformations.rst b/docs/articles_en/documentation/openvino-extensibility/openvino-plugin-library/advanced-guides/low-precision-transformations.rst index 9451fabd6219d8..4b64b2177af361 100644 --- a/docs/articles_en/documentation/openvino-extensibility/openvino-plugin-library/advanced-guides/low-precision-transformations.rst +++ b/docs/articles_en/documentation/openvino-extensibility/openvino-plugin-library/advanced-guides/low-precision-transformations.rst @@ -312,17 +312,11 @@ This step is optional. It modifies the transformation function to a device-speci Result model overview ##################### -Let's explore quantized `TensorFlow implementation of ResNet-50 `__ model. Use `Model Downloader `__ tool to download the ``fp16`` model from `OpenVINO™ Toolkit - Open Model Zoo repository `__: - -.. code-block:: sh - - omz_downloader --name resnet-50-tf --precisions FP16-INT8 - -After that you should quantize model by the `Model Quantizer `__ tool. - -.. code-block:: sh - - omz_quantizer --model_dir public/resnet-50-tf --dataset_dir --precisions=FP16-INT8 +Let's explore the resnet-50-tf model, quantized to ``fp16``, which is a TensorFlow +implementation of `ResNet-50 `__ +- an image classification model pre-trained on the ImageNet dataset. Originally +redistributed in the "Saved model" format, converted to a frozen graph using the +"tf.graph_util" module. Inference @@ -346,7 +340,7 @@ Result model depends on different factors: Information about layer precision is stored in the performance counters that are -available from the OpenVINO Runtime API. For example, the part of performance counters table for quantized `TensorFlow implementation of ResNet-50 `__ model inference on CPU Plugin looks as follows: +available from the OpenVINO Runtime API. For example, the part of performance counters table for the resnet-50-tf model inferred on CPU Plugin looks as follows: .. 
list-table:: :header-rows: 1 diff --git a/docs/articles_en/documentation/openvino-security.rst b/docs/articles_en/documentation/openvino-security.rst index 99cf13161bf243..03a99ba49e89e2 100644 --- a/docs/articles_en/documentation/openvino-security.rst +++ b/docs/articles_en/documentation/openvino-security.rst @@ -55,7 +55,8 @@ Hardware-based protection such as Intel Software Guard Extensions (Intel SGX) ca decryption operation secrets and bind them to a device. For more information, see the `Intel Software Guard Extensions `__. -Use the ``ov::Core::read_model`` to set model representations and weights respectively. +Use the `ov::Core::read_model <../api/c_cpp_api/group__ov__dev__exec__model.html#classov_1_1_core_1ae0576a95f841c3a6f5e46e4802716981>`__ +to set model representations and weights respectively. Currently there is no way to read external weights from memory for ONNX models. The ``ov::Core::read_model(const std::string& model, const Tensor& weights)`` method @@ -65,10 +66,24 @@ should be called with ``weights`` passed as an empty ``ov::Tensor``. :language: cpp :fragment: part1 + +Encrypted models that have already been compiled, in the form of blob files, +can be loaded using the +`ov::Core::import_model <../api/c_cpp_api/group__ov__runtime__cpp__api.html#_CPPv4N2ov4Core12import_modelERNSt7istreamERKNSt6stringERK6AnyMap>`__ +method, as shown in the code sample below: + +.. code-block:: cpp + + ov::Core core; + // Import a model from a blob. + std::ifstream compiled_blob(blob, std::ios_base::in | std::ios_base::binary); + auto compiled_model = core.import_model(compiled_blob, "CPU"); + + Additional Resources #################### - Intel® Distribution of OpenVINO™ toolkit `home page `__. -- :doc:`Convert a Model `. +- :doc:`Convert a Model <../openvino-workflow/model-preparation/convert-model-to-ir>`. - :doc:`OpenVINO™ Runtime User Guide <../openvino-workflow/running-inference>`. - For more information on Sample Applications, see the :doc:`OpenVINO Samples Overview <../learn-openvino/openvino-samples>` diff --git a/docs/articles_en/get-started.rst b/docs/articles_en/get-started.rst index 28a39d3c0a4e84..9b46cc416605f3 100644 --- a/docs/articles_en/get-started.rst +++ b/docs/articles_en/get-started.rst @@ -62,14 +62,14 @@ OpenVINO provides a wide array of examples and documentation showing how to work OpenVINO Basics +++++++++++++++ -Learn the basics of working with models and inference in OpenVINO. Begin with “Hello World” Interactive Tutorials that show how to prepare models, run inference, and retrieve results using the OpenVINO API. Then, explore other examples from the Open Model Zoo and OpenVINO Code Samples that can be adapted for your own application. +Learn the basics of working with models and inference in OpenVINO. Begin with “Hello World” Interactive Tutorials that show how to prepare models, run inference, and retrieve results using the OpenVINO API. Then, explore OpenVINO Code Samples that can be adapted for your own application. .. _interactive-learn-openvino/interactive-tutorials-python: Interactive Tutorials - Jupyter Notebooks ----------------------------------------- -Start with :doc:`interactive Python ` that show the basics of model inferencing, the OpenVINO API, how to convert models to OpenVINO format, and more. +Start with :doc:`interactive Python ` that show the basics of model inference, the OpenVINO API, how to convert models to OpenVINO format, and more. 
* `Hello Image Classification `__ - Load an image classification model in OpenVINO and use it to apply a label to an image * `OpenVINO Runtime API Tutorial `__ - Learn the basic Python API for working with models in OpenVINO diff --git a/docs/articles_en/get-started/configurations/configurations-intel-gpu.rst b/docs/articles_en/get-started/configurations/configurations-intel-gpu.rst index e10a67fddadb53..4d1eb37007f59d 100644 --- a/docs/articles_en/get-started/configurations/configurations-intel-gpu.rst +++ b/docs/articles_en/get-started/configurations/configurations-intel-gpu.rst @@ -44,6 +44,7 @@ Below are the instructions on how to install the OpenCL packages on supported Li .. code-block:: sh apt-get install -y ocl-icd-libopencl1 intel-opencl-icd intel-level-zero-gpu level-zero + sudo usermod -a -G render $LOGNAME .. tab-item:: Ubuntu 20.04 LTS :sync: ubuntu-20 @@ -57,6 +58,7 @@ Below are the instructions on how to install the OpenCL packages on supported Li echo 'deb [arch=amd64 signed-by=/usr/share/keyrings/intel-graphics.gpg] https://repositories.intel.com/graphics/ubuntu focal-legacy main' | tee /etc/apt/sources.list.d/intel.gpu.focal.list && \ apt-get update apt-get update && apt-get install -y --no-install-recommends intel-opencl-icd intel-level-zero-gpu level-zero + sudo usermod -a -G render $LOGNAME Alternatively, download older `deb` version from `here `__. Note that older driver version might not include some of the bug fixes and might be not supported on some latest platforms. Check the supported hardware for the versions you are installing. @@ -135,6 +137,6 @@ Additional Resources * `Docker CI framework for Intel® Distribution of OpenVINO™ toolkit `__ * `Get Started with DockerHub CI for Intel® Distribution of OpenVINO™ toolkit `__ * `Dockerfiles with Intel® Distribution of OpenVINO™ toolkit `__ - +* `GPU Driver issue troubleshoot ` diff --git a/docs/articles_en/get-started/configurations/genai-dependencies.rst b/docs/articles_en/get-started/configurations/genai-dependencies.rst index 59d29ef3108da0..4486890c3a40b8 100644 --- a/docs/articles_en/get-started/configurations/genai-dependencies.rst +++ b/docs/articles_en/get-started/configurations/genai-dependencies.rst @@ -4,12 +4,12 @@ OpenVINO™ GenAI Dependencies OpenVINO™ GenAI depends on both `OpenVINO `__ and `OpenVINO Tokenizers `__. During OpenVINO™ GenAI installation from PyPi, the same versions of OpenVINO and OpenVINO Tokenizers -are used (e.g. ``openvino==2024.5.0`` and ``openvino-tokenizers==2024.5.0.0`` are installed for -``openvino-genai==2024.5.0``). +are used (e.g. ``openvino==2024.6.0`` and ``openvino-tokenizers==2024.6.0.0`` are installed for +``openvino-genai==2024.6.0``). -Trying to update any of the dependency packages might result in a version incompatiblibty +Trying to update any of the dependency packages might result in a version incompatibility due to different Application Binary Interfaces (ABIs), which will result in errors while running -OpenVINO GenAI. Having package version in the ``...`` format, allows +OpenVINO GenAI. Having package version in the ``...`` format, enables changing the ```` portion of the full version to ensure ABI compatibility. Changing ````, ```` or ```` part of the version may break ABI. 
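As a quick illustration of the version alignment described above, the following is a minimal sketch (not part of the OpenVINO tooling) that checks whether the installed ``openvino``, ``openvino-tokenizers``, and ``openvino-genai`` packages share the same ``<major>.<minor>.<patch>`` prefix. The package names and the ``check_genai_dependencies`` helper are illustrative assumptions:

.. code-block:: python

   from importlib.metadata import PackageNotFoundError, version


   def base_version(package: str) -> str:
       # Keep only the first three components, e.g. "2024.6.0.0" -> "2024.6.0".
       return ".".join(version(package).split(".")[:3])


   def check_genai_dependencies() -> None:
       # Package names are assumed to match the PyPI distributions mentioned above.
       packages = ["openvino", "openvino-tokenizers", "openvino-genai"]
       try:
           bases = {pkg: base_version(pkg) for pkg in packages}
       except PackageNotFoundError as err:
           print(f"Package is not installed: {err}")
           return
       if len(set(bases.values())) == 1:
           print(f"ABI-compatible set: {bases}")
       else:
           print(f"Version mismatch, reinstall matching packages: {bases}")


   check_genai_dependencies()

If the prefixes differ, reinstalling ``openvino-genai`` from PyPI restores a compatible set, since it pulls the matching OpenVINO and OpenVINO Tokenizers versions.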
diff --git a/docs/articles_en/get-started/install-openvino.rst b/docs/articles_en/get-started/install-openvino.rst index 48ea0a434c5388..7603adf37b7e89 100644 --- a/docs/articles_en/get-started/install-openvino.rst +++ b/docs/articles_en/get-started/install-openvino.rst @@ -1,4 +1,4 @@ -Install OpenVINO™ 2024.5 +Install OpenVINO™ 2024.6 ========================== @@ -21,12 +21,12 @@ Install OpenVINO™ 2024.5 - + -OpenVINO 2024.5, described here, is not a Long-Term-Support version! +OpenVINO 2024.6, described here, is not a Long-Term-Support version! All currently supported versions are: -* 2024.5 (development) +* 2024.6 (development) * 2023.3 (LTS) @@ -38,20 +38,7 @@ All currently supported versions are: :doc:`Install OpenVINO GenAI Flavor <../learn-openvino/llm_inference_guide/genai-guide>` and :doc:`Run LLMs with OpenVINO GenAI Flavor <../learn-openvino/llm_inference_guide/genai-guide>`. -.. dropdown:: Deprecation of OpenVINO™ Development Tools Package - - The OpenVINO™ Development Tools package has been deprecated and removed from the default - installation options. For new projects, the OpenVINO runtime package now includes - all necessary components. - - The OpenVINO Development Tools is still available for older versions of OpenVINO, - as well as the current one, from the GitHub repository and PyPI. :doc:`Learn more <../documentation/legacy-features/install-dev-tools>`. - .. dropdown:: Building OpenVINO from Source OpenVINO Toolkit source files are available on GitHub as open source. If you want to build your own version of OpenVINO for your platform, follow the `OpenVINO Build Instructions `__. - - - - diff --git a/docs/articles_en/get-started/install-openvino/install-openvino-archive-linux.rst b/docs/articles_en/get-started/install-openvino/install-openvino-archive-linux.rst index 20965f2f22d095..77b23ca9b2d6a4 100644 --- a/docs/articles_en/get-started/install-openvino/install-openvino-archive-linux.rst +++ b/docs/articles_en/get-started/install-openvino/install-openvino-archive-linux.rst @@ -277,4 +277,4 @@ Additional Resources * Converting models for use with OpenVINO™: :doc:`Convert a Model <../../../openvino-workflow/model-preparation>` * Writing your own OpenVINO™ applications: :doc:`OpenVINO™ Runtime User Guide <../../../openvino-workflow/running-inference>` * Sample applications: :doc:`OpenVINO™ Toolkit Samples Overview <../../../learn-openvino/openvino-samples>` -* Pre-trained deep learning models: :doc:`Overview of OpenVINO™ Toolkit Pre-Trained Models <../../../documentation/legacy-features/model-zoo>` +* Pre-trained deep learning models on `Hugging Face `__. 
diff --git a/docs/articles_en/get-started/install-openvino/install-openvino-archive-macos.rst b/docs/articles_en/get-started/install-openvino/install-openvino-archive-macos.rst index e4bff378106122..b02d7f4f1984fc 100644 --- a/docs/articles_en/get-started/install-openvino/install-openvino-archive-macos.rst +++ b/docs/articles_en/get-started/install-openvino/install-openvino-archive-macos.rst @@ -190,4 +190,4 @@ Additional Resources * :doc:`Convert models for use with OpenVINO™ <../../../openvino-workflow/model-preparation/convert-model-to-ir>` * :doc:`Write your own OpenVINO™ applications <../../../openvino-workflow/running-inference/integrate-openvino-with-your-application>` * Sample applications: :doc:`OpenVINO™ Toolkit Samples Overview <../../../learn-openvino/openvino-samples>` -* Pre-trained deep learning models: :doc:`Overview of OpenVINO™ Toolkit Pre-Trained Models <../../../documentation/legacy-features/model-zoo>` +* Pre-trained deep learning models on `Hugging Face `__ diff --git a/docs/articles_en/get-started/install-openvino/install-openvino-archive-windows.rst b/docs/articles_en/get-started/install-openvino/install-openvino-archive-windows.rst index 9db280ec81472e..bdcd89d6b195b1 100644 --- a/docs/articles_en/get-started/install-openvino/install-openvino-archive-windows.rst +++ b/docs/articles_en/get-started/install-openvino/install-openvino-archive-windows.rst @@ -213,4 +213,4 @@ Additional Resources * :doc:`Convert models for use with OpenVINO™ <../../../openvino-workflow/model-preparation/convert-model-to-ir>` * :doc:`Write your own OpenVINO™ applications <../../../openvino-workflow/running-inference/integrate-openvino-with-your-application>` * Sample applications: :doc:`OpenVINO™ Toolkit Samples Overview <../../../learn-openvino/openvino-samples>` -* Pre-trained deep learning models: :doc:`Overview of OpenVINO™ Toolkit Pre-Trained Models <../../../documentation/legacy-features/model-zoo>` +* Pre-trained deep learning models on `Hugging Face `__. diff --git a/docs/articles_en/get-started/install-openvino/install-openvino-brew.rst b/docs/articles_en/get-started/install-openvino/install-openvino-brew.rst index b1710f3bb358e8..612a873e4ff5ed 100644 --- a/docs/articles_en/get-started/install-openvino/install-openvino-brew.rst +++ b/docs/articles_en/get-started/install-openvino/install-openvino-brew.rst @@ -59,14 +59,7 @@ Now that you've installed OpenVINO Runtime, you can try the following things: * Learn more about :doc:`OpenVINO Workflow <../../../openvino-workflow>`. * To prepare your models for working with OpenVINO, see :doc:`Model Preparation <../../../openvino-workflow/model-preparation>`. -* See pre-trained deep learning models in our - :doc:`Open Model Zoo <../../../documentation/legacy-features/model-zoo>`. - - .. important:: - - Due to the deprecation of Open Model Zoo, models in the OpenVINO IR format are now - published on `Hugging Face `__. - +* See pre-trained deep learning models on `Hugging Face `__. * Learn more about :doc:`Inference with OpenVINO Runtime <../../../openvino-workflow/running-inference>`. * See sample applications in :doc:`OpenVINO toolkit Samples Overview <../../../learn-openvino/openvino-samples>`. * Check out the OpenVINO `product home page `__. 
diff --git a/docs/articles_en/get-started/install-openvino/install-openvino-conda.rst b/docs/articles_en/get-started/install-openvino/install-openvino-conda.rst index d1392d3f46a513..df3c8c7e0dc53b 100644 --- a/docs/articles_en/get-started/install-openvino/install-openvino-conda.rst +++ b/docs/articles_en/get-started/install-openvino/install-openvino-conda.rst @@ -108,7 +108,6 @@ components by using: - ``libopenvino-pytorch-frontend`` - ``libopenvino-tensorflow-frontend`` - ``libopenvino-tensorflow-lite-frontend`` -- ``libopenvino-dev`` - ``libopenvino-python`` - ``libopenvino-arm-cpu-plugin`` diff --git a/docs/articles_en/get-started/install-openvino/install-openvino-vcpkg.rst b/docs/articles_en/get-started/install-openvino/install-openvino-vcpkg.rst index af9fe85528ca5d..6d739b350f5b38 100644 --- a/docs/articles_en/get-started/install-openvino/install-openvino-vcpkg.rst +++ b/docs/articles_en/get-started/install-openvino/install-openvino-vcpkg.rst @@ -81,13 +81,7 @@ Now that you've installed OpenVINO Runtime, you can try the following things: * Learn more about :doc:`OpenVINO Workflow <../../../openvino-workflow>`. * To prepare your models for working with OpenVINO, see :doc:`Model Preparation <../../../openvino-workflow/model-preparation>`. -* See pre-trained deep learning models in our :doc:`Open Model Zoo <../../../documentation/legacy-features/model-zoo>`. - - .. important:: - - Due to the deprecation of Open Model Zoo, models in the OpenVINO IR format are now - published on `Hugging Face `__. - +* See pre-trained deep learning models on `Hugging Face `__. * Learn more about :doc:`Inference with OpenVINO Runtime <../../../openvino-workflow/running-inference>`. * See sample applications in :doc:`OpenVINO toolkit Samples Overview <../../../learn-openvino/openvino-samples>`. * Check out the OpenVINO `product home page `__ . diff --git a/docs/articles_en/get-started/install-openvino/install-openvino-yum.rst b/docs/articles_en/get-started/install-openvino/install-openvino-yum.rst index 970bb47a095d5b..fc413f194a1e63 100644 --- a/docs/articles_en/get-started/install-openvino/install-openvino-yum.rst +++ b/docs/articles_en/get-started/install-openvino/install-openvino-yum.rst @@ -190,13 +190,7 @@ You can also try the following things: * Learn more about :doc:`OpenVINO Workflow <../../../openvino-workflow>`. * To prepare your models for working with OpenVINO, see :doc:`Model Preparation <../../../openvino-workflow/model-preparation>`. -* See pre-trained deep learning models in our :doc:`Open Model Zoo <../../../documentation/legacy-features/model-zoo>`. - - .. important:: - - Due to the deprecation of Open Model Zoo, models in the OpenVINO IR format are now - published on `Hugging Face `__. - +* See pre-trained deep learning models on `Hugging Face `__. * Learn more about :doc:`Inference with OpenVINO Runtime <../../../openvino-workflow/running-inference>`. * See sample applications in :doc:`OpenVINO toolkit Samples Overview <../../../learn-openvino/openvino-samples>`. * Take a glance at the OpenVINO `product home page `__ . 
diff --git a/docs/articles_en/get-started/install-openvino/install-openvino-zypper.rst b/docs/articles_en/get-started/install-openvino/install-openvino-zypper.rst index 127b26cac0590f..bc589dfdb48a8b 100644 --- a/docs/articles_en/get-started/install-openvino/install-openvino-zypper.rst +++ b/docs/articles_en/get-started/install-openvino/install-openvino-zypper.rst @@ -142,13 +142,7 @@ You can also try the following things: * Learn more about :doc:`OpenVINO Workflow <../../../openvino-workflow>`. * To prepare your models for working with OpenVINO, see :doc:`Model Preparation <../../../openvino-workflow/model-preparation>`. -* See pre-trained deep learning models in our :doc:`Open Model Zoo <../../../documentation/legacy-features/model-zoo>`. - - .. important:: - - Due to the deprecation of Open Model Zoo, models in the OpenVINO IR format are now - published on `Hugging Face `__. - +* See pre-trained deep learning models on `Hugging Face `__. * Learn more about :doc:`Inference with OpenVINO Runtime <../../../openvino-workflow/running-inference>`. * See sample applications in :doc:`OpenVINO toolkit Samples Overview <../../../learn-openvino/openvino-samples>`. * Take a glance at the OpenVINO `product home page `__ . diff --git a/docs/articles_en/learn-openvino.rst b/docs/articles_en/learn-openvino.rst index 4fca64051003a7..98797c9c67c126 100644 --- a/docs/articles_en/learn-openvino.rst +++ b/docs/articles_en/learn-openvino.rst @@ -14,7 +14,7 @@ Learn OpenVINO Interactive Tutorials (Python) Sample Applications (Python & C++) - Large Language Model Inference Guide + Generative AI workflow @@ -29,5 +29,5 @@ as well as an experienced user. | :doc:`OpenVINO Samples ` | The OpenVINO samples (Python and C++) are simple console applications that show how to use specific OpenVINO API features. They can assist you in executing tasks such as loading a model, running inference, querying particular device capabilities, etc. -| :doc:`Large Language Models in OpenVINO ` +| :doc:`Generative AI workflow ` | Detailed information on how OpenVINO accelerates Generative AI use cases and what models it supports. This tutorial provides instructions for running Generative AI models using Hugging Face Optimum Intel and Native OpenVINO APIs. diff --git a/docs/articles_en/learn-openvino/interactive-tutorials-python/notebooks-installation.rst b/docs/articles_en/learn-openvino/interactive-tutorials-python/notebooks-installation.rst index eb02caa06852fd..ba7859a0c9f5d1 100644 --- a/docs/articles_en/learn-openvino/interactive-tutorials-python/notebooks-installation.rst +++ b/docs/articles_en/learn-openvino/interactive-tutorials-python/notebooks-installation.rst @@ -312,8 +312,6 @@ Installing notebooks 1. **Create a Virtual Environment** - If you already have installed *openvino-dev*, you may skip this step and proceed with the next one. - .. code-block:: sh python -m venv openvino_env @@ -364,8 +362,6 @@ Installing notebooks 1. **Create a Virtual Environment** - If you already have installed *openvino-dev*, you may skip this step and proceed with the next one. - .. code-block:: sh python3 -m venv openvino_env @@ -415,8 +411,6 @@ Installing notebooks 1. **Create a Virtual Environment** - If you already have installed *openvino-dev*, you may skip this step and proceed with the next one. - .. 
code-block:: sh python3 -m venv openvino_env diff --git a/docs/articles_en/learn-openvino/llm_inference_guide.rst b/docs/articles_en/learn-openvino/llm_inference_guide.rst index 36c001c015f744..372c3b6d652bfc 100644 --- a/docs/articles_en/learn-openvino/llm_inference_guide.rst +++ b/docs/articles_en/learn-openvino/llm_inference_guide.rst @@ -1,140 +1,94 @@ -Large Language Model Inference Guide +Generative AI workflow ======================================== .. meta:: - :description: Explore learning materials, including interactive - Python tutorials and sample console applications that explain - how to use OpenVINO features. + :description: learn how to use OpenVINO to run generative AI models. .. toctree:: :maxdepth: 1 :hidden: - Run LLMs with Optimum Intel - Run LLMs on OpenVINO GenAI Flavor - Run LLMs on Base OpenVINO + Generative Model Preparation + Inference with OpenVINO GenAI + Inference with Optimum Intel OpenVINO Tokenizers -Large Language Models (LLMs) like GPT are transformative deep learning networks capable of a -broad range of natural language tasks, from text generation to language translation. OpenVINO -optimizes the deployment of these models, enhancing their performance and integration into -various applications. This guide shows how to use LLMs with OpenVINO, from model loading and -conversion to advanced use cases. - -The advantages of using OpenVINO for LLM deployment: - -* **OpenVINO offers optimized LLM inference**: - provides a full C/C++ API, leading to faster operation than Python-based runtimes; includes a - Python API for rapid development, with the option for further optimization in C++. -* **Compatible with diverse hardware**: - supports CPUs, GPUs, and neural accelerators across ARM and x86/x64 architectures, integrated - Intel® Processor Graphics, discrete Intel® Arc™ A-Series Graphics, and discrete Intel® Data - Center GPU Flex Series; features automated optimization to maximize performance on target - hardware. -* **Requires fewer dependencies**: - than frameworks like Hugging Face and PyTorch, resulting in a smaller binary size and reduced - memory footprint, making deployments easier and updates more manageable. -* **Provides compression and precision management techniques**: - such as 8-bit and 4-bit weight compression, including embedding layers, and storage format - reduction. This includes fp16 precision for non-compressed models and int8/int4 for compressed - models, like GPTQ models from `Hugging Face `__. -* **Supports a wide range of deep learning models and architectures**: - including text, image, and audio generative models like Llama 2, MPT, OPT, Stable Diffusion, - Stable Diffusion XL. This enables the development of multimodal applications, allowing for - write-once, deploy-anywhere capabilities. -* **Enhances inference capabilities**: - fused inference primitives such as Scaled Dot Product Attention, Rotary Positional Embedding, - Group Query Attention, and Mixture of Experts. It also offers advanced features like in-place - KV-cache, dynamic quantization, KV-cache quantization and encapsulation, dynamic beam size - configuration, and speculative sampling. -* **Provides stateful model optimization**: - models from the Hugging Face Transformers are converted into a stateful form, optimizing - inference performance and memory usage in long-running text generation tasks by managing past - KV-cache tensors more efficiently internally. This feature is automatically activated for many - supported models, while unsupported ones remain stateless. 
Learn more about the - :doc:`Stateful models and State API <../openvino-workflow/running-inference/stateful-models>`. - -OpenVINO offers three main paths for Generative AI use cases: - -* **Hugging Face**: use OpenVINO as a backend for Hugging Face frameworks (transformers, - diffusers) through the `Optimum Intel `__ - extension. -* **OpenVINO GenAI Flavor**: use OpenVINO GenAI APIs (Python and C++). -* **Base OpenVINO**: use OpenVINO native APIs (Python and C++) with - `custom pipeline code `__. - -In both cases, the OpenVINO runtime is used for inference, and OpenVINO tools are used for -optimization. The main differences are in footprint size, ease of use, and customizability. - -The Hugging Face API is easy to learn, provides a simple interface and hides the complexity of -model initialization and text generation for a better developer experience. However, it has more -dependencies, less customization, and cannot be ported to C/C++. - -The OpenVINO GenAI Flavor reduces the complexity of LLMs implementation by -automatically managing essential tasks like the text generation loop, tokenization, -and scheduling. The Native OpenVINO API provides a more hands-on experience, -requiring manual setup of these functions. Both methods are designed to minimize dependencies -and the overall application footprint and enable the use of generative models in C++ applications. - -It is recommended to start with Hugging Face frameworks to experiment with different models and -scenarios. Then the model can be used with OpenVINO APIs if it needs to be optimized -further. Optimum Intel provides interfaces that enable model optimization (weight compression) -using `Neural Network Compression Framework (NNCF) `__, -and export models to the OpenVINO model format for use in native API applications. - -Proceed to run LLMs with: -* :doc:`Hugging Face and Optimum Intel <./llm_inference_guide/llm-inference-hf>` + +Generative AI is a specific area of Deep Learning models used for producing new and “original” +data, based on input in the form of image, sound, or natural language text. Due to their +complexity and size, generative AI pipelines are more difficult to deploy and run efficiently. +OpenVINO™ simplifies the process and ensures high-performance integrations, with the following +options: + +.. tab-set:: + + .. tab-item:: OpenVINO™ GenAI + + | - Suggested for production deployment for the supported use cases. + | - Smaller footprint and fewer dependencies. + | - More optimization and customization options. + | - Available in both Python and C++. + | - A limited set of supported use cases. + + :doc:`Install the OpenVINO GenAI package <../get-started/install-openvino/install-openvino-genai>` + and run generative models out of the box. With custom + API and tokenizers, among other components, it manages the essential tasks such as the + text generation loop, tokenization, and scheduling, offering ease of use and high + performance. + + `Check out the OpenVINO GenAI Quick-start Guide [PDF] `__ + + .. tab-item:: Hugging Face integration + + | - Suggested for prototyping and, if the use case is not covered by OpenVINO GenAI, production. + | - Bigger footprint and more dependencies. + | - Limited customization due to Hugging Face dependency. + | - Not usable for C++ applications. + | - A very wide range of supported models. + + Using Optimum Intel is a great way to experiment with different models and scenarios, + thanks to a simple interface for the popular API and infrastructure offered by Hugging Face. 
+ It also enables weight compression with + `Neural Network Compression Framework (NNCF) `__, + as well as conversion on the fly. For integration with the final product it may offer + lower performance, though. + + + +The advantages of using OpenVINO for generative model deployment: + +| **Fewer dependencies and smaller footprint** +| Less bloated than frameworks such as Hugging Face and PyTorch, with a smaller binary size and reduced + memory footprint, makes deployments easier and updates more manageable. + +| **Compression and precision management** +| Techniques such as 8-bit and 4-bit weight compression, including embedding layers, and storage + format reduction. This includes fp16 precision for non-compressed models and int8/int4 for + compressed models, like GPTQ models from `Hugging Face `__. + +| **Enhanced inference capabilities** +| Advanced features like in-place KV-cache, dynamic quantization, KV-cache quantization and + encapsulation, dynamic beam size configuration, and speculative sampling, and more are + available. + +| **Stateful model optimization** +| Models from the Hugging Face Transformers are converted into a stateful form, optimizing + inference performance and memory usage in long-running text generation tasks by managing past + KV-cache tensors more efficiently internally. This feature is automatically activated for + many supported models, while unsupported ones remain stateless. Learn more about the + :doc:`Stateful models and State API <../openvino-workflow/running-inference/stateful-models>`. + +| **Optimized LLM inference** +| Includes a Python API for rapid development and C++ for further optimization, offering + better performance than Python-based runtimes. + + +Proceed to guides on: + * :doc:`OpenVINO GenAI Flavor <./llm_inference_guide/genai-guide>` -* :doc:`Native OpenVINO API <./llm_inference_guide/llm-inference-native-ov>` - -The table below summarizes the differences between Hugging Face and the native OpenVINO API -approaches. - -.. dropdown:: Differences between Hugging Face and the native OpenVINO API - - .. list-table:: - :widths: 20 25 55 - :header-rows: 1 - - * - - - Hugging Face through OpenVINO - - OpenVINO Native API - * - Model support - - Supports transformer-based models such as LLMs - - Supports all model architectures from most frameworks - * - APIs - - Python (Hugging Face API) - - Python, C++ (OpenVINO API) - * - Model Format - - Source Framework / OpenVINO - - Source Framework / OpenVINO - * - Inference code - - Hugging Face based - - Custom inference pipelines - * - Additional dependencies - - Many Hugging Face dependencies - - Lightweight (e.g. numpy, etc.) 
- * - Application footprint - - Large - - Small - * - Pre/post-processing and glue code - - Provided through high-level Hugging Face APIs - - Must be custom implemented (see OpenVINO samples and notebooks) - * - Performance - - Good, but less efficient compared to native APIs - - Inherent speed advantage with C++, but requires hands-on optimization - * - Flexibility - - Constrained to Hugging Face API - - High flexibility with Python and C++; allows custom coding - * - Learning Curve and Effort - - Lower learning curve; quick to integrate - - Higher learning curve; requires more effort in integration - * - Ideal Use Case - - Ideal for quick prototyping and Python-centric projects - - Best suited for high-performance, resource-optimized production environments - * - Model Serving - - Paid service, based on CPU/GPU usage with Hugging Face - - Free code solution, run script for own server; costs may incur for cloud services - like AWS but generally cheaper than Hugging Face rates +* :doc:`Hugging Face and Optimum Intel <./llm_inference_guide/llm-inference-hf>` +* `Generative AI with Base OpenVINO `__ + + diff --git a/docs/articles_en/learn-openvino/llm_inference_guide/genai-guide-npu.rst b/docs/articles_en/learn-openvino/llm_inference_guide/genai-guide-npu.rst index 41e5cbb5733c58..60253779b0f3dc 100644 --- a/docs/articles_en/learn-openvino/llm_inference_guide/genai-guide-npu.rst +++ b/docs/articles_en/learn-openvino/llm_inference_guide/genai-guide-npu.rst @@ -1,4 +1,4 @@ -Run LLMs with OpenVINO GenAI Flavor on NPU +Inference with OpenVINO GenAI ========================================== .. meta:: @@ -90,6 +90,7 @@ which do not require specifying quantization parameters: | Below is a list of such models: * meta-llama/Meta-Llama-3-8B-Instruct +* meta-llama/Llama-3.1-8B * microsoft/Phi-3-mini-4k-instruct * Qwen/Qwen2-7B * mistralai/Mistral-7B-Instruct-v0.2 @@ -133,6 +134,7 @@ you need to add ``do_sample=False`` **to the** ``generate()`` **method:** int main(int argc, char* argv[]) { std::string model_path = "TinyLlama"; + ov::genai::LLMPipeline pipe(models_path, "NPU"); ov::genai::GenerationConfig config; config.do_sample=false; config.max_new_tokens=100; diff --git a/docs/articles_en/learn-openvino/llm_inference_guide/genai-guide.rst b/docs/articles_en/learn-openvino/llm_inference_guide/genai-guide.rst index ebd4667d544616..dbc5d3c4416cd4 100644 --- a/docs/articles_en/learn-openvino/llm_inference_guide/genai-guide.rst +++ b/docs/articles_en/learn-openvino/llm_inference_guide/genai-guide.rst @@ -1,4 +1,4 @@ -Run LLM Inference on OpenVINO with the GenAI Flavor +Inference with OpenVINO GenAI =============================================================================================== .. meta:: @@ -9,39 +9,328 @@ Run LLM Inference on OpenVINO with the GenAI Flavor :hidden: NPU inference of LLMs - genai-guide/genai-use-cases -This guide will show you how to integrate the OpenVINO GenAI flavor into your application, covering -loading a model and passing the input context to receive generated text. Note that the vanilla flavor of OpenVINO -will not work with these instructions, make sure to -:doc:`install OpenVINO GenAI <../../get-started/install-openvino/install-openvino-genai>`. +OpenVINO™ GenAI is a library of pipelines and methods, extending the OpenVINO runtime to work +with generative AI models more efficiently. This article provides reference code and guidance +on its usage. 
Note that the base OpenVINO version will not work with these instructions, +make sure to :doc:`install OpenVINO with GenAI <../../get-started/install-openvino/install-openvino-genai>`. -.. note:: +.. image:: ../../assets/images/genai_main_diagram.svg + :align: center + :alt: OpenVINO GenAI workflow diagram - The examples use the CPU as the target device, however, the GPU is also supported. - Note that for the LLM pipeline, the GPU is used only for inference, while token selection, tokenization, and - detokenization remain on the CPU, for efficiency. Tokenizers are represented as a separate model and also run - on the CPU. -1. Export an LLM model via Hugging Face Optimum-Intel. A chat-tuned TinyLlama model is used in this example: +| Here is sample code for several Generative AI use case scenarios. Note that these are very basic + examples and may need adjustments for your specific needs, like changing the inference device. +| For a more extensive instruction and additional options, see the + `step-by-step chat-bot guide <#chat-bot-use-case-step-by-step>`__ below. - .. code-block:: python +.. dropdown:: Text-to-Image Generation - optimum-cli export openvino --model "TinyLlama/TinyLlama-1.1B-Chat-v1.0" --weight-format fp16 --trust-remote-code "TinyLlama-1.1B-Chat-v1.0" + OpenVINO GenAI introduces ``openvino_genai.Text2ImagePipeline`` for inference of text-to-image + models such as: as Stable Diffusion 1.5, 2.1, XL, LCM, Flex, and more. + See the following usage example for reference. - *Optional*. Optimize the model: + .. tab-set:: + + .. tab-item:: Python + :sync: python + + .. tab-set:: + + .. tab-item:: text2image.py + :name: text2image + + .. code-block:: python + + import argparse + + import openvino_genai + from PIL import Image + + + def main(): + parser = argparse.ArgumentParser() + parser.add_argument('model_dir') + parser.add_argument('prompt') + args = parser.parse_args() + + device = 'CPU' # GPU can be used as well + pipe = openvino_genai.Text2ImagePipeline(args.model_dir, device) + + image_tensor = pipe.generate( + args.prompt, + width=512, + height=512, + num_inference_steps=20, + num_images_per_prompt=1) + + image = Image.fromarray(image_tensor.data[0]) + image.save("image.bmp") + + .. tab-item:: lora_text2image.py + :name: loratext2imagepy - The model is an optimized OpenVINO IR with FP16 precision. For enhanced LLM performance, - it is recommended to use lower precision for model weights, such as INT4, and to compress weights - using NNCF during model export directly: + .. code-block:: python - .. 
code-block:: python + import openvino as ov + import openvino_genai + + def image_write(path: str, image_tensor: ov.Tensor): + from PIL import Image + image = Image.fromarray(image_tensor.data[0]) + image.save(path) + + + def main(): + parser = argparse.ArgumentParser() + parser.add_argument('models_path') + parser.add_argument('prompt') + args, adapters = parser.parse_known_args() + + prompt = args.prompt + + device = "CPU" # GPU, NPU can be used as well + adapter_config = openvino_genai.AdapterConfig() + + # Multiple LoRA adapters applied simultaneously are supported, parse them all and corresponding alphas from cmd parameters: + for i in range(int(len(adapters) / 2)): + adapter = openvino_genai.Adapter(adapters[2 * i]) + alpha = float(adapters[2 * i + 1]) + adapter_config.add(adapter, alpha) + + # LoRA adapters passed to the constructor will be activated by default in next generates + pipe = openvino_genai.Text2ImagePipeline(args.models_path, device, adapters=adapter_config) + + print("Generating image with LoRA adapters applied, resulting image will be in lora.bmp") + image = pipe.generate(prompt, + width=512, + height=896, + num_inference_steps=20, + rng_seed=42) + + image_write("lora.bmp", image) + print("Generating image without LoRA adapters applied, resulting image will be in baseline.bmp") + image = pipe.generate(prompt, + # passing adapters in generate overrides adapters set in the constructor; openvino_genai.AdapterConfig() means no adapters + adapters=openvino_genai.AdapterConfig(), + width=512, + height=896, + num_inference_steps=20, + rng_seed=42) + image_write("baseline.bmp", image) + + + For more information, refer to the + `Python sample `__ + + .. tab-item:: C++ + :sync: cpp - optimum-cli export openvino --model "TinyLlama/TinyLlama-1.1B-Chat-v1.0" --weight-format int4 --trust-remote-code "TinyLlama-1.1B-Chat-v1.0" + .. tab-set:: + .. tab-item:: text2image.cpp + :name: text2imagecpp + + .. code-block:: cpp -2. Perform generation using the new GenAI API: + #include "openvino/genai/image_generation/text2image_pipeline.hpp" + + #include "imwrite.hpp" + + int32_t main(int32_t argc, char* argv[]) try { + OPENVINO_ASSERT(argc == 3, "Usage: ", argv[0], " ''"); + + const std::string models_path = argv[1], prompt = argv[2]; + const std::string device = "CPU"; // GPU can be used as well + + ov::genai::Text2ImagePipeline pipe(models_path, device); + ov::Tensor image = pipe.generate(prompt, + ov::genai::width(512), + ov::genai::height(512), + ov::genai::num_inference_steps(20), + ov::genai::num_images_per_prompt(1)); + + // writes `num_images_per_prompt` images by pattern name + imwrite("image_%d.bmp", image, true); + + return EXIT_SUCCESS; + } catch (const std::exception& error) { + try { + std::cerr << error.what() << '\n'; + } catch (const std::ios_base::failure&) {} + return EXIT_FAILURE; + } catch (...) { + try { + std::cerr << "Non-exception object thrown\n"; + } catch (const std::ios_base::failure&) {} + return EXIT_FAILURE; + } + + .. tab-item:: lora_text2image.cpp + :name: loratext2imagecpp + + .. 
code-block:: cpp + + #include "openvino/genai/image_generation/text2image_pipeline.hpp" + + #include "imwrite.hpp" + + int32_t main(int32_t argc, char* argv[]) try { + OPENVINO_ASSERT(argc >= 3 && (argc - 3) % 2 == 0, "Usage: ", argv[0], " '' [ ...]]"); + + const std::string models_path = argv[1], prompt = argv[2]; + const std::string device = "CPU"; // GPU, NPU can be used as well + + ov::genai::AdapterConfig adapter_config; + // Multiple LoRA adapters applied simultaneously are supported, parse them all and corresponding alphas from cmd parameters: + for(size_t i = 0; i < (argc - 3)/2; ++i) { + ov::genai::Adapter adapter(argv[3 + 2*i]); + float alpha = std::atof(argv[3 + 2*i + 1]); + adapter_config.add(adapter, alpha); + } + + // LoRA adapters passed to the constructor will be activated by default in next generates + ov::genai::Text2ImagePipeline pipe(models_path, device, ov::genai::adapters(adapter_config)); + + std::cout << "Generating image with LoRA adapters applied, resulting image will be in lora.bmp\n"; + ov::Tensor image = pipe.generate(prompt, + ov::genai::width(512), + ov::genai::height(896), + ov::genai::num_inference_steps(20), + ov::genai::rng_seed(42)); + imwrite("lora.bmp", image, true); + + std::cout << "Generating image without LoRA adapters applied, resulting image will be in baseline.bmp\n"; + image = pipe.generate(prompt, + ov::genai::adapters(), // passing adapters in generate overrides adapters set in the constructor; adapters() means no adapters + ov::genai::width(512), + ov::genai::height(896), + ov::genai::num_inference_steps(20), + ov::genai::rng_seed(42)); + imwrite("baseline.bmp", image, true); + + return EXIT_SUCCESS; + } catch (const std::exception& error) { + try { + std::cerr << error.what() << '\n'; + } catch (const std::ios_base::failure&) {} + return EXIT_FAILURE; + } catch (...) { + try { + std::cerr << "Non-exception object thrown\n"; + } catch (const std::ios_base::failure&) {} + return EXIT_FAILURE; + } + + For more information, refer to the + `C++ sample `__ + + +.. dropdown:: Speech Recognition + + The application performs inference on speech recognition Whisper Models. The samples include + the ``WhisperPipeline`` class and use audio files in WAV format at a sampling rate of 16 kHz + as input. + + .. tab-set:: + + .. tab-item:: Python + :sync: cpp + + .. code-block:: python + + import openvino_genai + import librosa + + + def read_wav(filepath): + raw_speech, samplerate = librosa.load(filepath, sr=16000) + return raw_speech.tolist() + + + def infer(model_dir: str, wav_file_path: str): + device = "CPU" # GPU or NPU can be used as well. + pipe = openvino_genai.WhisperPipeline(model_dir, device) + + # The pipeline expects normalized audio with a sampling rate of 16kHz. + raw_speech = read_wav(wav_file_path) + result = pipe.generate( + raw_speech, + max_new_tokens=100, + language="<|en|>", + task="transcribe", + return_timestamps=True, + ) + + print(result) + + for chunk in result.chunks: + print(f"timestamps: [{chunk.start_ts}, {chunk.end_ts}] text: {chunk.text}") + + + For more information, refer to the + `Python sample `__. + + .. tab-item:: C++ + :sync: cpp + + .. code-block:: cpp + + #include "audio_utils.hpp" + #include "openvino/genai/whisper_pipeline.hpp" + + int main(int argc, char* argv[]) try { + if (3 > argc) { + throw std::runtime_error(std::string{"Usage: "} + argv[0] + " \"\""); + } + + std::filesystem::path models_path = argv[1]; + std::string wav_file_path = argv[2]; + std::string device = "CPU"; // GPU or NPU can be used as well. 
+ + ov::genai::WhisperPipeline pipeline(models_path, device); + + ov::genai::WhisperGenerationConfig config(models_path / "generation_config.json"); + config.max_new_tokens = 100; + config.language = "<|en|>"; + config.task = "transcribe"; + config.return_timestamps = true; + + // The pipeline expects normalized audio with a sampling rate of 16kHz. + ov::genai::RawSpeechInput raw_speech = utils::audio::read_wav(wav_file_path); + auto result = pipeline.generate(raw_speech, config); + + std::cout << result << "\n"; + + for (auto& chunk : *result.chunks) { + std::cout << "timestamps: [" << chunk.start_ts << ", " << chunk.end_ts << "] text: " << chunk.text << "\n"; + } + + } catch (const std::exception& error) { + try { + std::cerr << error.what() << '\n'; + } catch (const std::ios_base::failure&) { + } + return EXIT_FAILURE; + } catch (...) { + try { + std::cerr << "Non-exception object thrown\n"; + } catch (const std::ios_base::failure&) { + } + return EXIT_FAILURE; + } + + For more information, refer to the + `C++ sample `__. + + +.. dropdown:: Using GenAI in Chat Scenario + + For chat scenarios where inputs and outputs represent a conversation, maintaining KVCache + across inputs may prove beneficial. The ``start_chat`` and ``finish_chat`` chat-specific + methods are used to mark a conversation session, as shown in the samples below: .. tab-set:: @@ -50,9 +339,35 @@ will not work with these instructions, make sure to .. code-block:: python - import openvino_genai as ov_genai - pipe = ov_genai.LLMPipeline(model_path, "CPU") - print(pipe.generate("The Sun is yellow because", max_new_tokens=100)) + import openvino_genai + + + def streamer(subword): + print(subword, end='', flush=True) + return False + + + def infer(model_dir: str): + device = 'CPU' # GPU can be used as well. + pipe = openvino_genai.LLMPipeline(model_dir, device) + + config = openvino_genai.GenerationConfig() + config.max_new_tokens = 100 + + pipe.start_chat() + while True: + try: + prompt = input('question:\n') + except EOFError: + break + pipe.generate(prompt, config, streamer) + print('\n----------') + pipe.finish_chat() + + + + For more information, refer to the + `Python sample `__. .. tab-item:: C++ :sync: cpp @@ -60,27 +375,251 @@ will not work with these instructions, make sure to .. code-block:: cpp #include "openvino/genai/llm_pipeline.hpp" - #include - int main(int argc, char* argv[]) { - std::string model_path = argv[1]; - ov::genai::LLMPipeline pipe(model_path, "CPU"); - std::cout << pipe.generate("The Sun is yellow because", ov::genai::max_new_tokens(100)); + int main(int argc, char* argv[]) try { + if (2 != argc) { + throw std::runtime_error(std::string{"Usage: "} + argv[0] + " "); + } + std::string prompt; + std::string models_path = argv[1]; + + std::string device = "CPU"; // GPU, NPU can be used as well + ov::genai::LLMPipeline pipe(models_path, device); + + ov::genai::GenerationConfig config; + config.max_new_tokens = 100; + std::function streamer = [](std::string word) { + std::cout << word << std::flush; + return false; + }; + + pipe.start_chat(); + std::cout << "question:\n"; + while (std::getline(std::cin, prompt)) { + pipe.generate(prompt, config, streamer); + std::cout << "\n----------\n" + "question:\n"; + } + pipe.finish_chat(); + } catch (const std::exception& error) { + try { + std::cerr << error.what() << '\n'; + } catch (const std::ios_base::failure&) {} + return EXIT_FAILURE; + } catch (...) 
{ + try { + std::cerr << "Non-exception object thrown\n"; + } catch (const std::ios_base::failure&) {} + return EXIT_FAILURE; } -The `LLMPipeline` is the main object used for decoding. You can construct it directly from the -folder with the converted model. It will automatically load the main model, tokenizer, detokenizer, -and the default generation configuration. -Once the model is exported from Hugging Face Optimum-Intel, it already contains all the information -necessary for execution, including the tokenizer/detokenizer and the generation config, ensuring that -its results match those generated by Hugging Face. + For more information, refer to the + `C++ sample `__ + + +.. dropdown:: Using GenAI with Vision Language Models + + OpenVINO GenAI introduces the ``openvino_genai.VLMPipeline`` pipeline for + inference of multimodal text-generation Vision Language Models (VLMs). + With a text prompt and an image as input, VLMPipeline can generate text using + models such as LLava or MiniCPM-V. See the chat scenario presented + in the samples below: + + .. tab-set:: + + .. tab-item:: Python + :sync: py + + .. code-block:: python + + import numpy as np + import openvino_genai + from PIL import Image + from openvino import Tensor + from pathlib import Path + + + def streamer(subword: str) -> bool: + print(subword, end='', flush=True) + + + def read_image(path: str) -> Tensor: + pic = Image.open(path).convert("RGB") + image_data = np.array(pic.getdata()).reshape(1, pic.size[1], pic.size[0], 3).astype(np.uint8) + return Tensor(image_data) + + + def read_images(path: str) -> list[Tensor]: + entry = Path(path) + if entry.is_dir(): + return [read_image(str(file)) for file in sorted(entry.iterdir())] + return [read_image(path)] + + + def infer(model_dir: str, image_dir: str): + rgbs = read_images(image_dir) + device = 'CPU' # GPU can be used as well. + enable_compile_cache = dict() + if "GPU" == device: + enable_compile_cache["CACHE_DIR"] = "vlm_cache" + pipe = openvino_genai.VLMPipeline(model_dir, device, **enable_compile_cache) + + config = openvino_genai.GenerationConfig() + config.max_new_tokens = 100 + + pipe.start_chat() + prompt = input('question:\n') + pipe.generate(prompt, images=rgbs, generation_config=config, streamer=streamer) + + while True: + try: + prompt = input("\n----------\n" + "question:\n") + except EOFError: + break + pipe.generate(prompt, generation_config=config, streamer=streamer) + pipe.finish_chat() + + + For more information, refer to the + `Python sample `__. + + .. tab-item:: C++ + :sync: cpp + + .. code-block:: cpp + + #include "load_image.hpp" + #include + #include + + bool print_subword(std::string&& subword) { + return !(std::cout << subword << std::flush); + } + + int main(int argc, char* argv[]) try { + if (3 != argc) { + throw std::runtime_error(std::string{"Usage "} + argv[0] + " "); + } + + std::vector rgbs = utils::load_images(argv[2]); + + std::string device = "CPU"; // GPU can be used as well. 
+ ov::AnyMap enable_compile_cache; + if ("GPU" == device) { + enable_compile_cache.insert({ov::cache_dir("vlm_cache")}); + } + ov::genai::VLMPipeline pipe(argv[1], device, enable_compile_cache); + + ov::genai::GenerationConfig generation_config; + generation_config.max_new_tokens = 100; + + std::string prompt; + + pipe.start_chat(); + std::cout << "question:\n"; + + std::getline(std::cin, prompt); + pipe.generate(prompt, + ov::genai::images(rgbs), + ov::genai::generation_config(generation_config), + ov::genai::streamer(print_subword)); + std::cout << "\n----------\n" + "question:\n"; + while (std::getline(std::cin, prompt)) { + pipe.generate(prompt, + ov::genai::generation_config(generation_config), + ov::genai::streamer(print_subword)); + std::cout << "\n----------\n" + "question:\n"; + } + pipe.finish_chat(); + } catch (const std::exception& error) { + try { + std::cerr << error.what() << '\n'; + } catch (const std::ios_base::failure&) {} + return EXIT_FAILURE; + } catch (...) { + try { + std::cerr << "Non-exception object thrown\n"; + } catch (const std::ios_base::failure&) {} + return EXIT_FAILURE; + } + + + For more information, refer to the + `C++ sample `__ + + +| + + +Chat-bot use case - step by step +############################################################################################### + +This example will show you how to create a chat-bot functionality, using the ``ov_genai.LLMPipeline`` +and a chat-tuned TinyLlama model. Apart from the basic implementation, it provides additional +optimization methods. + +Although CPU is used as inference device in the samples below, you may choose GPU instead. +Note that tasks such as token selection, tokenization, and detokenization are always handled +by CPU only. Tokenizers, represented as a separate model, are also run on CPU. + +Running the model ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ + +You start with exporting an LLM model via Hugging Face Optimum-Intel. Note that the precision +of ``int4`` is used, instead of the original ``fp16``, for better performance. The weight +compression is done by NNCF at the model export stage. The exported model contains all the +information necessary for execution, including the tokenizer/detokenizer and the generation +config, ensuring that its results match those generated by Hugging Face. + +The `LLMPipeline` is the main object to setup the model for text generation. You can provide the +converted model to this object, specify the device for inference, and provide additional +parameters. + + +.. tab-set:: + + .. tab-item:: Python + :sync: py + + .. code-block:: console + + optimum-cli export openvino --model "TinyLlama/TinyLlama-1.1B-Chat-v1.0" --weight-format int4 --trust-remote-code "TinyLlama-1.1B-Chat-v1.0" + + .. code-block:: python + + import openvino_genai as ov_genai + pipe = ov_genai.LLMPipeline(model_path, "CPU") + print(pipe.generate("The Sun is yellow because", max_new_tokens=100)) + + .. tab-item:: C++ + :sync: cpp + + .. code-block:: console + + optimum-cli export openvino --model "TinyLlama/TinyLlama-1.1B-Chat-v1.0" --weight-format int4 --trust-remote-code "TinyLlama-1.1B-Chat-v1.0" + + .. 
code-block:: cpp + + #include "openvino/genai/llm_pipeline.hpp" + #include + + int main(int argc, char* argv[]) { + std::string model_path = argv[1]; + ov::genai::LLMPipeline pipe(model_path, "CPU"); + std::cout << pipe.generate("The Sun is yellow because", ov::genai::max_new_tokens(100)); + } + + Streaming the Output -########################### ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ -For more interactive UIs during generation, streaming of model output tokens is supported. See the example -below, where a lambda function outputs words to the console immediately upon generation: +For more interactive UIs during generation, you can stream output tokens. In this example, a +lambda function outputs words to the console immediately upon generation: .. tab-set:: @@ -177,12 +716,10 @@ You can also create your custom streamer for more sophisticated processing: Optimizing Generation with Grouped Beam Search -####################################################### - -Leverage grouped beam search decoding and configure generation_config for better text generation -quality and efficient batch processing in GenAI applications. ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ -Specify generation_config to use grouped beam search: +For better text generation quality and more efficient batch processing, specify +``generation_config`` to leverage grouped beam search decoding. .. tab-set:: @@ -219,10 +756,123 @@ Specify generation_config to use grouped beam search: } +Efficient Text Generation via Speculative Decoding +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ + +Speculative decoding (or assisted-generation) enables faster token generation +when an additional smaller draft model is used alongside the main model. This reduces the +number of infer requests to the main model, increasing performance. + +The draft model predicts the next K tokens one by one in an autoregressive manner. The main +model validates these predictions and corrects them if necessary - in case of +a discrepancy, the main model prediction is used. Then, the draft model acquires this token and +runs prediction of the next K tokens, thus repeating the cycle. + + +.. tab-set:: + + .. tab-item:: Python + :sync: py + + .. code-block:: python + + import openvino_genai + import queue + import threading + + def streamer(subword): + print(subword, end='', flush=True) + return False + + def infer(model_dir: str, draft_model_dir: str, prompt: str): + main_device = 'CPU' # GPU can be used as well. + draft_device = 'CPU' + + scheduler_config = openvino_genai.SchedulerConfig() + scheduler_config.cache_size = 2 + + draft_model = openvino_genai.draft_model(draft_model_dir, draft_device) + + pipe = openvino_genai.LLMPipeline(model_dir, main_device, scheduler_config=scheduler_config, draft_model=draft_model) + + config = openvino_genai.GenerationConfig() + config.max_new_tokens = 100 + config.num_assistant_tokens = 5 + + pipe.generate("The Sun is yellow because", config, streamer) + + + For more information, refer to the + `Python sample `__. + + + .. tab-item:: C++ + :sync: cpp + + .. 
code-block:: cpp + + #include + + #include "openvino/genai/llm_pipeline.hpp" + + int main(int argc, char* argv[]) try { + if (4 != argc) { + throw std::runtime_error(std::string{"Usage: "} + argv[0] + " ''"); + } + + ov::genai::GenerationConfig config; + config.max_new_tokens = 100; + config.num_assistant_tokens = 5; + + std::string main_model_path = argv[1]; + std::string draft_model_path = argv[2]; + std::string prompt = argv[3]; + + std::string main_device = "CPU", draft_device = "CPU"; + + ov::genai::SchedulerConfig scheduler_config; + scheduler_config.cache_size = 5; + + ov::genai::LLMPipeline pipe( + main_model_path, + main_device, + ov::genai::draft_model(draft_model_path, draft_device), + ov::genai::scheduler_config(scheduler_config)); + + auto streamer = [](std::string subword) { + std::cout << subword << std::flush; + return false; + }; + + pipe.generate("The Sun is yellow because", config, streamer); + } catch (const std::exception& error) { + try { + std::cerr << error.what() << '\n'; + } catch (const std::ios_base::failure&) {} + return EXIT_FAILURE; + } catch (...) { + try { + std::cerr << "Non-exception object thrown\n"; + } catch (const std::ios_base::failure&) {} + return EXIT_FAILURE; + } + + + For more information, refer to the + `C++ sample `__ + + + + + + + + Comparing with Hugging Face Results ####################################### -Compare and analyze results with those generated by Hugging Face models. +You can compare the results of the above example with those generated by Hugging Face models by +running the following code: .. tab-set:: @@ -250,30 +900,34 @@ Compare and analyze results with those generated by Hugging Face models. assert hf_output == ov_output -GenAI API -####################################### -OpenVINO GenAI Flavor includes the following API: -* generation_config - defines a configuration class for text generation, enabling customization of the generation process such as the maximum length of the generated text, whether to ignore end-of-sentence tokens, and the specifics of the decoding strategy (greedy, beam search, or multinomial sampling). -* llm_pipeline - provides classes and utilities for text generation, including a pipeline for processing inputs, generating text, and managing outputs with configurable options. -* streamer_base - an abstract base class for creating streamers. -* tokenizer - the tokenizer class for text encoding and decoding. +GenAI API +####################################### + +The use case described here regards the following OpenVINO GenAI API classes: -* visibility - controls the visibility of the GenAI library. +* generation_config - defines a configuration class for text generation, + enabling customization of the generation process such as the maximum length of + the generated text, whether to ignore end-of-sentence tokens, and the specifics + of the decoding strategy (greedy, beam search, or multinomial sampling). +* llm_pipeline - provides classes and utilities for processing inputs, + text generation, and managing outputs with configurable options. +* streamer_base - an abstract base class for creating streamers. +* tokenizer - the tokenizer class for text encoding and decoding. -Learn more in the `GenAI API reference `__. +Learn more from the `GenAI API reference `__. 
Additional Resources #################### * `OpenVINO GenAI Repo `__ * `OpenVINO GenAI Samples `__ +* A Jupyter notebook demonstrating + `Visual-language assistant with MiniCPM-V2 and OpenVINO `__ * `OpenVINO Tokenizers `__ * `Neural Network Compression Framework `__ - - diff --git a/docs/articles_en/learn-openvino/llm_inference_guide/genai-guide/genai-use-cases.rst b/docs/articles_en/learn-openvino/llm_inference_guide/genai-guide/genai-use-cases.rst deleted file mode 100644 index 6033bd8ed96106..00000000000000 --- a/docs/articles_en/learn-openvino/llm_inference_guide/genai-guide/genai-use-cases.rst +++ /dev/null @@ -1,426 +0,0 @@ -GenAI Use Cases -===================== - -This article provides several use case scenarios for Generative AI model -inference. The applications presented in the code samples below -only require minimal configuration, like setting an inference device. Feel free -to explore and modify the source code as you need. - - -Using GenAI for Text-to-Image Generation -######################################## - -Examples below demonstrate inference on text-to-image models, like Stable Diffusion -1.5, 2.1, and LCM, with a text prompt as input. The :ref:`main.cpp ` -sample shows basic usage of the ``Text2ImagePipeline`` pipeline. -:ref:`lora.cpp ` shows how to apply LoRA adapters to the pipeline. - - -.. tab-set:: - - .. tab-item:: Python - :sync: python - - .. tab-set:: - - .. tab-item:: main.py - :name: mainpy - - .. code-block:: python - - import openvino_genai - from PIL import Image - import numpy as np - - class Generator(openvino_genai.Generator): - def __init__(self, seed, mu=0.0, sigma=1.0): - openvino_genai.Generator.__init__(self) - np.random.seed(seed) - self.mu = mu - self.sigma = sigma - - def next(self): - return np.random.normal(self.mu, self.sigma) - - - def infer(model_dir: str, prompt: str): - device = 'CPU' # GPU can be used as well - random_generator = Generator(42) - pipe = openvino_genai.Text2ImagePipeline(model_dir, device) - image_tensor = pipe.generate( - prompt, - width=512, - height=512, - num_inference_steps=20, - num_images_per_prompt=1, - random_generator=random_generator - ) - - image = Image.fromarray(image_tensor.data[0]) - image.save("image.bmp") - - .. tab-item:: LoRA.py - :name: lorapy - - .. 
code-block:: python - - import openvino as ov - import openvino_genai - import numpy as np - import sys - - - class Generator(openvino_genai.Generator): - def __init__(self, seed, mu=0.0, sigma=1.0): - openvino_genai.Generator.__init__(self) - np.random.seed(seed) - self.mu = mu - self.sigma = sigma - - def next(self): - return np.random.normal(self.mu, self.sigma) - - - def image_write(path: str, image_tensor: ov.Tensor): - from PIL import Image - image = Image.fromarray(image_tensor.data[0]) - image.save(path) - - - def infer(models_path: str, prompt: str): - prompt = "cyberpunk cityscape like Tokyo New York with tall buildings at dusk golden hour cinematic lighting" - - device = "CPU" # GPU, NPU can be used as well - adapter_config = openvino_genai.AdapterConfig() - - for i in range(int(len(adapters) / 2)): - adapter = openvino_genai.Adapter(adapters[2 * i]) - alpha = float(adapters[2 * i + 1]) - adapter_config.add(adapter, alpha) - - pipe = openvino_genai.Text2ImagePipeline(models_path, device, adapters=adapter_config) - print("Generating image with LoRA adapters applied, resulting image will be in lora.bmp") - image = pipe.generate(prompt, - random_generator=Generator(42), - width=512, - height=896, - num_inference_steps=20) - - image_write("lora.bmp", image) - print("Generating image without LoRA adapters applied, resulting image will be in baseline.bmp") - image = pipe.generate(prompt, - adapters=openvino_genai.AdapterConfig(), - random_generator=Generator(42), - width=512, - height=896, - num_inference_steps=20 - ) - image_write("baseline.bmp", image) - - For more information, refer to the - `Python sample `__ - - .. tab-item:: C++ - :sync: cpp - - .. tab-set:: - - .. tab-item:: main.cpp - :name: maincpp - - .. code-block:: cpp - - #include "openvino/genai/text2image/pipeline.hpp" - - #include "imwrite.hpp" - - int32_t main(int32_t argc, char* argv[]) try { - OPENVINO_ASSERT(argc == 3, "Usage: ", argv[0], " ''"); - - const std::string models_path = argv[1], prompt = argv[2]; - const std::string device = "CPU"; // GPU, NPU can be used as well - - ov::genai::Text2ImagePipeline pipe(models_path, device); - ov::Tensor image = pipe.generate(prompt, - ov::genai::width(512), - ov::genai::height(512), - ov::genai::num_inference_steps(20), - ov::genai::num_images_per_prompt(1)); - - imwrite("image_%d.bmp", image, true); - - return EXIT_SUCCESS; - } catch (const std::exception& error) { - try { - std::cerr << error.what() << '\n'; - } catch (const std::ios_base::failure&) {} - return EXIT_FAILURE; - } catch (...) { - try { - std::cerr << "Non-exception object thrown\n"; - } catch (const std::ios_base::failure&) {} - return EXIT_FAILURE; - } - - .. tab-item:: LoRA.cpp - :name: loracpp - - .. 
code-block:: cpp - - #include "openvino/genai/text2image/pipeline.hpp" - - #include "imwrite.hpp" - - int32_t main(int32_t argc, char* argv[]) try { - OPENVINO_ASSERT(argc >= 3 && (argc - 3) % 2 == 0, "Usage: ", argv[0], " '' [ ...]]"); - - const std::string models_path = argv[1], prompt = argv[2]; - const std::string device = "CPU"; // GPU, NPU can be used as well - - ov::genai::AdapterConfig adapter_config; - for(size_t i = 0; i < (argc - 3)/2; ++i) { - ov::genai::Adapter adapter(argv[3 + 2*i]); - float alpha = std::atof(argv[3 + 2*i + 1]); - adapter_config.add(adapter, alpha); - } - - ov::genai::Text2ImagePipeline pipe(models_path, device, ov::genai::adapters(adapter_config)); - - std::cout << "Generating image with LoRA adapters applied, resulting image will be in lora.bmp\n"; - ov::Tensor image = pipe.generate(prompt, - ov::genai::random_generator(std::make_shared(42)), - ov::genai::width(512), - ov::genai::height(896), - ov::genai::num_inference_steps(20)); - imwrite("lora.bmp", image, true); - - std::cout << "Generating image without LoRA adapters applied, resulting image will be in baseline.bmp\n"; - image = pipe.generate(prompt, - ov::genai::adapters(), - ov::genai::random_generator(std::make_shared(42)), - ov::genai::width(512), - ov::genai::height(896), - ov::genai::num_inference_steps(20)); - imwrite("baseline.bmp", image, true); - - return EXIT_SUCCESS; - } catch (const std::exception& error) { - try { - std::cerr << error.what() << '\n'; - } catch (const std::ios_base::failure&) {} - return EXIT_FAILURE; - } catch (...) { - try { - std::cerr << "Non-exception object thrown\n"; - } catch (const std::ios_base::failure&) {} - return EXIT_FAILURE; - } - - - For more information, refer to the - `C++ sample `__ - - - - - -Using GenAI in Speech Recognition -################################# - - -The application, shown in code samples below, performs inference on speech -recognition Whisper Models. The samples include the ``WhisperPipeline`` class -and use audio files in WAV format at a sampling rate of 16 kHz as input. - -.. tab-set:: - - .. tab-item:: Python - :sync: cpp - - .. code-block:: python - - import openvino_genai - import librosa - - - def read_wav(filepath): - raw_speech, samplerate = librosa.load(filepath, sr=16000) - return raw_speech.tolist() - - - def infer(model_dir: str, wav_file_path: str): - device = "CPU" # GPU or NPU can be used as well. - pipe = openvino_genai.WhisperPipeline(model_dir, device) - - # The pipeline expects normalized audio with a sampling rate of 16kHz. - raw_speech = read_wav(wav_file_path) - result = pipe.generate( - raw_speech, - max_new_tokens=100, - language="<|en|>", - task="transcribe", - return_timestamps=True, - ) - - print(result) - - for chunk in result.chunks: - print(f"timestamps: [{chunk.start_ts}, {chunk.end_ts}] text: {chunk.text}") - - - For more information, refer to the - `Python sample `__. - - .. tab-item:: C++ - :sync: cpp - - .. code-block:: cpp - - #include "audio_utils.hpp" - #include "openvino/genai/whisper_pipeline.hpp" - - int main(int argc, char* argv[]) try { - if (3 > argc) { - throw std::runtime_error(std::string{"Usage: "} + argv[0] + " \"\""); - } - - std::filesystem::path models_path = argv[1]; - std::string wav_file_path = argv[2]; - std::string device = "CPU"; // GPU or NPU can be used as well. 
- - ov::genai::WhisperPipeline pipeline(models_path, device); - - ov::genai::WhisperGenerationConfig config(models_path / "generation_config.json"); - config.max_new_tokens = 100; - config.language = "<|en|>"; - config.task = "transcribe"; - config.return_timestamps = true; - - // The pipeline expects normalized audio with a sampling rate of 16kHz. - ov::genai::RawSpeechInput raw_speech = utils::audio::read_wav(wav_file_path); - auto result = pipeline.generate(raw_speech, config); - - std::cout << result << "\n"; - - for (auto& chunk : *result.chunks) { - std::cout << "timestamps: [" << chunk.start_ts << ", " << chunk.end_ts << "] text: " << chunk.text << "\n"; - } - - } catch (const std::exception& error) { - try { - std::cerr << error.what() << '\n'; - } catch (const std::ios_base::failure&) { - } - return EXIT_FAILURE; - } catch (...) { - try { - std::cerr << "Non-exception object thrown\n"; - } catch (const std::ios_base::failure&) { - } - return EXIT_FAILURE; - } - - - For more information, refer to the - `C++ sample `__. - - -Using GenAI in Chat Scenario -############################ - -For chat scenarios where inputs and outputs represent a conversation, maintaining KVCache across inputs -may prove beneficial. The ``start_chat`` and ``finish_chat`` chat-specific methods are used to -mark a conversation session, as shown in the samples below: - -.. tab-set:: - - .. tab-item:: Python - :sync: py - - .. code-block:: python - - import openvino_genai - - - def streamer(subword): - print(subword, end='', flush=True) - return False - - - def infer(model_dir: str): - device = 'CPU' # GPU can be used as well. - pipe = openvino_genai.LLMPipeline(model_dir, device) - - config = openvino_genai.GenerationConfig() - config.max_new_tokens = 100 - - pipe.start_chat() - while True: - try: - prompt = input('question:\n') - except EOFError: - break - pipe.generate(prompt, config, streamer) - print('\n----------') - pipe.finish_chat() - - - - For more information, refer to the - `Python sample `__. - - .. tab-item:: C++ - :sync: cpp - - .. code-block:: cpp - - #include "openvino/genai/llm_pipeline.hpp" - - int main(int argc, char* argv[]) try { - if (2 != argc) { - throw std::runtime_error(std::string{"Usage: "} + argv[0] + " "); - } - std::string prompt; - std::string models_path = argv[1]; - - std::string device = "CPU"; // GPU, NPU can be used as well - ov::genai::LLMPipeline pipe(models_path, device); - - ov::genai::GenerationConfig config; - config.max_new_tokens = 100; - std::function streamer = [](std::string word) { - std::cout << word << std::flush; - return false; - }; - - pipe.start_chat(); - std::cout << "question:\n"; - while (std::getline(std::cin, prompt)) { - pipe.generate(prompt, config, streamer); - std::cout << "\n----------\n" - "question:\n"; - } - pipe.finish_chat(); - } catch (const std::exception& error) { - try { - std::cerr << error.what() << '\n'; - } catch (const std::ios_base::failure&) {} - return EXIT_FAILURE; - } catch (...) 
{ - try { - std::cerr << "Non-exception object thrown\n"; - } catch (const std::ios_base::failure&) {} - return EXIT_FAILURE; - } - - - For more information, refer to the - `C++ sample `__ - -Additional Resources -##################### - -* :doc:`Install OpenVINO GenAI <../../../get-started/install-openvino/install-openvino-genai>` -* `OpenVINO GenAI Repo `__ -* `OpenVINO GenAI Samples `__ -* `OpenVINO Tokenizers `__ diff --git a/docs/articles_en/learn-openvino/llm_inference_guide/genai-model-preparation.rst b/docs/articles_en/learn-openvino/llm_inference_guide/genai-model-preparation.rst new file mode 100644 index 00000000000000..e6d15675ea45b8 --- /dev/null +++ b/docs/articles_en/learn-openvino/llm_inference_guide/genai-model-preparation.rst @@ -0,0 +1,159 @@ +Generative Model Preparation +=============================================================================== + +.. meta:: + :description: Learn how to use Hugging Face Hub and Optimum Intel APIs to + prepare generative models for inference. + + + +Since generative AI models tend to be big and resource-heavy, it is advisable to +optimize them for efficient inference. This article will show how to prepare +LLM models for inference with OpenVINO by: + +* `Downloading Models from Hugging Face <#download-generative-models-from-hugging-face-hub>`__ +* `Downloading Models from Model Scope <#download-generative-models-from-model-scope>`__ +* `Converting and Optimizing Generative Models <#convert-and-optimize-generative-models>`__ + + + +Download Generative Models From Hugging Face Hub +############################################################################### + +Pre-converted and pre-optimized models are available in the `OpenVINO Toolkit `__ +organization, under the `model section `__, or under +different model collections: + +* `LLM: `__ +* `Speech-to-Text `__ +* `Speculative Decoding Draft Models `__ + +You can also use the **huggingface_hub** package to download models: + +.. code-block:: console + + pip install huggingface_hub + huggingface-cli download "OpenVINO/phi-2-fp16-ov" --local-dir model_path + + +The models can be used in OpenVINO immediately after download. No dependencies +are required except **huggingface_hub**. + + +Download Generative Models From Model Scope +############################################################################### + +To download models from `Model Scope `__, +use the **modelscope** package: + +.. code-block:: console + + pip install modelscope + modelscope download --model "Qwen/Qwen2-7b" --local_dir model_path + +Models downloaded via Model Scope are available in Pytorch format only and they must +be :doc:`converted to OpenVINO IR <../../openvino-workflow/model-preparation/convert-model-to-ir>` +before inference. + +Convert and Optimize Generative Models +############################################################################### + +OpenVINO works best with models in the OpenVINO IR format, both in full precision and quantized. +If your selected model has not been pre-optimized, you can easily do it yourself, using a single +**optimum-cli** command. For that, make sure optimum-intel is installed on your system: + +.. code-block:: console + + pip install optimum-intel[openvino] + + +While optimizing models, you can decide to keep the original precision or select one that is lower. + +.. tab-set:: + + .. tab-item:: Keeping full model precision + :sync: full-precision + + .. code-block:: console + + optimum-cli export openvino --model --weight-format fp16 + + Examples: + + .. 
tab-set:: + + .. tab-item:: LLM (text generation) + :sync: llm-text-gen + + .. code-block:: console + + optimum-cli export openvino --model meta-llama/Llama-2-7b-chat-hf --weight-format fp16 ov_llama_2 + + .. tab-item:: Diffusion models (text2image) + :sync: diff-text-img + + .. code-block:: console + + optimum-cli export openvino --model stabilityai/stable-diffusion-xl-base-1.0 --weight-format fp16 ov_SDXL + + .. tab-item:: VLM (Image processing) + :sync: vlm-img-proc + + .. code-block:: console + + optimum-cli export openvino --model openbmb/MiniCPM-V-2_6 --trust-remote-code --weight-format fp16 ov_MiniCPM-V-2_6 + + .. tab-item:: Whisper models (speech2text) + :sync: whisp-speech-txt + + .. code-block:: console + + optimum-cli export openvino --trust-remote-code --model openai/whisper-base ov_whisper + + .. tab-item:: Exporting to selected precision + :sync: low-precision + + .. code-block:: console + + optimum-cli export openvino --model --weight-format int4 + + Examples: + + .. tab-set:: + + .. tab-item:: LLM (text generation) + :sync: llm-text-gen + + .. code-block:: console + + optimum-cli export openvino --model meta-llama/Llama-2-7b-chat-hf --weight-format int4 ov_llama_2 + + .. tab-item:: Diffusion models (text2image) + :sync: diff-text-img + + .. code-block:: console + + optimum-cli export openvino --model stabilityai/stable-diffusion-xl-base-1.0 --weight-format int4 ov_SDXL + + .. tab-item:: VLM (Image processing) + :sync: vlm-img-proc + + .. code-block:: console + + optimum-cli export openvino -m model_path --task text-generation-with-past --weight-format int4 ov_MiniCPM-V-2_6 + + +.. note:: + + Any other ``model_id``, for example ``openbmb/MiniCPM-V-2_6``, or the path + to a local model file can be used. + + You can also specify a different data type, such as ``int8``. + + +Additional Resources +############################################################################### + +* `Full set of optimum-cli parameters `__ +* :doc:`Model conversion in OpenVINO <../../openvino-workflow/model-preparation/convert-model-to-ir>` +* :doc:`Model optimization in OpenVINO <../../openvino-workflow/model-optimization>` diff --git a/docs/articles_en/learn-openvino/llm_inference_guide/llm-inference-hf.rst b/docs/articles_en/learn-openvino/llm_inference_guide/llm-inference-hf.rst index a26b670b5314d0..4fec1acd23e6a7 100644 --- a/docs/articles_en/learn-openvino/llm_inference_guide/llm-inference-hf.rst +++ b/docs/articles_en/learn-openvino/llm_inference_guide/llm-inference-hf.rst @@ -1,4 +1,4 @@ -Run LLMs with Hugging Face and Optimum Intel +Inference with Optimum Intel =============================================================================================== .. meta:: @@ -276,9 +276,10 @@ includes **Dynamic quantization** of activations of 4/8-bit quantized MatMuls an ov_config={"KV_CACHE_PRECISION": "u8", "DYNAMIC_QUANTIZATION_GROUP_SIZE": "32", "PERFORMANCE_HINT": "LATENCY"} ) -.. note:: + .. note:: + Currently, for KV-cache quantization, GPU ignores the DYNAMIC_QUANTIZATION_GROUP_SIZE property, using ``group_size = head_size``. Additionally, it does not support the ``get_state()`` and ``set_state()`` APIs when KV-cache quantization is enabled. - Currently, both Dynamic quantization and KV-cache quantization are available for CPU device. + For GPU, KV-cache quantization is enabled by default on platforms without XMX support, and can be disabled by setting KV_CACHE_PRECISION to ``undefined``.
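As a companion to the note above, a minimal, hedged sketch of disabling KV-cache quantization on GPU through Optimum Intel could look as follows; the model identifier is an illustrative placeholder:

.. code-block:: python

   from optimum.intel import OVModelForCausalLM

   # On GPU platforms without XMX, KV-cache quantization is enabled by default;
   # setting KV_CACHE_PRECISION to "undefined" switches it off.
   model = OVModelForCausalLM.from_pretrained(
       "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
       export=True,
       device="GPU",
       ov_config={"KV_CACHE_PRECISION": "undefined"},
   )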
Working with Models Tuned with LoRA diff --git a/docs/articles_en/learn-openvino/llm_inference_guide/llm-inference-native-ov.rst b/docs/articles_en/learn-openvino/llm_inference_guide/llm-inference-native-ov.rst deleted file mode 100644 index 2476a0423e30e1..00000000000000 --- a/docs/articles_en/learn-openvino/llm_inference_guide/llm-inference-native-ov.rst +++ /dev/null @@ -1,192 +0,0 @@ -Run LLM Inference on Native OpenVINO (not recommended) -=============================================================================================== - -To run Generative AI models using native OpenVINO APIs you need to follow regular -**Convert -> Optimize -> Deploy** path with a few simplifications. - -To convert a model from `Hugging Face `__, you can use -Optimum-Intel export feature that allows you to export model in the OpenVINO format without -invoking conversion API and tools directly. In this case, the conversion process is a bit -more simplified. You can still use a regular conversion path if the model comes from -outside of Hugging Face ecosystem, i.e., in source framework format (PyTorch, etc.) - -Model optimization can be performed within Hugging Face or directly using NNCF as described in -:doc:`Weight Compression <../../openvino-workflow/model-optimization-guide/weight-compression>`. - -.. note:: - - It is recommended to use models in 4-bit precision, as maintaining the model in its - original precision may result in significantly decreased performance. - -Inference code that uses native API cannot benefit from Hugging Face pipelines. -You need to write your custom code or take it from the available examples. Below are -some examples of popular Generative AI scenarios: - -* In case of LLMs for text generation, you need to handle tokenization, inference and - token selection loop, and de-tokenization. If token selection involves beam search, - it also needs to be written. -* For image generation models, you need to make a pipeline that includes several model - inferences: inference for source (e.g., text) encoder models, inference loop for - diffusion process and inference for the decoding part. Scheduler code is also required. - -To write such pipelines, you can follow the examples provided as part of OpenVINO: - -* `OpenVINO Latent Consistency Model C++ image generation pipeline `__ -* `OpenVINO Stable Diffusion (with LoRA) C++ image generation pipeline `__ - -To perform inference, models must be first converted to OpenVINO IR format using -Hugging Face Optimum-Intel API. - -An inference pipeline for a text generation LLM is set up in the following stages: - -1. Read and compile the model in OpenVINO IR. -2. Pre-process text prompt with a tokenizer and set the result as model inputs. -3. Run token generation loop. -4. De-tokenize outputs. - -Prerequisites -######################## - -Linux operating system (as of the current version). - -**Installation** - -1. Create a virtual environment - - .. code-block:: python - - python -m venv openvino_llm - - ``openvino_llm`` is an example name; you can choose any name for your environment. - -2. Activate the virtual environment - - .. code-block:: python - - source openvino_llm/bin/activate - -3. Install OpenVINO tokenizers and dependencies - - .. 
code-block:: python - - pip install optimum[openvino] - - -Convert Hugging Face tokenizer and model to OpenVINO IR format -++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ - -**Convert Tokenizer** - -`OpenVINO Tokenizers `__ -come equipped with a CLI tool that facilitates the conversion of tokenizers -from either the Hugging Face Hub or those saved locally to the OpenVINO IR format: - -.. code-block:: python - - convert_tokenizer microsoft/Llama2-7b-WhoIsHarryPotter --with-detokenizer -o openvino_tokenizer - -In this example, the ``microsoft/Llama2-7b-WhoIsHarryPotter tokenizer`` is transformed from the Hugging -Face hub. You can substitute this tokenizer with one of your preference. You can also rename -the output directory (``openvino_tokenizer``). - -**Convert Model** - -The optimum-cli command can be used for converting a Hugging Face model to the OpenVINO IR model format. -Learn more in Loading an LLM with OpenVINO. - -.. code-block:: python - - optimum-cli export openvino --convert-tokenizer --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 openvino_model - -Full OpenVINO Text Generation Pipeline -###################################################################### - -1. Import and Compile Models -+++++++++++++++++++++++++++++++++++++++ - -Use the model and tokenizer converted from the previous step: - -.. code-block:: python - - import numpy as np - from openvino import compile_model - - # Compile the tokenizer, model, and detokenizer using OpenVINO. These files are XML representations of the models optimized for OpenVINO - compiled_tokenizer = compile_model("openvino_tokenizer.xml") - compiled_model = compile_model("openvino_model.xml") - compiled_detokenizer = compile_model("openvino_detokenizer.xml") - -2. Tokenize and Transform Input -+++++++++++++++++++++++++++++++++++++++ - -Tokenization is a mandatory step in the process of generating text using LLMs. Tokenization -converts the input text into a sequence of tokens, which are essentially the format that the -model can understand and process. The input text string must be tokenized and set up in the -structure expected by the model before running inference. - -.. code-block:: python - - text_input = ["Quick brown fox was"] - ov_input = compiled_tokenizer(text_input) - -3. Generate Tokens -+++++++++++++++++++++++++++++++++++++++ - -The core of text generation lies in the inference and token selection loop. In each iteration -of this loop, the model runs inference on the input sequence, generates and selects a new token, -and appends it to the existing sequence. - -.. code-block:: python - - # Define the number of new tokens to generate - new_tokens_size = 10 - - # Determine the size of the existing prompt - prompt_size = ov_input["input_ids"].shape[-1] - - # Prepare the input dictionary for the model - # It combines existing tokens with additional space for new tokens - input_dict = { - output.any_name: np.hstack([tensor, np.zeros(shape=(1, new_tokens_size), dtype=np.int_)]) - for output, tensor in ov_input.items() - } - - # Generate new tokens iteratively - for idx in range(prompt_size, prompt_size + new_tokens_size): - # Get output from the model - output = compiled_model(input_dict)["token_ids"] - # Update the input_ids with newly generated token - input_dict["input_ids"][:, idx] = output[:, idx - 1] - # Update the attention mask to include the new token - input_dict["attention_mask"][:, idx] = 1 - -4. 
Decode and Display Output -+++++++++++++++++++++++++++++++++++++++ - -The final step in the process is de-tokenization, where the sequence of token IDs generated by -the model is converted back into human-readable text. -This step is essential for interpreting the model's output. - -.. code-block:: python - - # Extract token IDs for the final output - ov_token_ids = input_dict["input_ids"] - # Decode the model output back to string - ov_output = compiled_detokenizer(ov_token_ids)["string_output"] - print(f"OpenVINO output string: `{ov_output}`") - -.. code-block:: python - - # Example output: - [' Quick brown fox was walking through the forest. He was looking for something'] - - -Additional Resources -#################### - -* `OpenVINO GenAI Repo `__ -* `OpenVINO Tokenizers `__ -* `Neural Network Compression Framework `__ -* :doc:`Stateful Models Low-Level Details <../../openvino-workflow/running-inference/stateful-models>` -* :doc:`Working with Textual Data <../../openvino-workflow/running-inference/string-tensors>` - diff --git a/docs/articles_en/learn-openvino/openvino-samples/benchmark-tool.rst b/docs/articles_en/learn-openvino/openvino-samples/benchmark-tool.rst index 390fe00605f2c6..8ab8a43031ca39 100644 --- a/docs/articles_en/learn-openvino/openvino-samples/benchmark-tool.rst +++ b/docs/articles_en/learn-openvino/openvino-samples/benchmark-tool.rst @@ -30,7 +30,7 @@ Basic Usage The benchmarking application works with models in the OpenVINO IR (``model.xml`` and ``model.bin``) and ONNX (``model.onnx``) formats. - Make sure to :doc:`convert your models <../../documentation/legacy-features/transition-legacy-conversion-api/legacy-conversion-api>` + Make sure to :doc:`convert your models <../../openvino-workflow/model-preparation/convert-model-to-ir>` if necessary. To run benchmarking with default options on a model, use the following command: @@ -56,7 +56,7 @@ Basic Usage The benchmarking application works with models in the OpenVINO IR, TensorFlow, TensorFlow Lite, PaddlePaddle, PyTorch and ONNX formats. If you need it, - OpenVINO also allows you to :doc:`convert your models <../../documentation/legacy-features/transition-legacy-conversion-api/legacy-conversion-api>`. + OpenVINO also allows you to :doc:`convert your models <../../openvino-workflow/model-preparation/convert-model-to-ir>`. To run benchmarking with default options on a model, use the following command: @@ -937,4 +937,4 @@ Additional Resources - :doc:`Get Started with Samples ` - :doc:`Using OpenVINO Samples <../openvino-samples>` -- :doc:`Convert a Model <../../documentation/legacy-features/transition-legacy-conversion-api/legacy-conversion-api>` +- :doc:`Convert a Model <../../openvino-workflow/model-preparation/convert-model-to-ir>` diff --git a/docs/articles_en/learn-openvino/openvino-samples/bert-benchmark.rst b/docs/articles_en/learn-openvino/openvino-samples/bert-benchmark.rst index 92f6a410219f43..13f18fc3272b34 100644 --- a/docs/articles_en/learn-openvino/openvino-samples/bert-benchmark.rst +++ b/docs/articles_en/learn-openvino/openvino-samples/bert-benchmark.rst @@ -7,8 +7,7 @@ Bert Benchmark Python Sample This sample demonstrates how to estimate performance of a Bert model using Asynchronous -Inference Request API. Unlike `demos `__ -this sample does not have +Inference Request API. This sample does not have configurable command line arguments. Feel free to modify sample's source code to try out different options. 
@@ -64,5 +63,5 @@ Additional Resources - :doc:`Integrate the OpenVINO™ Runtime with Your Application <../../openvino-workflow/running-inference/integrate-openvino-with-your-application>` - :doc:`Get Started with Samples ` - :doc:`Using OpenVINO Samples <../openvino-samples>` -- :doc:`Convert a Model <../../documentation/legacy-features/transition-legacy-conversion-api/legacy-conversion-api>` +- :doc:`Convert a Model <../../openvino-workflow/model-preparation/convert-model-to-ir>` - `Bert Benchmark Python Sample on Github `__ diff --git a/docs/articles_en/learn-openvino/openvino-samples/hello-classification.rst b/docs/articles_en/learn-openvino/openvino-samples/hello-classification.rst index f8222e495c7387..7a9a7d449d628d 100644 --- a/docs/articles_en/learn-openvino/openvino-samples/hello-classification.rst +++ b/docs/articles_en/learn-openvino/openvino-samples/hello-classification.rst @@ -93,11 +93,11 @@ To run the sample, you need to specify a model and an image: to manually rearrange the default channels order in the sample or demo application or reconvert your model using model conversion API with ``reverse_input_channels`` argument specified. For more information about - the argument, refer to **When to Reverse Input Channels** section of - :doc:`Embedding Preprocessing Computation <../../documentation/legacy-features/transition-legacy-conversion-api/legacy-conversion-api/[legacy]-setting-input-shapes>`. + the argument, refer to the **Color Conversion** section of + :doc:`Preprocessing API <../../openvino-workflow/running-inference/optimize-inference/optimize-preprocessing/preprocessing-api-details>`. - Before running the sample with a trained model, make sure the model is converted to the intermediate representation (IR) format (\*.xml + \*.bin) - using the :doc:`model conversion API <../../documentation/legacy-features/transition-legacy-conversion-api/legacy-conversion-api>`. + using the :doc:`model conversion API <../../openvino-workflow/model-preparation/convert-model-to-ir>`. - The sample accepts models in ONNX format (.onnx) that do not require preprocessing. - The sample supports NCHW model layout only. @@ -257,7 +257,7 @@ Additional Resources - :doc:`Integrate the OpenVINO™ Runtime with Your Application <../../openvino-workflow/running-inference/integrate-openvino-with-your-application>` - :doc:`Get Started with Samples ` - :doc:`Using OpenVINO Samples <../openvino-samples>` -- :doc:`Convert a Model <../../documentation/legacy-features/transition-legacy-conversion-api/legacy-conversion-api>` +- :doc:`Convert a Model <../../openvino-workflow/model-preparation/convert-model-to-ir>` - `OpenVINO Runtime C API `__ - `Hello Classification Python Sample on Github `__ - `Hello Classification C++ Sample on Github `__ diff --git a/docs/articles_en/learn-openvino/openvino-samples/hello-nv12-input-classification.rst b/docs/articles_en/learn-openvino/openvino-samples/hello-nv12-input-classification.rst index 19219070cbfbe2..3d1c069e2c8cb1 100644 --- a/docs/articles_en/learn-openvino/openvino-samples/hello-nv12-input-classification.rst +++ b/docs/articles_en/learn-openvino/openvino-samples/hello-nv12-input-classification.rst @@ -95,11 +95,11 @@ the following command, you can convert an ordinary image to an uncompressed NV12 - By default, this sample expects that model input has BGR channels order. If you trained your model to work with RGB order, you need to reconvert your model using model conversion API with ``reverse_input_channels`` argument - specified. 
For more information about the argument, refer to **When to Reverse - Input Channels** section of :doc:`Embedding Preprocessing Computation <../../documentation/legacy-features/transition-legacy-conversion-api/legacy-conversion-api/[legacy]-setting-input-shapes>`. + specified. For more information about the argument, refer to the + **Color Conversion** section of :doc:`Preprocessing API <../../openvino-workflow/running-inference/optimize-inference/optimize-preprocessing/preprocessing-api-details>`. - Before running the sample with a trained model, make sure the model is converted to the intermediate representation (IR) format (\*.xml + \*.bin) - using the :doc:`model conversion API <../../documentation/legacy-features/transition-legacy-conversion-api/legacy-conversion-api>`. + using the :doc:`model conversion API <../../openvino-workflow/model-preparation/convert-model-to-ir>`. - The sample accepts models in ONNX format (.onnx) that do not require preprocessing. Example @@ -208,7 +208,7 @@ Additional Resources - :doc:`Integrate the OpenVINO™ Runtime with Your Application <../../openvino-workflow/running-inference/integrate-openvino-with-your-application>` - :doc:`Get Started with Samples ` - :doc:`Using OpenVINO Samples <../openvino-samples>` -- :doc:`Convert a Model <../../documentation/legacy-features/transition-legacy-conversion-api/legacy-conversion-api>` +- :doc:`Convert a Model <../../openvino-workflow/model-preparation/convert-model-to-ir>` - `API Reference `__ - `Hello NV12 Input Classification C++ Sample on Github `__ - `Hello NV12 Input Classification C Sample on Github `__ diff --git a/docs/articles_en/learn-openvino/openvino-samples/hello-reshape-ssd.rst b/docs/articles_en/learn-openvino/openvino-samples/hello-reshape-ssd.rst index 23de8eb1979824..0e929bb5ed2701 100644 --- a/docs/articles_en/learn-openvino/openvino-samples/hello-reshape-ssd.rst +++ b/docs/articles_en/learn-openvino/openvino-samples/hello-reshape-ssd.rst @@ -14,8 +14,8 @@ using the sample, refer to the following requirements: - Models with only one input and output are supported. - The sample accepts any file format supported by ``core.read_model``. -- The sample has been validated with: `person-detection-retail-0013 `__ - models and the NCHW layout format. +- The sample has been validated with the person-detection-retail-0013 + model and the NCHW layout format. - To build the sample, use instructions available at :ref:`Build the Sample Applications ` section in "Get Started with Samples" guide. @@ -82,12 +82,12 @@ To run the sample, you need to specify a model and an image: order. If you trained your model to work with RGB order, you need to manually rearrange the default channels order in the sample or demo application or reconvert your model using model conversion API with ``reverse_input_channels`` - argument specified. For more information about the argument, refer to - **When to Reverse Input Channels** section of - :doc:`Embedding Preprocessing Computation <../../documentation/legacy-features/transition-legacy-conversion-api/legacy-conversion-api/[legacy]-setting-input-shapes>`. + argument specified. For more information about the argument, refer to the + **Color Conversion** section of + :doc:`Preprocessing API <../../openvino-workflow/running-inference/optimize-inference/optimize-preprocessing/preprocessing-api-details>`. 
- Before running the sample with a trained model, make sure the model is converted to the intermediate representation (IR) format (\*.xml + \*.bin) - using :doc:`model conversion API <../../documentation/legacy-features/transition-legacy-conversion-api/legacy-conversion-api>`. + using :doc:`model conversion API <../../openvino-workflow/model-preparation/convert-model-to-ir>`. - The sample accepts models in ONNX format (.onnx) that do not require preprocessing. Example @@ -204,7 +204,7 @@ Additional Resources - :doc:`Integrate the OpenVINO™ Runtime with Your Application <../../openvino-workflow/running-inference/integrate-openvino-with-your-application>` - :doc:`Get Started with Samples ` - :doc:`Using OpenVINO Samples <../openvino-samples>` -- :doc:`Convert a Model <../../documentation/legacy-features/transition-legacy-conversion-api/legacy-conversion-api>` +- :doc:`Convert a Model <../../openvino-workflow/model-preparation/convert-model-to-ir>` - `Hello Reshape SSD Python Sample on Github `__ - `Hello Reshape SSD C++ Sample on Github `__ diff --git a/docs/articles_en/learn-openvino/openvino-samples/image-classification-async.rst b/docs/articles_en/learn-openvino/openvino-samples/image-classification-async.rst index b112452e932c72..d88b950463210d 100644 --- a/docs/articles_en/learn-openvino/openvino-samples/image-classification-async.rst +++ b/docs/articles_en/learn-openvino/openvino-samples/image-classification-async.rst @@ -129,9 +129,9 @@ To run the sample, you need to specify a model and an image: .. note:: - - By default, OpenVINO™ Toolkit Samples and demos expect input with BGR channels order. If you trained your model to work with RGB order, you need to manually rearrange the default channels order in the sample or demo application or reconvert your model using model conversion API with ``reverse_input_channels`` argument specified. For more information about the argument, refer to **When to Reverse Input Channels** section of :doc:`Embedding Preprocessing Computation <../../documentation/legacy-features/transition-legacy-conversion-api/legacy-conversion-api/[legacy]-setting-input-shapes>`. + - By default, OpenVINO™ Toolkit Samples and demos expect input with BGR channels order. If you trained your model to work with RGB order, you need to manually rearrange the default channels order in the sample or demo application or reconvert your model using model conversion API with ``reverse_input_channels`` argument specified. For more information about the argument, refer to the **Color Conversion** section of :doc:`Preprocessing API <../../openvino-workflow/running-inference/optimize-inference/optimize-preprocessing/preprocessing-api-details>`. - - Before running the sample with a trained model, make sure the model is converted to the intermediate representation (IR) format (\*.xml + \*.bin) using :doc:`model conversion API <../../documentation/legacy-features/transition-legacy-conversion-api/legacy-conversion-api>`. + - Before running the sample with a trained model, make sure the model is converted to the intermediate representation (IR) format (\*.xml + \*.bin) using :doc:`model conversion API <../../openvino-workflow/model-preparation/convert-model-to-ir>`. - The sample accepts models in ONNX format (.onnx) that do not require preprocessing. 
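Since several of the notes above point to the **Color Conversion** section of the Preprocessing API, here is a minimal, hedged sketch of what that conversion looks like in practice; the model path is a placeholder:

.. code-block:: python

   import openvino as ov
   from openvino.preprocess import PrePostProcessor, ColorFormat

   core = ov.Core()
   model = core.read_model("model.xml")

   # Declare that input tensors arrive as BGR and let OpenVINO convert them
   # to the RGB order the model was trained with.
   ppp = PrePostProcessor(model)
   ppp.input().tensor().set_color_format(ColorFormat.BGR)
   ppp.input().preprocess().convert_color(ColorFormat.RGB)
   model = ppp.build()

   compiled_model = core.compile_model(model, "CPU")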
@@ -326,6 +326,6 @@ Additional Resources - :doc:`Integrate the OpenVINO™ Runtime with Your Application <../../openvino-workflow/running-inference/integrate-openvino-with-your-application>` - :doc:`Get Started with Samples ` - :doc:`Using OpenVINO™ Toolkit Samples <../openvino-samples>` -- :doc:`Convert a Model <../../documentation/legacy-features/transition-legacy-conversion-api/legacy-conversion-api>` +- :doc:`Convert a Model <../../openvino-workflow/model-preparation/convert-model-to-ir>` - `Image Classification Async Python Sample on Github `__ - `Image Classification Async C++ Sample on Github `__ diff --git a/docs/articles_en/learn-openvino/openvino-samples/model-creation.rst b/docs/articles_en/learn-openvino/openvino-samples/model-creation.rst index e0e3034c225763..ad01cee53a69b1 100644 --- a/docs/articles_en/learn-openvino/openvino-samples/model-creation.rst +++ b/docs/articles_en/learn-openvino/openvino-samples/model-creation.rst @@ -76,7 +76,7 @@ To run the sample, you need to specify model weights and a device. - This sample supports models with FP32 weights only. - The ``lenet.bin`` weights file is generated by - :doc:`model conversion API <../../documentation/legacy-features/transition-legacy-conversion-api/legacy-conversion-api>` + :doc:`model conversion API <../../openvino-workflow/model-preparation/convert-model-to-ir>` from the public LeNet model, with the ``input_shape [64,1,28,28]`` parameter specified. - The original model is available in the `Caffe repository `__ on GitHub. @@ -292,6 +292,6 @@ Additional Resources - :doc:`Integrate the OpenVINO™ Runtime with Your Application <../../openvino-workflow/running-inference/integrate-openvino-with-your-application>` - :doc:`Get Started with Samples ` - :doc:`Using OpenVINO Samples <../openvino-samples>` -- :doc:`Convert a Model <../../documentation/legacy-features/transition-legacy-conversion-api/legacy-conversion-api>` +- :doc:`Convert a Model <../../openvino-workflow/model-preparation/convert-model-to-ir>` - `Model Creation Python Sample on Github `__ - `Model Creation C++ Sample on Github `__ diff --git a/docs/articles_en/learn-openvino/openvino-samples/sync-benchmark.rst b/docs/articles_en/learn-openvino/openvino-samples/sync-benchmark.rst index 245672decb7ab2..ccaa1f03a35552 100644 --- a/docs/articles_en/learn-openvino/openvino-samples/sync-benchmark.rst +++ b/docs/articles_en/learn-openvino/openvino-samples/sync-benchmark.rst @@ -8,15 +8,13 @@ Sync Benchmark Sample This sample demonstrates how to estimate performance of a model using Synchronous Inference Request API. It makes sense to use synchronous inference only in latency -oriented scenarios. Models with static input shapes are supported. Unlike -`demos `__ -this sample does not have other configurable command-line +oriented scenarios. Models with static input shapes are supported. +This sample does not have other configurable command-line arguments. Feel free to modify sample's source code to try out different options. Before using the sample, refer to the following requirements: - The sample accepts any file format supported by ``core.read_model``. -- The sample has been validated with: `yolo-v3-tf `__, - `face-detection-0200 `__ models. +- The sample has been validated with: the yolo-v3-tf and face-detection-0200 models. - To build the sample, use instructions available at :ref:`Build the Sample Applications ` section in "Get Started with Samples" guide. 
@@ -167,6 +165,6 @@ Additional Resources - :doc:`Integrate the OpenVINO™ Runtime with Your Application <../../openvino-workflow/running-inference/integrate-openvino-with-your-application>` - :doc:`Get Started with Samples ` - :doc:`Using OpenVINO Samples <../openvino-samples>` -- :doc:`Convert a Model <../../documentation/legacy-features/transition-legacy-conversion-api/legacy-conversion-api>` +- :doc:`Convert a Model <../../openvino-workflow/model-preparation/convert-model-to-ir>` - `Sync Benchmark Python Sample on Github `__ - `Sync Benchmark C++ Sample on Github `__ diff --git a/docs/articles_en/learn-openvino/openvino-samples/throughput-benchmark.rst b/docs/articles_en/learn-openvino/openvino-samples/throughput-benchmark.rst index e8b723afd2a480..4632fab82bd0ea 100644 --- a/docs/articles_en/learn-openvino/openvino-samples/throughput-benchmark.rst +++ b/docs/articles_en/learn-openvino/openvino-samples/throughput-benchmark.rst @@ -7,7 +7,7 @@ Throughput Benchmark Sample This sample demonstrates how to estimate performance of a model using Asynchronous -Inference Request API in throughput mode. Unlike `demos `__ this sample +Inference Request API in throughput mode. This sample does not have other configurable command-line arguments. Feel free to modify sample's source code to try out different options. @@ -18,8 +18,7 @@ sets ``uint8``, while the sample uses default model precision which is usually ` Before using the sample, refer to the following requirements: - The sample accepts any file format supported by ``core.read_model``. -- The sample has been validated with: `yolo-v3-tf `__, - `face-detection-0200 `__ models. +- The sample has been validated with: yolo-v3-tf and face-detection-0200 models. - To build the sample, use instructions available at :ref:`Build the Sample Applications ` section in "Get Started with Samples" guide. @@ -171,6 +170,6 @@ Additional Resources - :doc:`Integrate the OpenVINO™ Runtime with Your Application <../../openvino-workflow/running-inference/integrate-openvino-with-your-application>` - :doc:`Get Started with Samples ` - :doc:`Using OpenVINO Samples <../openvino-samples>` -- :doc:`Convert a Model <../../documentation/legacy-features/transition-legacy-conversion-api/legacy-conversion-api>` +- :doc:`Convert a Model <../../openvino-workflow/model-preparation/convert-model-to-ir>` - `Throughput Benchmark Python Sample on Github `__ - `Throughput Benchmark C++ Sample on Github `__ diff --git a/docs/articles_en/openvino-workflow/model-preparation.rst b/docs/articles_en/openvino-workflow/model-preparation.rst index c23540874e9b7a..33a4d8a54cc7f6 100644 --- a/docs/articles_en/openvino-workflow/model-preparation.rst +++ b/docs/articles_en/openvino-workflow/model-preparation.rst @@ -56,12 +56,6 @@ The easiest way to obtain a model is to download it from an online database, suc .. note:: - Model conversion API prior to OpenVINO 2023.1 is considered deprecated. Existing and new - projects are recommended to transition to the new solutions, keeping in mind that they are - not fully backwards compatible with ``openvino.tools.mo.convert_model`` or the ``mo`` - CLI tool. For more details, see the - :doc:`Model Conversion API Transition Guide <../documentation/legacy-features/transition-legacy-conversion-api>`. - For PyTorch and JAX/Flax models, `Python API <#convert-a-model-with-python-convert-model>`__ is the only conversion option. 
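To make the Python-only conversion path for PyTorch concrete, a minimal sketch is shown below; the torchvision model and input shape are illustrative assumptions:

.. code-block:: python

   import torch
   import torchvision
   import openvino as ov

   # Any torch.nn.Module can be converted the same way; example_input lets
   # OpenVINO trace the model with a representative tensor.
   pt_model = torchvision.models.resnet50(weights=None).eval()
   ov_model = ov.convert_model(pt_model, example_input=torch.randn(1, 3, 224, 224))

   # Save the resulting OpenVINO IR (model.xml + model.bin) for later reuse.
   ov.save_model(ov_model, "resnet50.xml")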
@@ -298,15 +292,4 @@ follow: * :doc:`Post-training optimization ` * :doc:`Model inference in OpenVINO Runtime ` -If you are still using the legacy conversion API (``mo`` or ``openvino.tools.mo.convert_model``), -refer to the following materials: - -* :doc:`Transition from legacy mo and ov.tools.mo.convert_model <../documentation/legacy-features/transition-legacy-conversion-api>` -* :doc:`Legacy Model Conversion API <../documentation/legacy-features/transition-legacy-conversion-api/legacy-conversion-api>` - - - - .. need to investigate python api article generation - api/ie_python_api/_autosummary/openvino.Model.html does not exist, api/ie_python_api/_autosummary/openvino.runtime.Model.html does. - - diff --git a/docs/articles_en/openvino-workflow/model-preparation/convert-model-to-ir.rst b/docs/articles_en/openvino-workflow/model-preparation/convert-model-to-ir.rst index 560b013301e064..dd2fc35c56e92b 100644 --- a/docs/articles_en/openvino-workflow/model-preparation/convert-model-to-ir.rst +++ b/docs/articles_en/openvino-workflow/model-preparation/convert-model-to-ir.rst @@ -296,7 +296,7 @@ used by OpenVINO, typically obtained by converting models of supported framework * The ``convert_model()`` method: - You can use ``mo`` command-line tool to convert a model to IR. The obtained IR can + You can use ``ovc`` to convert a model to IR. The obtained IR can then be read by ``read_model()`` and inferred. .. dropdown:: List of supported formats: @@ -423,7 +423,7 @@ used by OpenVINO, typically obtained by converting models of supported framework * The ``convert_model()`` method: - You can use ``mo`` command-line tool to convert a model to IR. The obtained IR + You can use ``ovc`` to convert a model to IR. The obtained IR can then be read by ``read_model()`` and inferred. .. dropdown:: List of supported formats: @@ -557,7 +557,7 @@ used by OpenVINO, typically obtained by converting models of supported framework * The ``convert_model()`` method: - You can use ``mo`` command-line tool to convert a model to IR. The obtained IR + You can use ``ovc`` to convert a model to IR. The obtained IR can then be read by ``read_model()`` and inferred. .. dropdown:: List of supported formats: @@ -708,6 +708,6 @@ multiple times: Additional Resources #################### -* :doc:`Transition guide from the legacy to new conversion API <../../documentation/legacy-features/transition-legacy-conversion-api>` +* Learn about the :doc:`parameters to adjust model conversion <./conversion-parameters>`. * `Download models from Hugging Face `__. diff --git a/docs/articles_en/openvino-workflow/running-inference/dynamic-shapes.rst b/docs/articles_en/openvino-workflow/running-inference/dynamic-shapes.rst index 9de4ba9df18827..b9978f3767562e 100644 --- a/docs/articles_en/openvino-workflow/running-inference/dynamic-shapes.rst +++ b/docs/articles_en/openvino-workflow/running-inference/dynamic-shapes.rst @@ -139,7 +139,7 @@ To check if a model already has dynamic dimensions, first load it with the ``rea If the input model already has dynamic dimensions, that will not change during inference. If the inputs will not be used dynamically, it is recommended to set them to static values using the ``reshape`` method to save application memory and potentially improve inference speed. The OpenVINO API supports any combination of static and dynamic dimensions. -Static and dynamic dimensions can also be set when converting the model with ``convert_model()``. 
It has identical capabilities to the ``reshape`` method, so you can save time by converting the model with dynamic shapes beforehand rather than in the application code. To get information about setting input shapes using ``convert_model()``, refer to :doc:`Setting Input Shapes <../../documentation/legacy-features/transition-legacy-conversion-api/legacy-conversion-api/[legacy]-setting-input-shapes>`. +Static and dynamic dimensions can also be set when converting the model with ``convert_model()``. It has identical capabilities to the ``reshape`` method, so you can save time by converting the model with dynamic shapes beforehand rather than in the application code. To get information about setting input shapes using ``convert_model()``, refer to :doc:`Setting Input Shapes <./changing-input-shape>`. Dimension Bounds ---------------- diff --git a/docs/articles_en/openvino-workflow/running-inference/inference-devices-and-modes.rst b/docs/articles_en/openvino-workflow/running-inference/inference-devices-and-modes.rst index aa8e9cdabfda64..31d0af303c633a 100644 --- a/docs/articles_en/openvino-workflow/running-inference/inference-devices-and-modes.rst +++ b/docs/articles_en/openvino-workflow/running-inference/inference-devices-and-modes.rst @@ -31,7 +31,6 @@ different conditions: | :doc:`Automatic Device Selection (AUTO) ` | :doc:`Heterogeneous Execution (HETERO) ` | :doc:`Automatic Batching Execution (Auto-batching) ` -| :doc:`[DEPRECATED] Multi-Device Execution (MULTI) <../../documentation/legacy-features/multi-device>` To learn how to change the device configuration, read the :doc:`Query device properties article `. diff --git a/docs/articles_en/openvino-workflow/running-inference/inference-devices-and-modes/auto-device-selection.rst b/docs/articles_en/openvino-workflow/running-inference/inference-devices-and-modes/auto-device-selection.rst index 6bebf087052b75..a5ab0c845dfa66 100644 --- a/docs/articles_en/openvino-workflow/running-inference/inference-devices-and-modes/auto-device-selection.rst +++ b/docs/articles_en/openvino-workflow/running-inference/inference-devices-and-modes/auto-device-selection.rst @@ -513,7 +513,6 @@ Additional Resources * `Automatic Device Selection with OpenVINO™ Notebook `__ * :doc:`Debugging AUTO ` -* :doc:`(LEGACY) Running on Multiple Devices Simultaneously <../../../documentation/legacy-features/multi-device>` * :doc:`Inference Devices and Modes <../inference-devices-and-modes>` diff --git a/docs/articles_en/openvino-workflow/running-inference/inference-devices-and-modes/gpu-device.rst b/docs/articles_en/openvino-workflow/running-inference/inference-devices-and-modes/gpu-device.rst index b4e1c7ac15afcc..2adf3e7f9d1e4d 100644 --- a/docs/articles_en/openvino-workflow/running-inference/inference-devices-and-modes/gpu-device.rst +++ b/docs/articles_en/openvino-workflow/running-inference/inference-devices-and-modes/gpu-device.rst @@ -124,7 +124,7 @@ Selected precision of each primitive depends on the operation precision in IR, q The ``u1``/``u8``/``i8`` data types are used for quantized operations only, which means that they are not selected automatically for non-quantized operations. For more details on how to get a quantized model, refer to the :doc:`Model Optimization guide <../../model-optimization>`. -Floating-point precision of a GPU primitive is selected based on operation precision in the OpenVINO IR, except for the :doc:``, which is executed in the ``f16`` precision. 
+Floating-point precision of a GPU primitive is selected based on operation precision in the OpenVINO IR, except for the :doc:``, which is executed in the ``f16`` precision. .. note:: diff --git a/docs/articles_en/openvino-workflow/running-inference/inference-devices-and-modes/npu-device.rst b/docs/articles_en/openvino-workflow/running-inference/inference-devices-and-modes/npu-device.rst index 7b135fa7ff0b14..2ba25507802288 100644 --- a/docs/articles_en/openvino-workflow/running-inference/inference-devices-and-modes/npu-device.rst +++ b/docs/articles_en/openvino-workflow/running-inference/inference-devices-and-modes/npu-device.rst @@ -146,6 +146,8 @@ offer a limited set of supported OpenVINO features. ov::intel_npu::turbo ov::intel_npu::tiles ov::intel_npu::max_tiles + ov::intel_npu::bypass_umd_caching + ov::intel_npu::defer_weights_load .. tab-item:: Read-only properties @@ -168,7 +170,6 @@ offer a limited set of supported OpenVINO features. ov::intel_npu::device_alloc_mem_size ov::intel_npu::device_total_mem_size ov::intel_npu::driver_version - ov::intel_npu::bypass_umd_caching .. note:: @@ -249,11 +250,11 @@ or **ov::intel_npu::max_tiles and ov::intel_npu::tiles** -the ``max_tiles`` property is read-write to enable compiling models off-device. +the ``max_tiles`` property is read-write to enable compiling models off-device. When on NPU, ``max_tiles`` will return the number of tiles the device has. Setting the number of tiles to compile for (via ``intel_npu::tiles``), when on device, -must be preceded by reading ``intel_npu::max_tiles`` first, to make sure that -``ov::intel_npu::tiles`` <= ``ov::intel_npu::max_tiles`` +must be preceded by reading ``intel_npu::max_tiles`` first, to make sure that +``ov::intel_npu::tiles`` <= ``ov::intel_npu::max_tiles`` to avoid exceptions from the compiler. .. note:: @@ -280,7 +281,3 @@ Additional Resources * `Working with NPUs in OpenVINO™ Notebook `__ * `Vision colorization Notebook <./../../../notebooks/vision-image-colorization-with-output.html>`__ -* `Classification Benchmark C++ Demo `__ -* `3D Human Pose Estimation Python Demo `__ -* `Object Detection C++ Demo `__ -* `Object Detection Python Demo `__ diff --git a/docs/articles_en/openvino-workflow/running-inference/integrate-openvino-with-your-application/model-representation.rst b/docs/articles_en/openvino-workflow/running-inference/integrate-openvino-with-your-application/model-representation.rst index 6ab924a61ef150..259f605d46c2f7 100644 --- a/docs/articles_en/openvino-workflow/running-inference/integrate-openvino-with-your-application/model-representation.rst +++ b/docs/articles_en/openvino-workflow/running-inference/integrate-openvino-with-your-application/model-representation.rst @@ -247,57 +247,50 @@ OpenVINO™ provides several debug capabilities: * Model can be visualized to image from the xDot format: -.. tab-set:: - - .. tab-item:: Python - :sync: py - - .. doxygensnippet:: docs/articles_en/assets/snippets/ov_model_snippets.py - :language: python - :fragment: [ov:visualize] - - .. tab-item:: C++ - :sync: cpp - - .. doxygensnippet:: docs/articles_en/assets/snippets/ov_model_snippets.cpp - :language: cpp - :fragment: [ov:visualize] - - -.. code-block:: sh + .. tab-set:: - `ov::pass::VisualizeTree` can be parametrized via environment variables: + .. tab-item:: Python + :sync: py - OV_VISUALIZE_TREE_OUTPUT_SHAPES=1 - visualize shapes + .. 
doxygensnippet:: docs/articles_en/assets/snippets/ov_model_snippets.py + :language: python + :fragment: [ov:visualize] - OV_VISUALIZE_TREE_OUTPUT_TYPES=1 - visualize types + .. tab-item:: C++ + :sync: cpp - OV_VISUALIZE_TREE_MIN_MAX_DENORMAL=1 - pretty denormal values + .. doxygensnippet:: docs/articles_en/assets/snippets/ov_model_snippets.cpp + :language: cpp + :fragment: [ov:visualize] - OV_VISUALIZE_TREE_RUNTIME_INFO=1 - print runtime information - OV_VISUALIZE_TREE_IO=1 - print I/O ports + ``ov::pass::VisualizeTree`` can be parametrized via environment variables: - OV_VISUALIZE_TREE_MEMBERS_NAME=1 - print member names + * ``OV_VISUALIZE_TREE_OUTPUT_SHAPES=1`` - visualize shapes + * ``OV_VISUALIZE_TREE_OUTPUT_TYPES=1`` - visualize types + * ``OV_VISUALIZE_TREE_MIN_MAX_DENORMAL=1`` - pretty denormal values + * ``OV_VISUALIZE_TREE_RUNTIME_INFO=1`` - print runtime information + * ``OV_VISUALIZE_TREE_IO=1`` - print I/O ports + * ``OV_VISUALIZE_TREE_MEMBERS_NAME=1`` - print member names * Also model can be serialized to IR: -.. tab-set:: + .. tab-set:: - .. tab-item:: Python - :sync: py + .. tab-item:: Python + :sync: py - .. doxygensnippet:: docs/articles_en/assets/snippets/ov_model_snippets.py - :language: python - :fragment: [ov:serialize] + .. doxygensnippet:: docs/articles_en/assets/snippets/ov_model_snippets.py + :language: python + :fragment: [ov:serialize] - .. tab-item:: C++ - :sync: cpp + .. tab-item:: C++ + :sync: cpp - .. doxygensnippet:: docs/articles_en/assets/snippets/ov_model_snippets.cpp - :language: cpp - :fragment: [ov:serialize] + .. doxygensnippet:: docs/articles_en/assets/snippets/ov_model_snippets.cpp + :language: cpp + :fragment: [ov:serialize] Additional Resources @@ -306,5 +299,3 @@ Additional Resources * :doc:`Available Operation Sets <../../../documentation/openvino-ir-format/operation-sets/available-opsets>`. * :doc:`OpenVINO™ Runtime Extensibility Developer Guide <../../../documentation/openvino-extensibility>`. * :doc:`Transformations Developer Guide <../../../documentation/openvino-extensibility/transformation-api>`. - - diff --git a/docs/articles_en/openvino-workflow/running-inference/optimize-inference/general-optimizations.rst b/docs/articles_en/openvino-workflow/running-inference/optimize-inference/general-optimizations.rst index b8ec2da9235fd4..5f01623d248755 100644 --- a/docs/articles_en/openvino-workflow/running-inference/optimize-inference/general-optimizations.rst +++ b/docs/articles_en/openvino-workflow/running-inference/optimize-inference/general-optimizations.rst @@ -18,7 +18,7 @@ Inputs Pre-Processing with OpenVINO In many cases, a network expects a pre-processed image. It is advised not to perform any unnecessary steps in the code: -* Model conversion API can efficiently incorporate the mean and normalization (scale) values into a model (for example, to the weights of the first convolution). For more details, see the :doc:`relevant model conversion API command-line parameters <../../../documentation/legacy-features/transition-legacy-conversion-api/legacy-conversion-api/[legacy]-embedding-preprocessing-computation>`. +* Model conversion API can efficiently incorporate the mean and normalization (scale) values into a model (for example, to the weights of the first convolution). For more details, see the :doc:`relevant model conversion API command-line parameters <../../../openvino-workflow/running-inference/optimize-inference/optimize-preprocessing/preprocessing-api-details>`. 
* Let OpenVINO accelerate other means of :doc:`Image Pre-processing and Conversion ` * Data which is already in the "on-device" memory can be input directly by using the :doc:`remote tensors API of the GPU Plugin <../inference-devices-and-modes/gpu-device/remote-tensor-api-gpu-plugin>`. @@ -60,7 +60,7 @@ Below are example-codes for the regular and async-based approaches to compare: The technique can be generalized to any available parallel slack. For example, you can do inference and simultaneously encode the resulting or previous frames or run further inference, like emotion detection on top of the face detection results. -Refer to the `Object Detection C++ Demo `__ , `Object Detection Python Demo `__ (latency-oriented Async API showcase) and :doc:`Benchmark App Sample <../../../learn-openvino/openvino-samples/benchmark-tool>` for complete examples of the Async API in action. +Refer to the :doc:`Benchmark App Sample <../../../learn-openvino/openvino-samples/benchmark-tool>` for complete examples of the Async API in action. .. note:: diff --git a/docs/articles_en/openvino-workflow/running-inference/optimize-inference/optimize-preprocessing/layout-api-overview.rst b/docs/articles_en/openvino-workflow/running-inference/optimize-inference/optimize-preprocessing/layout-api-overview.rst index 690b606ff3720a..1562165916e576 100644 --- a/docs/articles_en/openvino-workflow/running-inference/optimize-inference/optimize-preprocessing/layout-api-overview.rst +++ b/docs/articles_en/openvino-workflow/running-inference/optimize-inference/optimize-preprocessing/layout-api-overview.rst @@ -23,7 +23,6 @@ Below is a list of cases where input/output layout is important: * :doc:`Convert to OpenVINO <../../../model-preparation/convert-model-to-ir>` * `OpenVINO Model Conversion Tutorial `__ - * :doc:`[LEGACY] Model Optimizer Embedding Preprocessing Computation <../../../../documentation/legacy-features/transition-legacy-conversion-api/legacy-conversion-api/[legacy]-embedding-preprocessing-computation>` guide. * Improving the readability of a model input and output. diff --git a/docs/articles_en/openvino-workflow/running-inference/optimize-inference/optimizing-throughput.rst b/docs/articles_en/openvino-workflow/running-inference/optimize-inference/optimizing-throughput.rst index 18c18c5f7d05b8..8aafd9ceb4faec 100644 --- a/docs/articles_en/openvino-workflow/running-inference/optimize-inference/optimizing-throughput.rst +++ b/docs/articles_en/openvino-workflow/running-inference/optimize-inference/optimizing-throughput.rst @@ -63,18 +63,7 @@ In general, most throughput-oriented inference applications should: * Use the Async API with callbacks, to avoid any dependency on the completion order of the requests and possible device starvation, as explained in the :doc:`common-optimizations section `. -Multi-Device Execution -###################### - -OpenVINO offers the automatic, scalable :doc:`multi-device inference mode <../../../documentation/legacy-features/multi-device>`, which is a simple *application-transparent* way to improve throughput. There is no need to re-architecture existing applications for any explicit multi-device support: no explicit network loading to each device, no separate per-device queues, no additional logic to balance inference requests between devices, etc. For the application using it, multi-device is like any other device, as it manages all processes internally. 
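As a reference for the "Async API with callbacks" recommendation above, the pattern can be sketched in Python with ``ov.AsyncInferQueue``. This is an illustrative sketch only, not taken from the OpenVINO samples: it assumes a placeholder ``model.xml`` IR with a single, statically shaped input and a single output, and uses random tensors in place of real frames.

.. code-block:: python

    import numpy as np
    import openvino as ov

    core = ov.Core()
    # "model.xml" is a placeholder for any converted IR with one input and one output.
    compiled = core.compile_model("model.xml", "CPU", {"PERFORMANCE_HINT": "THROUGHPUT"})

    results = {}

    def on_done(request, frame_id):
        # The callback fires as soon as this particular request finishes,
        # independently of the order in which the requests were submitted.
        results[frame_id] = request.get_output_tensor().data.copy()

    # Without an explicit size, the queue creates the optimal number of requests for the hint above.
    infer_queue = ov.AsyncInferQueue(compiled)
    infer_queue.set_callback(on_done)

    input_shape = list(compiled.input(0).shape)
    for frame_id in range(16):
        frame = np.random.rand(*input_shape).astype(np.float32)  # stands in for a decoded frame
        infer_queue.start_async({compiled.input(0): frame}, userdata=frame_id)

    infer_queue.wait_all()
    print(f"Processed {len(results)} frames asynchronously")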
-Just like with other throughput-oriented scenarios, there are several major pre-requisites for optimal multi-device performance: - -* Using the :ref:`Asynchronous API ` and :doc:`callbacks <../integrate-openvino-with-your-application/inference-request>` in particular. -* Providing the multi-device (and hence the underlying devices) with enough data to crunch. As the inference requests are naturally independent data pieces, the multi-device performs load-balancing at the "requests" (outermost) level to minimize the scheduling overhead. - -Keep in mind that the resulting performance is usually a fraction of the "ideal" (plain sum) value, when the devices compete for certain resources such as the memory-bandwidth, which is shared between CPU and iGPU. - .. note:: - While the legacy approach of optimizing the parameters of each device separately works, the :doc:`Automatic Device Selection <../inference-devices-and-modes/auto-device-selection>` allow configuring all devices (that are part of the specific multi-device configuration) at once. + The :doc:`Automatic Device Selection <../inference-devices-and-modes/auto-device-selection>` allows configuration of all devices at once. diff --git a/docs/articles_en/openvino-workflow/torch-compile.rst b/docs/articles_en/openvino-workflow/torch-compile.rst index e5bc0ca901a5aa..8c6016bfd4742f 100644 --- a/docs/articles_en/openvino-workflow/torch-compile.rst +++ b/docs/articles_en/openvino-workflow/torch-compile.rst @@ -310,10 +310,84 @@ officially. However, it can be accessed by running the following instructions: if sys.version_info >= (3, 11): `raise RuntimeError("Python 3.11+ not yet supported for torch.compile") +TorchServe Integration ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ + +TorchServe is a performant, flexible, and easy to use tool for serving PyTorch models in production. For more information on the details of TorchServe, +you can refer to `TorchServe github repository. `__. With OpenVINO ``torch.compile`` integration into TorchServe you can serve +PyTorch models in production and accelerate them with OpenVINO on various Intel hardware. Detailed instructions on how to use OpenVINO with TorchServe are +available in `TorchServe examples. `__ and in a `use case app `__. + +Support for Automatic1111 Stable Diffusion WebUI ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ + +Automatic1111 Stable Diffusion WebUI is an open-source repository that hosts a browser-based interface for the Stable Diffusion +based image generation. It allows users to create realistic and creative images from text prompts. +Stable Diffusion WebUI is supported on Intel CPUs, Intel integrated GPUs, and Intel discrete GPUs by leveraging OpenVINO +``torch.compile`` capability. Detailed instructions are available in +`Stable Diffusion WebUI repository. `__ + + +Model Quantization and Weights Compression +############################################# + +Model quantization and weights compression are effective methods for accelerating model inference and reducing memory consumption, with minimal impact on model accuracy. The `torch.compile` OpenVINO backend supports two key model optimization APIs: + +1. Neural Network Compression Framework (`NNCF `__). NNCF offers advanced algorithms for post-training quantization and weights compression in the OpenVINO toolkit. + +2. PyTorch 2 export quantization. A general-purpose API designed for quantizing models captured by ``torch.export``. 
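To make the first option concrete end to end, here is a small self-contained sketch (an illustration rather than part of the official sample set): a toy convolutional model and synthetic calibration data stand in for a real workload, ``nncf.Dataset`` wraps the calibration loader, and the quantized model is then compiled with the OpenVINO backend.

.. code-block:: python

    import nncf
    import openvino.torch  # noqa: F401  (registers the "openvino" backend for torch.compile)
    import torch
    import torch.nn as nn

    # A toy classifier and synthetic data stand in for a real model and dataset.
    model = nn.Sequential(
        nn.Conv2d(3, 8, 3, padding=1), nn.ReLU(), nn.Flatten(), nn.Linear(8 * 32 * 32, 10)
    ).eval()

    images = torch.randn(64, 3, 32, 32)
    labels = torch.randint(0, 10, (64,))
    calibration_loader = torch.utils.data.DataLoader(
        torch.utils.data.TensorDataset(images, labels), batch_size=8
    )

    def transform_fn(data_item):
        # Each loader item is (images, labels); only the images are fed to the model.
        images, _ = data_item
        return images

    # nncf.Dataset tells NNCF how to turn loader items into model inputs during calibration.
    calibration_dataset = nncf.Dataset(calibration_loader, transform_fn)
    quantized_model = nncf.quantize(model, calibration_dataset)

    # Compile the quantized model with the OpenVINO backend for accelerated inference.
    optimized_model = torch.compile(quantized_model, backend="openvino")
    print(optimized_model(torch.randn(1, 3, 32, 32)).shape)

The ``transform_fn`` / ``nncf.Dataset`` pair keeps calibration independent of how the data loader batches its items, which is why the same pattern applies to arbitrary datasets.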
+ +NNCF is the recommended approach for model quantization and weights compression. NNCF specifically optimizes models for the OpenVINO backend, providing optimal results in terms of inference speed and accuracy. + + +NNCF Model Optimization Support (Preview) ++++++++++++++++++++++++++++++++++++++++++++++ + +The Neural Network Compression Framework (`NNCF `__) implements advanced quantization and weights compression algorithms, which can be applied to ``torch.fx.GraphModule`` to speed up inference +and decrease memory consumption. + +Model quantization example: + +.. code-block:: python + + import nncf + import openvino.torch + import torch + + calibration_loader = torch.utils.data.DataLoader(...) + + def transform_fn(data_item): + images, _ = data_item + return images + + # Model quantization + quantized_model = nncf.quantize(model, calibration_dataset) + + quantized_model = torch.compile(quantized_model, backend="openvino") + +Model weights compression example: + +.. code-block:: python + + import nncf + import openvino.torch + import torch + + # Weights compression + compressed_model = nncf.compress_model(model) + + compressed_model = torch.compile(compressed_model, backend="openvino") + +NNCF unlocks the full potential of low-precision OpenVINO kernels due to the placement of quantizers designed specifically for the OpenVINO. +Advanced algorithms like ``SmoothQuant`` or ``BiasCorrection`` allow further metrics improvement while minimizing the outputs discrepancies between the original and compressed models. +For further details, please see the `documentation `__ +and a `tutorial `__. + Support for PyTorch 2 export quantization (Preview) +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ -PyTorch 2 export quantization is supported by OpenVINO backend in ``torch.compile``. To be able +NNCF is the default way to compress models for the OpenVINO backend, however +PyTorch 2 export quantization is supported by OpenVINO backend in ``torch.compile`` as well. To be able to access this feature, follow the steps provided in `PyTorch 2 Export Post Training Quantization with X86 Backend through Inductor `__ and update the provided sample as explained below. @@ -347,24 +421,6 @@ and update the provided sample as explained below. optimized_model = torch.compile(converted_model, backend="openvino") -TorchServe Integration -+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ - -TorchServe is a performant, flexible, and easy to use tool for serving PyTorch models in production. For more information on the details of TorchServe, -you can refer to `TorchServe github repository. `__. With OpenVINO ``torch.compile`` integration into TorchServe you can serve -PyTorch models in production and accelerate them with OpenVINO on various Intel hardware. Detailed instructions on how to use OpenVINO with TorchServe are -available in `TorchServe examples. `__ - -Support for Automatic1111 Stable Diffusion WebUI -+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ - -Automatic1111 Stable Diffusion WebUI is an open-source repository that hosts a browser-based interface for the Stable Diffusion -based image generation. It allows users to create realistic and creative images from text prompts. -Stable Diffusion WebUI is supported on Intel CPUs, Intel integrated GPUs, and Intel discrete GPUs by leveraging OpenVINO -``torch.compile`` capability. Detailed instructions are available in -`Stable Diffusion WebUI repository. 
`__ - - Architecture ################# diff --git a/docs/dev/build_mac_arm.md b/docs/dev/build_mac_arm.md index 5a1a3698568f95..8b9781e46a5c96 100644 --- a/docs/dev/build_mac_arm.md +++ b/docs/dev/build_mac_arm.md @@ -14,14 +14,14 @@ The software was validated on: - [brew](https://brew.sh) package manager to install additional dependencies. Use [install brew](https://brew.sh) guide to achieve this. - Installation step for python and python libraries varies depending on the host architecture: - - **arm64** Python 3.9 - 3.12 for the OpenVINO Runtime Python API, Development tools (Model Optimizer, POT and others): + - **arm64** Python 3.9 - 3.12 for the OpenVINO Runtime Python API: ```sh % # let's have a look what python versions are available in brew % brew search python % # select preferred version of python based on available ones, e.g. 3.11 % brew install python@3.11 ``` - - **x86_64** Select universal2 installer from [Python releases](https://www.python.org/downloads/macos/) download page and install `python-3.X.Y-macos11.pkg` image. This allows to have universal python libraries, build x86_64 OpenVINO Python API and Development tools. + - **x86_64** Select universal2 installer from [Python releases](https://www.python.org/downloads/macos/) download page and install `python-3.X.Y-macos11.pkg` image. This allows you to have universal python libraries of OpenVINO Python API (build x86_64). - Clang compiler and other command line tools from Xcode 10.1 or higher: ```sh @@ -35,13 +35,13 @@ The software was validated on: ```sh % brew install tbb pugixml flatbuffers snappy protobuf ``` -- Additional `pip` dependencies to build OpenVINO Runtime Python API, Development tools (Model Optimizer, POT and others): +- Additional `pip` dependencies to build OpenVINO Runtime Python API: ```sh % # update pip and setuptools to newer versions % python3 -m pip install -U pip % python3 -m pip install -r /src/bindings/python/requirements.txt ``` - Additional install requirements (after OpenVINO repo clone) in order to build OpenVINO Python API and Development tools as wheel packages: + Additional install requirements (after OpenVINO repo clone) in order to build OpenVINO Python API as wheel packages: ```sh % python3 -m pip install -r /src/bindings/python/wheel/requirements-dev.txt ``` diff --git a/docs/dev/build_mac_intel_cpu.md b/docs/dev/build_mac_intel_cpu.md index f5b70d73709c20..735c8a97a3b3df 100644 --- a/docs/dev/build_mac_intel_cpu.md +++ b/docs/dev/build_mac_intel_cpu.md @@ -12,14 +12,14 @@ The software was validated on: - [brew](https://brew.sh) package manager to install additional dependencies. Use [install brew](https://brew.sh) guide to achieve this. - Installation step for python and python libraries varies depending on the host architecture: - - **x86_64** Python 3.9 - 3.12 for the OpenVINO Runtime Python API, Development tools (Model Optimizer, POT and others): + - **x86_64** Python 3.9 - 3.12 for the OpenVINO Runtime Python API: ```sh % # let's have a look what python versions are available in brew % brew search python % # select preferred version of python based on available ones, e.g. 3.11 % brew install python@3.11 ``` - - **arm64** Select universal2 installer from [Python releases](https://www.python.org/downloads/macos/) download page and install `python-3.X.Y-macos11.pkg` image. This allows to have universal python libraries, build x86_64 OpenVINO Python API and Development tools. 
+ - **arm64** Select universal2 installer from [Python releases](https://www.python.org/downloads/macos/) download page and install `python-3.X.Y-macos11.pkg` image. This allows to have universal python libraries of OpenVINO Python API (build x86_64) . - [CMake](https://cmake.org/download/) 3.13 or higher and other development tools: ```sh % brew install cmake scons fdupes git-lfs ninja @@ -32,13 +32,13 @@ The software was validated on: ```sh % brew install tbb pugixml flatbuffers snappy protobuf ``` -- Additional `pip` dependencies to build OpenVINO Runtime Python API, Development tools (Model Optimizer, POT and others): +- Additional `pip` dependencies to build OpenVINO Runtime Python API: ```sh % # update pip and setuptools to newer versions % python3 -m pip install -U pip % python3 -m pip install -r /src/bindings/python/requirements.txt ``` - Additional install requirements (after OpenVINO repo clone) in order to build OpenVINO Python API and Development tools as wheel packages: + Additional install requirements (after OpenVINO repo clone) in order to build OpenVINO Python API: ```sh % python3 -m pip install -r /src/bindings/python/wheel/requirements-dev.txt ``` diff --git a/docs/dev/installing.md b/docs/dev/installing.md index de4c7ba9df9af6..c20b2ce183de3c 100644 --- a/docs/dev/installing.md +++ b/docs/dev/installing.md @@ -6,200 +6,87 @@ Once the project is built you can install OpenVINO™ Runtime into custom locati cmake --install --prefix ``` -## Installation check +## Build and Run Samples -
-For versions prior to 2022.1 -

+1. Build samples. -1. Obtaining Open Model Zoo tools and models + To build C++ sample applications, run the following commands: -To have the ability to run samples and demos, you need to clone the Open Model Zoo repository and copy the folder under `./deployment_tools` to your install directory: + Linux and macOS: + ```sh + cd /samples/cpp + ./build_samples.sh + ``` -``` -git clone https://github.com/openvinotoolkit/open_model_zoo.git -cmake -E copy_directory ./open_model_zoo/ /deployment_tools/open_model_zoo/ -``` - -2. Adding OpenCV to your environment - -Open Model Zoo samples use OpenCV functionality to load images. To use it for demo builds you need to provide the path to your OpenCV custom build by setting `OpenCV_DIR` environment variable and add path OpenCV libraries to the `LD_LIBRARY_PATH (Linux)` or `PATH (Windows)` variable before running demos. - -Linux: -```sh -export LD_LIBRARY_PATH=/path/to/opencv_install/lib/:$LD_LIBRARY_PATH -export OpenCV_DIR=/path/to/opencv_install/cmake -``` - -Windows: -```sh -set PATH=\path\to\opencv_install\bin\;%PATH% -set OpenCV_DIR=\path\to\opencv_install\cmake -``` - -3. Running demo - -To check your installation go to the demo directory and run Classification Demo: - -Linux and macOS: -```sh -cd /deployment_tools/demo -./demo_squeezenet_download_convert_run.sh -``` - -Windows: -```sh -cd \deployment_tools\demo -demo_squeezenet_download_convert_run.bat -``` - -Result: -``` -Top 10 results: + Windows Command Prompt: + ```sh + cd \samples\cpp + build_samples_msvc.bat + ``` -Image /deployment_tools/demo/car.png - -classid probability label -------- ----------- ----- -817 0.6853030 sports car, sport car -479 0.1835197 car wheel -511 0.0917197 convertible -436 0.0200694 beach wagon, station wagon, wagon, estate car, beach waggon, station waggon, waggon -751 0.0069604 racer, race car, racing car -656 0.0044177 minivan -717 0.0024739 pickup, pickup truck -581 0.0017788 grille, radiator grille -468 0.0013083 cab, hack, taxi, taxicab -661 0.0007443 Model T - -[ INFO ] Execution successful -``` + Windows PowerShell: + ```sh + & /build_samples.ps1 + ``` -

-
+2. Download a model. + You can download an image classification model from + [Hugging Face](https://huggingface.co/models?pipeline_tag=image-classification&sort=trending) + to run the sample. -
- For 2022.1 and after -

+4. Convert the model. -1. Build samples + Linux and macOS: + ```sh + ovc --compress_to_fp16=True + ``` + Windows: + ```bat + ovc --compress_to_fp16=True + ``` -To build C++ sample applications, run the following commands: +5. Run inference on the sample. -Linux and macOS: -```sh -cd /samples/cpp -./build_samples.sh -``` + Set up the OpenVINO environment variables: -Windows Command Prompt: -```sh -cd \samples\cpp -build_samples_msvc.bat -``` + Linux and macOS: + ```sh + source /setupvars.sh + ``` -Windows PowerShell: -```sh -& /build_samples.ps1 -``` + Windows Command Prompt: + ```bat + \setupvars.bat + ``` -2. Install OpenVINO Development Tools + Windows PowerShell: + ```bat + . /setupvars.ps1 + ``` -> **NOTE**: To build OpenVINO Development Tools (Model Optimizer, Post-Training Optimization Tool, Model Downloader, and Open Model Zoo tools) wheel package locally you are required to use the CMake option: `-DENABLE_WHEEL=ON`. + The following commands run the Image Classification Code Sample using the [`dog.bmp`](https://storage.openvinotoolkit.org/data/test_data/images/ 224x224/dog.bmp) file as an input image, the model in IR format, and on different hardware devices: -To install OpenVINO Development Tools to work with Caffe models (OpenVINO support for Caffe is currently being deprecated and will be removed entirely in the future), execute the following commands: + Linux and macOS: -Linux and macOS: + ```sh + cd ~/openvino_cpp_samples_build//Release + ./classification_sample_async -i /dog.bmp -m /model.xml -d CPU + ``` + where the is the output of ``uname -m``, for example, ``intel64``, ``armhf``, or ``aarch64``. -```sh -#setup virtual environment -python3 -m venv openvino_env -source openvino_env/bin/activate -pip install pip --upgrade + Windows: -#install local package from install directory -pip install openvino_dev--py3-none-any.whl[caffe] --find-links=/tools -``` - -Windows: -```bat -rem setup virtual environment -python -m venv openvino_env -openvino_env\Scripts\activate.bat -pip install pip --upgrade - -rem install local package from install directory -cd \tools -pip install openvino_dev--py3-none-any.whl[caffe] --find-links=\tools -``` - -3. Download the Models - -Download the following model to run the Image Classification Sample: - -Linux and macOS: -```sh -omz_downloader --name googlenet-v1 --output_dir ~/models -``` - -Windows: -```bat -omz_downloader --name googlenet-v1 --output_dir %USERPROFILE%\Documents\models -``` - -4. Convert the Model with Model Optimizer - -Linux and macOS: -```sh -mkdir ~/ir -mo --input_model ~/models/public/googlenet-v1/googlenet-v1.caffemodel --compress_to_fp16 --output_dir ~/ir -``` -Windows: -```bat -mkdir %USERPROFILE%\Documents\ir -mo --input_model %USERPROFILE%\Documents\models\public\googlenet-v1\googlenet-v1.caffemodel --compress_to_fp16 --output_dir %USERPROFILE%\Documents\ir -``` - -5. Run Inference on the Sample - -Set up the OpenVINO environment variables: - -Linux and macOS: -```sh -source /setupvars.sh -``` - -Windows Command Prompt: -```bat -\setupvars.bat -``` - -Windows PowerShell: -```bat -. 
/setupvars.ps1 -``` - -The following commands run the Image Classification Code Sample using the [`dog.bmp`](https://storage.openvinotoolkit.org/data/test_data/images/224x224/dog.bmp) file as an input image, the model in IR format from the `ir` directory, and on different hardware devices: - -Linux and macOS: - -```sh -cd ~/openvino_cpp_samples_build//Release -./classification_sample_async -i ~/Downloads/dog.bmp -m ~/ir/googlenet-v1.xml -d CPU -``` -where the is the output of ``uname -m``, for example, ``intel64``, ``armhf``, or ``aarch64``. - -Windows: - -```bat -cd %USERPROFILE%\Documents\Intel\OpenVINO\openvino_cpp_samples_build\\Release -.\classification_sample_async.exe -i %USERPROFILE%\Downloads\dog.bmp -m %USERPROFILE%\Documents\ir\googlenet-v1.xml -d CPU -``` -where the is either ``intel64`` or ``aarch64`` depending on the platform architecture. + ```bat + cd %USERPROFILE%\Documents\Intel\OpenVINO\openvino_cpp_samples_build\\Release + .\classification_sample_async.exe -i \dog.bmp -m \model.xml -d CPU + ``` + where the is either ``intel64`` or ``aarch64`` depending on the platform architecture. When the sample application is complete, you see the label and confidence data for the top 10 categories on the display: +Below are results of using the googlenet-v1 model. + ``` Top 10 results: @@ -220,36 +107,9 @@ classid probability ``` -

-
## Adding OpenVINO Runtime to Your Project -
-For versions prior to 2022.1 -

- -For CMake projects, set the `InferenceEngine_DIR` and when you run CMake tool: - -```sh -cmake -DInferenceEngine_DIR=/path/to/openvino/build/ . -``` - -Then you can find Inference Engine by [`find_package`]: - -```cmake -find_package(InferenceEngine REQUIRED) -target_link_libraries(${PROJECT_NAME} PRIVATE ${InferenceEngine_LIBRARIES}) -``` -

-
- - -
-For 2022.1 and after -

- - For CMake projects, set the `OpenVINO_DIR` and when you run CMake tool: ```sh @@ -266,8 +126,6 @@ target_link_libraries(ov_app PRIVATE openvino::runtime) add_executable(ov_c_app main.c) target_link_libraries(ov_c_app PRIVATE openvino::runtime::c) ``` -

-
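If the OpenVINO Python bindings are also installed into your environment (for example, by building the wheel with `-DENABLE_WHEEL=ON` and installing it with `pip`), a quick way to verify that the installed runtime is picked up is a short Python check. This snippet is illustrative only and not part of the official installation steps:

```python
import openvino as ov

core = ov.Core()
print("OpenVINO version:", ov.get_version())
print("Available devices:", core.available_devices)
```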
## See also diff --git a/docs/dev/ov_dependencies.txt b/docs/dev/ov_dependencies.txt index d9c344d2c3048d..cb64e4d5a6534c 100644 --- a/docs/dev/ov_dependencies.txt +++ b/docs/dev/ov_dependencies.txt @@ -1,6 +1,6 @@ # Copyright (C) 2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 -#This file provides a comprehensive list of all dependencies of OpenVINO 2024.5 +#This file provides a comprehensive list of all dependencies of OpenVINO 2024.6 #The file is part of the automation pipeline for posting OpenVINO IR models on the HuggingFace Hub, including OneBOM dependency checks. diff --git a/docs/nbdoc/consts.py b/docs/nbdoc/consts.py index 1a4d3a13049041..e5c5d4773dce4c 100644 --- a/docs/nbdoc/consts.py +++ b/docs/nbdoc/consts.py @@ -6,7 +6,7 @@ repo_owner = "openvinotoolkit" repo_name = "openvino_notebooks" repo_branch = "tree/main" -artifacts_link = "http://repository.toolbox.iotg.sclab.intel.com/projects/ov-notebook/0.1.0-latest/20241120220806/dist/rst_files/" +artifacts_link = "http://repository.toolbox.iotg.sclab.intel.com/projects/ov-notebook/0.1.0-latest/20241209220902/dist/rst_files/" blacklisted_extensions = ['.xml', '.bin'] notebooks_repo = "https://github.com/openvinotoolkit/openvino_notebooks/blob/latest/" notebooks_binder = "https://mybinder.org/v2/gh/openvinotoolkit/openvino_notebooks/HEAD?filepath=" diff --git a/docs/notebooks/3D-pose-estimation-with-output.rst b/docs/notebooks/3D-pose-estimation-with-output.rst index 9e09d96094fc78..7959bf48a75e45 100644 --- a/docs/notebooks/3D-pose-estimation-with-output.rst +++ b/docs/notebooks/3D-pose-estimation-with-output.rst @@ -113,58 +113,57 @@ Lab instead.** Collecting torch Using cached https://download.pytorch.org/whl/cpu/torch-2.4.1%2Bcpu-cp38-cp38-linux_x86_64.whl (194.9 MB) Collecting tqdm - Using cached tqdm-4.67.0-py3-none-any.whl.metadata (57 kB) - Requirement already satisfied: ipywidgets>=7.2.1 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from pythreejs) (8.1.5) + Using cached tqdm-4.67.1-py3-none-any.whl.metadata (57 kB) + Requirement already satisfied: ipywidgets>=7.2.1 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from pythreejs) (8.1.5) Collecting ipydatawidgets>=1.1.1 (from pythreejs) Using cached ipydatawidgets-4.3.5-py2.py3-none-any.whl.metadata (1.4 kB) Collecting numpy (from pythreejs) Using cached numpy-1.24.4-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.6 kB) - Requirement already satisfied: traitlets in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from pythreejs) (5.14.3) + Requirement already satisfied: traitlets in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from pythreejs) (5.14.3) Collecting openvino-telemetry>=2023.2.1 (from openvino>=2024.4.0) Using cached openvino_telemetry-2024.5.0-py3-none-any.whl.metadata (2.3 kB) - Requirement already satisfied: packaging in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from openvino>=2024.4.0) (24.2) + Requirement already satisfied: packaging in 
/opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from openvino>=2024.4.0) (24.2) Collecting filelock (from torch) Using cached filelock-3.16.1-py3-none-any.whl.metadata (2.9 kB) - Requirement already satisfied: typing-extensions>=4.8.0 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from torch) (4.12.2) + Requirement already satisfied: typing-extensions>=4.8.0 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from torch) (4.12.2) Collecting sympy (from torch) Using cached sympy-1.13.3-py3-none-any.whl.metadata (12 kB) Collecting networkx (from torch) Using cached https://download.pytorch.org/whl/networkx-3.2.1-py3-none-any.whl (1.6 MB) - Requirement already satisfied: jinja2 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from torch) (3.1.4) + Requirement already satisfied: jinja2 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from torch) (3.1.4) Collecting fsspec (from torch) Using cached fsspec-2024.10.0-py3-none-any.whl.metadata (11 kB) Collecting traittypes>=0.2.0 (from ipydatawidgets>=1.1.1->pythreejs) Using cached traittypes-0.2.1-py2.py3-none-any.whl.metadata (1.0 kB) - Requirement already satisfied: comm>=0.1.3 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from ipywidgets>=7.2.1->pythreejs) (0.2.2) - Requirement already satisfied: ipython>=6.1.0 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from ipywidgets>=7.2.1->pythreejs) (8.12.3) - Requirement already satisfied: widgetsnbextension~=4.0.12 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from ipywidgets>=7.2.1->pythreejs) (4.0.13) - Requirement already satisfied: jupyterlab-widgets~=3.0.12 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from ipywidgets>=7.2.1->pythreejs) (3.0.13) - Requirement already satisfied: MarkupSafe>=2.0 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from jinja2->torch) (2.1.5) + Requirement already satisfied: comm>=0.1.3 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from ipywidgets>=7.2.1->pythreejs) (0.2.2) + Requirement already satisfied: ipython>=6.1.0 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from ipywidgets>=7.2.1->pythreejs) (8.12.3) + Requirement already satisfied: widgetsnbextension~=4.0.12 in 
/opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from ipywidgets>=7.2.1->pythreejs) (4.0.13) + Requirement already satisfied: jupyterlab-widgets~=3.0.12 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from ipywidgets>=7.2.1->pythreejs) (3.0.13) + Requirement already satisfied: MarkupSafe>=2.0 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from jinja2->torch) (2.1.5) INFO: pip is looking at multiple versions of networkx to determine which version is compatible with other requirements. This could take a while. Collecting networkx (from torch) Using cached networkx-3.1-py3-none-any.whl.metadata (5.3 kB) Collecting mpmath<1.4,>=1.1.0 (from sympy->torch) Using cached https://download.pytorch.org/whl/mpmath-1.3.0-py3-none-any.whl (536 kB) - Requirement already satisfied: backcall in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from ipython>=6.1.0->ipywidgets>=7.2.1->pythreejs) (0.2.0) - Requirement already satisfied: decorator in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from ipython>=6.1.0->ipywidgets>=7.2.1->pythreejs) (5.1.1) - Requirement already satisfied: jedi>=0.16 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from ipython>=6.1.0->ipywidgets>=7.2.1->pythreejs) (0.19.2) - Requirement already satisfied: matplotlib-inline in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from ipython>=6.1.0->ipywidgets>=7.2.1->pythreejs) (0.1.7) - Requirement already satisfied: pickleshare in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from ipython>=6.1.0->ipywidgets>=7.2.1->pythreejs) (0.7.5) - Requirement already satisfied: prompt-toolkit!=3.0.37,<3.1.0,>=3.0.30 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from ipython>=6.1.0->ipywidgets>=7.2.1->pythreejs) (3.0.48) - Requirement already satisfied: pygments>=2.4.0 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from ipython>=6.1.0->ipywidgets>=7.2.1->pythreejs) (2.18.0) - Requirement already satisfied: stack-data in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from ipython>=6.1.0->ipywidgets>=7.2.1->pythreejs) (0.6.3) - Requirement already satisfied: pexpect>4.3 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from ipython>=6.1.0->ipywidgets>=7.2.1->pythreejs) (4.9.0) - Requirement already satisfied: parso<0.9.0,>=0.8.4 in 
/opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from jedi>=0.16->ipython>=6.1.0->ipywidgets>=7.2.1->pythreejs) (0.8.4) - Requirement already satisfied: ptyprocess>=0.5 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from pexpect>4.3->ipython>=6.1.0->ipywidgets>=7.2.1->pythreejs) (0.7.0) - Requirement already satisfied: wcwidth in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from prompt-toolkit!=3.0.37,<3.1.0,>=3.0.30->ipython>=6.1.0->ipywidgets>=7.2.1->pythreejs) (0.2.13) - Requirement already satisfied: executing>=1.2.0 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from stack-data->ipython>=6.1.0->ipywidgets>=7.2.1->pythreejs) (2.1.0) - Requirement already satisfied: asttokens>=2.1.0 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from stack-data->ipython>=6.1.0->ipywidgets>=7.2.1->pythreejs) (2.4.1) - Requirement already satisfied: pure-eval in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from stack-data->ipython>=6.1.0->ipywidgets>=7.2.1->pythreejs) (0.2.3) - Requirement already satisfied: six>=1.12.0 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from asttokens>=2.1.0->stack-data->ipython>=6.1.0->ipywidgets>=7.2.1->pythreejs) (1.16.0) + Requirement already satisfied: backcall in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from ipython>=6.1.0->ipywidgets>=7.2.1->pythreejs) (0.2.0) + Requirement already satisfied: decorator in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from ipython>=6.1.0->ipywidgets>=7.2.1->pythreejs) (5.1.1) + Requirement already satisfied: jedi>=0.16 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from ipython>=6.1.0->ipywidgets>=7.2.1->pythreejs) (0.19.2) + Requirement already satisfied: matplotlib-inline in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from ipython>=6.1.0->ipywidgets>=7.2.1->pythreejs) (0.1.7) + Requirement already satisfied: pickleshare in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from ipython>=6.1.0->ipywidgets>=7.2.1->pythreejs) (0.7.5) + Requirement already satisfied: prompt-toolkit!=3.0.37,<3.1.0,>=3.0.30 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from ipython>=6.1.0->ipywidgets>=7.2.1->pythreejs) (3.0.48) + Requirement already satisfied: pygments>=2.4.0 in 
/opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from ipython>=6.1.0->ipywidgets>=7.2.1->pythreejs) (2.18.0) + Requirement already satisfied: stack-data in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from ipython>=6.1.0->ipywidgets>=7.2.1->pythreejs) (0.6.3) + Requirement already satisfied: pexpect>4.3 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from ipython>=6.1.0->ipywidgets>=7.2.1->pythreejs) (4.9.0) + Requirement already satisfied: parso<0.9.0,>=0.8.4 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from jedi>=0.16->ipython>=6.1.0->ipywidgets>=7.2.1->pythreejs) (0.8.4) + Requirement already satisfied: ptyprocess>=0.5 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from pexpect>4.3->ipython>=6.1.0->ipywidgets>=7.2.1->pythreejs) (0.7.0) + Requirement already satisfied: wcwidth in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from prompt-toolkit!=3.0.37,<3.1.0,>=3.0.30->ipython>=6.1.0->ipywidgets>=7.2.1->pythreejs) (0.2.13) + Requirement already satisfied: executing>=1.2.0 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from stack-data->ipython>=6.1.0->ipywidgets>=7.2.1->pythreejs) (2.1.0) + Requirement already satisfied: asttokens>=2.1.0 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from stack-data->ipython>=6.1.0->ipywidgets>=7.2.1->pythreejs) (3.0.0) + Requirement already satisfied: pure-eval in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from stack-data->ipython>=6.1.0->ipywidgets>=7.2.1->pythreejs) (0.2.3) Using cached pythreejs-2.4.2-py3-none-any.whl (3.4 MB) Using cached openvino-2024.4.0-16579-cp38-cp38-manylinux2014_x86_64.whl (42.6 MB) Using cached opencv_python-4.10.0.84-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (62.5 MB) - Using cached tqdm-4.67.0-py3-none-any.whl (78 kB) + Using cached tqdm-4.67.1-py3-none-any.whl (78 kB) Using cached ipydatawidgets-4.3.5-py2.py3-none-any.whl (271 kB) Using cached numpy-1.24.4-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (17.3 MB) Using cached openvino_telemetry-2024.5.0-py3-none-any.whl (23 kB) @@ -174,7 +173,7 @@ Lab instead.** Using cached sympy-1.13.3-py3-none-any.whl (6.2 MB) Using cached traittypes-0.2.1-py2.py3-none-any.whl (8.6 kB) Installing collected packages: openvino-telemetry, mpmath, traittypes, tqdm, sympy, numpy, networkx, fsspec, filelock, torch, openvino, opencv-python, ipydatawidgets, pythreejs - Successfully installed filelock-3.16.1 fsspec-2024.10.0 ipydatawidgets-4.3.5 mpmath-1.3.0 networkx-3.1 numpy-1.24.4 opencv-python-4.10.0.84 openvino-2024.4.0 openvino-telemetry-2024.5.0 pythreejs-2.4.2 sympy-1.13.3 torch-2.4.1+cpu tqdm-4.67.0 traittypes-0.2.1 
+ Successfully installed filelock-3.16.1 fsspec-2024.10.0 ipydatawidgets-4.3.5 mpmath-1.3.0 networkx-3.1 numpy-1.24.4 opencv-python-4.10.0.84 openvino-2024.4.0 openvino-telemetry-2024.5.0 pythreejs-2.4.2 sympy-1.13.3 torch-2.4.1+cpu tqdm-4.67.1 traittypes-0.2.1 Note: you may need to restart the kernel to use updated packages. @@ -250,7 +249,7 @@ Download the model .. parsed-literal:: - model/human-pose-estimation-3d.tar.gz: 0%| | 0.00/17.6M [00:00`__ tackles the task of generating animation sequences from a single character image. It @@ -34,10 +36,14 @@ Learn more in `GitHub repo `__ and `paper `__. -.. warning:: +.. container:: alert alert-warning - This tutorial requires at least **96 GB** of RAM for model conversion and **40 GB** for inference. Changing the values of ``HEIGHT``, ``WIDTH`` and ``VIDEO_LENGTH`` variables will change the memory consumption but will also affect accuracy. + :: +

! WARNING !

+

+ This tutorial requires at least 96 GB of RAM for model conversion and 40 GB for inference. Changing the values of HEIGHT, WIDTH and VIDEO_LENGTH variables will change the memory consumption but will also affect accuracy. +

**Table of contents:** @@ -70,9 +76,6 @@ need a Jupyter server to start. For details, please refer to `Installation Guide `__. -.. |image0| image:: https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/notebooks/animate-anyone/animate-anyone.gif - - Prerequisites ------------- @@ -110,13 +113,6 @@ Prerequisites %load_ext skip_kernel_extension - -.. parsed-literal:: - - Note: you may need to restart the kernel to use updated packages. - Note: you may need to restart the kernel to use updated packages. - - Note that we clone a fork of original repo with tweaked forward methods. .. code:: ipython3 @@ -171,9 +167,11 @@ Note that we clone a fork of original repo with tweaked forward methods. .. parsed-literal:: - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/diffusers/utils/outputs.py:63: FutureWarning: `torch.utils._pytree._register_pytree_node` is deprecated. Please use `torch.utils._pytree.register_pytree_node` instead. + /home/itrushkin/.virtualenvs/test/lib/python3.10/site-packages/diffusers/utils/outputs.py:63: UserWarning: torch.utils._pytree._register_pytree_node is deprecated. Please use torch.utils._pytree.register_pytree_node instead. + torch.utils._pytree._register_pytree_node( + /home/itrushkin/.virtualenvs/test/lib/python3.10/site-packages/diffusers/utils/outputs.py:63: UserWarning: torch.utils._pytree._register_pytree_node is deprecated. Please use torch.utils._pytree.register_pytree_node instead. torch.utils._pytree._register_pytree_node( - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/diffusers/utils/outputs.py:63: FutureWarning: `torch.utils._pytree._register_pytree_node` is deprecated. Please use `torch.utils._pytree.register_pytree_node` instead. + /home/itrushkin/.virtualenvs/test/lib/python3.10/site-packages/diffusers/utils/outputs.py:63: UserWarning: torch.utils._pytree._register_pytree_node is deprecated. Please use torch.utils._pytree.register_pytree_node instead. torch.utils._pytree._register_pytree_node( @@ -221,13 +219,6 @@ Prepare base model local_dir=local_dir, ) - - -.. parsed-literal:: - - diffusion_pytorch_model.bin: 0%| | 0.00/3.44G [00:00:2: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature. - :6: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. 
It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature. - :9: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature. - - Convert model to OpenVINO IR ---------------------------- @@ -423,7 +337,7 @@ semantic features are extracted through the CLIP image encoder for Cross-Attention. Temporal-Attention operates in the temporal dimension. Finally, the VAE decoder decodes the result into a video clip. -|image01| +.. image:: https://humanaigc.github.io/animate-anyone/static/images/f2_img.png The pipeline contains 6 PyTorch modules: @@ -463,8 +377,6 @@ compression parameters. More details about weights compression can be found in `OpenVINO documentation `__. -.. |image01| image:: https://humanaigc.github.io/animate-anyone/static/images/f2_img.png - .. code:: ipython3 %%skip not $SHOULD_CONVERT @@ -522,12 +434,14 @@ of the pipeline, it will be better to convert them to separate models. .. parsed-literal:: + WARNING:nncf:NNCF provides best results with torch==2.1.2, while current torch version is 2.2.2+cpu. If you encounter issues, consider switching to torch==2.1.2 INFO:nncf:Statistics of the bitwidth distribution: - ┍━━━━━━━━━━━━━━━━┯━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┯━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┑ - │ Num bits (N) │ % all parameters (layers) │ % ratio-defining parameters (layers) │ - ┝━━━━━━━━━━━━━━━━┿━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┿━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┥ - │ 8 │ 100% (32 / 32) │ 100% (32 / 32) │ - ┕━━━━━━━━━━━━━━━━┷━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┷━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┙ + +--------------+---------------------------+-----------------------------------+ + | Num bits (N) | % all parameters (layers) | % ratio-defining parameters | + | | | (layers) | + +==============+===========================+===================================+ + | 8 | 100% (32 / 32) | 100% (32 / 32) | + +--------------+---------------------------+-----------------------------------+ @@ -543,6 +457,14 @@ of the pipeline, it will be better to convert them to separate models. + + + + + + + + .. 
code:: ipython3 %%skip not $SHOULD_CONVERT @@ -568,11 +490,12 @@ of the pipeline, it will be better to convert them to separate models. .. parsed-literal:: INFO:nncf:Statistics of the bitwidth distribution: - ┍━━━━━━━━━━━━━━━━┯━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┯━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┑ - │ Num bits (N) │ % all parameters (layers) │ % ratio-defining parameters (layers) │ - ┝━━━━━━━━━━━━━━━━┿━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┿━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┥ - │ 8 │ 100% (40 / 40) │ 100% (40 / 40) │ - ┕━━━━━━━━━━━━━━━━┷━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┷━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┙ + +--------------+---------------------------+-----------------------------------+ + | Num bits (N) | % all parameters (layers) | % ratio-defining parameters | + | | | (layers) | + +==============+===========================+===================================+ + | 8 | 100% (40 / 40) | 100% (40 / 40) | + +--------------+---------------------------+-----------------------------------+ @@ -588,6 +511,14 @@ of the pipeline, it will be better to convert them to separate models. + + + + + + + + Reference UNet ~~~~~~~~~~~~~~ @@ -634,11 +565,12 @@ step. .. parsed-literal:: INFO:nncf:Statistics of the bitwidth distribution: - ┍━━━━━━━━━━━━━━━━┯━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┯━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┑ - │ Num bits (N) │ % all parameters (layers) │ % ratio-defining parameters (layers) │ - ┝━━━━━━━━━━━━━━━━┿━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┿━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┥ - │ 8 │ 100% (270 / 270) │ 100% (270 / 270) │ - ┕━━━━━━━━━━━━━━━━┷━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┷━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┙ + +--------------+---------------------------+-----------------------------------+ + | Num bits (N) | % all parameters (layers) | % ratio-defining parameters | + | | | (layers) | + +==============+===========================+===================================+ + | 8 | 100% (270 / 270) | 100% (270 / 270) | + +--------------+---------------------------+-----------------------------------+ @@ -654,6 +586,14 @@ step. + + + + + + + + Denoising UNet ~~~~~~~~~~~~~~ @@ -727,11 +667,12 @@ step. .. parsed-literal:: INFO:nncf:Statistics of the bitwidth distribution: - ┍━━━━━━━━━━━━━━━━┯━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┯━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┑ - │ Num bits (N) │ % all parameters (layers) │ % ratio-defining parameters (layers) │ - ┝━━━━━━━━━━━━━━━━┿━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┿━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┥ - │ 8 │ 100% (534 / 534) │ 100% (534 / 534) │ - ┕━━━━━━━━━━━━━━━━┷━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┷━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┙ + +--------------+---------------------------+-----------------------------------+ + | Num bits (N) | % all parameters (layers) | % ratio-defining parameters | + | | | (layers) | + +==============+===========================+===================================+ + | 8 | 100% (534 / 534) | 100% (534 / 534) | + +--------------+---------------------------+-----------------------------------+ @@ -747,6 +688,14 @@ step. + + + + + + + + Pose Guider ~~~~~~~~~~~ @@ -773,11 +722,12 @@ efficiently integrate pose control signals into the denoising process. .. 
parsed-literal:: INFO:nncf:Statistics of the bitwidth distribution: - ┍━━━━━━━━━━━━━━━━┯━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┯━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┑ - │ Num bits (N) │ % all parameters (layers) │ % ratio-defining parameters (layers) │ - ┝━━━━━━━━━━━━━━━━┿━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┿━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┥ - │ 8 │ 100% (8 / 8) │ 100% (8 / 8) │ - ┕━━━━━━━━━━━━━━━━┷━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┷━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┙ + +--------------+---------------------------+-----------------------------------+ + | Num bits (N) | % all parameters (layers) | % ratio-defining parameters | + | | | (layers) | + +==============+===========================+===================================+ + | 8 | 100% (8 / 8) | 100% (8 / 8) | + +--------------+---------------------------+-----------------------------------+ @@ -793,6 +743,14 @@ efficiently integrate pose control signals into the denoising process. + + + + + + + + Image Encoder ~~~~~~~~~~~~~ @@ -818,19 +776,19 @@ required for both reference and denoising UNets. .. parsed-literal:: - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/transformers/modeling_utils.py:5006: FutureWarning: `_is_quantized_training_enabled` is going to be deprecated in transformers 4.39.0. Please use `model.hf_quantizer.is_trainable` instead + /home/itrushkin/.virtualenvs/test/lib/python3.10/site-packages/transformers/modeling_utils.py:4225: FutureWarning: `_is_quantized_training_enabled` is going to be deprecated in transformers 4.39.0. Please use `model.hf_quantizer.is_trainable` instead warnings.warn( - `loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`. .. parsed-literal:: INFO:nncf:Statistics of the bitwidth distribution: - ┍━━━━━━━━━━━━━━━━┯━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┯━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┑ - │ Num bits (N) │ % all parameters (layers) │ % ratio-defining parameters (layers) │ - ┝━━━━━━━━━━━━━━━━┿━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┿━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┥ - │ 8 │ 100% (146 / 146) │ 100% (146 / 146) │ - ┕━━━━━━━━━━━━━━━━┷━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┷━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┙ + +--------------+---------------------------+-----------------------------------+ + | Num bits (N) | % all parameters (layers) | % ratio-defining parameters | + | | | (layers) | + +==============+===========================+===================================+ + | 8 | 100% (146 / 146) | 100% (146 / 146) | + +--------------+---------------------------+-----------------------------------+ @@ -846,6 +804,14 @@ required for both reference and denoising UNets. + + + + + + + + Inference --------- @@ -871,6 +837,15 @@ For starting work, please select inference device from dropdown list. device = device_widget() + + + +.. parsed-literal:: + + Dropdown(description='Device:', index=5, options=('CPU', 'GPU.0', 'GPU.1', 'GPU.2', 'GPU.3', 'AUTO'), value='A… + + + .. code:: ipython3 class OVPose2VideoPipeline(Pose2VideoPipeline): @@ -1168,7 +1143,7 @@ Video post-processing .. 
raw:: html @@ -1242,23 +1217,9 @@ Interactive inference demo = make_demo(fn=generate) try: - demo.queue().launch(debug=False) + demo.queue().launch(debug=True) except Exception: - demo.queue().launch(debug=False, share=True) + demo.queue().launch(debug=True, share=True) # if you are launching remotely, specify server_name and server_port # demo.launch(server_name='your server name', server_port='server port in int') # Read more in the docs: https://gradio.app/docs/" - - -.. parsed-literal:: - - Running on local URL: http://127.0.0.1:7860 - - To create a public link, set `share=True` in `launch()`. - - - - - - - diff --git a/docs/notebooks/async-api-with-output.rst b/docs/notebooks/async-api-with-output.rst index 9f9130a4fe0db2..86ff1bc0aa9b0c 100644 --- a/docs/notebooks/async-api-with-output.rst +++ b/docs/notebooks/async-api-with-output.rst @@ -59,7 +59,14 @@ Imports .. code:: ipython3 %pip install -q "openvino>=2023.1.0" - %pip install -q opencv-python "matplotlib>=3.4" + %pip install -q opencv-python tqdm "matplotlib>=3.4" + + +.. parsed-literal:: + + Note: you may need to restart the kernel to use updated packages. + Note: you may need to restart the kernel to use updated packages. + .. code:: ipython3 @@ -97,26 +104,36 @@ the person in each frame of the video. .. code:: ipython3 + from pathlib import Path + # directory where model will be downloaded base_model_dir = "model" # model name as named in Open Model Zoo model_name = "person-detection-0202" precision = "FP16" - model_path = f"model/intel/{model_name}/{precision}/{model_name}.xml" - download_command = f"omz_downloader " f"--name {model_name} " f"--precision {precision} " f"--output_dir {base_model_dir} " f"--cache_dir {base_model_dir}" - ! $download_command + model_path = Path("model") / f"{model_name}.xml" + + base_model_url = "https://storage.openvinotoolkit.org/repositories/open_model_zoo/2023.0/models_bin/1" + + if not Path(model_path).exists(): + utils.download_file(f"{base_model_url}/{model_name}/{precision}/{model_name}.xml", filename=model_path.name, directory=model_path.parent) + utils.download_file( + f"{base_model_url}/{model_name}/{precision}/{model_name}.bin", filename=model_path.name.replace(".xml", ".bin"), directory=model_path.parent + ) + .. 
parsed-literal:: - ################|| Downloading person-detection-0202 ||################ - - ========== Retrieving model/intel/person-detection-0202/FP16/person-detection-0202.xml from the cache - - ========== Retrieving model/intel/person-detection-0202/FP16/person-detection-0202.bin from the cache - - + person-detection-0202.xml: 0%| | 0.00/249k [00:00=4.30.2", "controlnet-aux>=0.0.6", "gradio>=3.36", + "datasets>=2.14.6", + "nncf>=2.7.0", + "opencv-python", "--extra-index-url", "https://download.pytorch.org/whl/cpu", ) - pip_install("openvino>=2023.1.0", "datasets>=2.14.6", "nncf>=2.7.0", "opencv-python") + pip_install("openvino>=2023.1.0") Instantiating Generation Pipeline --------------------------------- @@ -296,7 +299,7 @@ Now, let us check its result on example image: image_path = Path("example_image.jpg") if not image_path.exists(): - download_file(image_path, filename="example_image.jpg") + download_file(example_url, filename="example_image.jpg") img = Image.open(image_path) pose = pose_estimator(img) diff --git a/docs/notebooks/convert-to-openvino-with-output.rst b/docs/notebooks/convert-to-openvino-with-output.rst index 507dd407eae739..bc5a45f244e376 100644 --- a/docs/notebooks/convert-to-openvino-with-output.rst +++ b/docs/notebooks/convert-to-openvino-with-output.rst @@ -184,10 +184,10 @@ NLP model from Hugging Face and export it in ONNX format: .. parsed-literal:: - 2024-11-22 00:16:16.864961: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`. - 2024-11-22 00:16:16.903350: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations. + 2024-12-09 23:09:00.018226: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`. + 2024-12-09 23:09:00.080568: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations. To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags. - 2024-11-22 00:16:17.575066: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT + 2024-12-09 23:09:00.743048: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT .. parsed-literal:: @@ -660,7 +660,7 @@ frameworks conversion guides. .. parsed-literal:: - 2024-11-22 00:16:33.997234: W tensorflow/core/common_runtime/gpu/gpu_device.cc:1956] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform. + 2024-12-09 23:09:17.262024: W tensorflow/core/common_runtime/gpu/gpu_device.cc:1956] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. 
Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform. Skipping registering GPU devices... diff --git a/docs/notebooks/convnext-classification-with-output.rst b/docs/notebooks/convnext-classification-with-output.rst index 9466f30c22898e..1204ea2c17f106 100644 --- a/docs/notebooks/convnext-classification-with-output.rst +++ b/docs/notebooks/convnext-classification-with-output.rst @@ -192,7 +192,7 @@ And print results Predicted Class: 281 Predicted Label: n02123045 tabby, tabby cat - Predicted Probability: 0.5919997096061707 + Predicted Probability: 0.5351971983909607 Convert the model to OpenVINO Intermediate representation format diff --git a/docs/notebooks/cross-lingual-books-alignment-with-output.rst b/docs/notebooks/cross-lingual-books-alignment-with-output.rst index b116f0e1f5cda1..68f51ad137ff16 100644 --- a/docs/notebooks/cross-lingual-books-alignment-with-output.rst +++ b/docs/notebooks/cross-lingual-books-alignment-with-output.rst @@ -32,7 +32,7 @@ Prerequisites - ``requests`` - for getting books - ``pysbd`` - for splitting sentences -- ``transformers[torch]`` and ``openvino_dev`` - for getting sentence +- ``transformers[torch]`` and ``openvino`` - for getting sentence embeddings - ``seaborn`` - for alignment matrix visualization - ``ipywidgets`` - for displaying HTML and JS output in the notebook @@ -416,12 +416,12 @@ languages. It has the same architecture as the BERT model but has been trained on a different task: to produce identical embeddings for translation pairs. -|image01| +|image02| This makes LaBSE a great choice for our task and it can be reused for different language pairs still producing good results. -.. |image01| image:: https://github.com/openvinotoolkit/openvino_notebooks/assets/29454499/627d3a39-7076-479f-a7b1-392f49a0b83e +.. |image02| image:: https://github.com/openvinotoolkit/openvino_notebooks/assets/29454499/627d3a39-7076-479f-a7b1-392f49a0b83e .. code:: ipython3 diff --git a/docs/notebooks/ct-segmentation-quantize-nncf-with-output.rst b/docs/notebooks/ct-segmentation-quantize-nncf-with-output.rst index c3d645f1841a17..853da533385284 100644 --- a/docs/notebooks/ct-segmentation-quantize-nncf-with-output.rst +++ b/docs/notebooks/ct-segmentation-quantize-nncf-with-output.rst @@ -154,10 +154,10 @@ Imports .. parsed-literal:: - 2024-11-22 00:16:56.689204: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`. - 2024-11-22 00:16:56.724390: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations. + 2024-12-09 23:09:41.789833: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`. + 2024-12-09 23:09:41.824673: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations. To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags. 
- 2024-11-22 00:16:57.319913: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT + 2024-12-09 23:09:42.418712: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT .. parsed-literal:: @@ -213,7 +213,7 @@ notebook `__. .. parsed-literal:: - pretrained_model/unet_kits19_state_dict.pth: 0%| | 0.00/7.58M [00:00`__. .. parsed-literal:: - /tmp/ipykernel_3514722/1592321960.py:3: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature. + /tmp/ipykernel_2165966/1592321960.py:3: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature. state_dict = torch.load(state_dict_file, map_location=torch.device("cpu")) @@ -444,7 +444,7 @@ this notebook. .. parsed-literal:: [ WARNING ] Please fix your imports. Module %s has been moved to %s. The old module will be deleted in version %s. - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/monai/networks/nets/basic_unet.py:168: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/monai/networks/nets/basic_unet.py:168: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! if x_e.shape[-i - 1] != x_0.shape[-i - 1]: @@ -526,18 +526,18 @@ Convert quantized model to OpenVINO IR model and save it. .. 
parsed-literal:: - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/nncf/torch/quantization/layers.py:340: TracerWarning: Converting a tensor to a Python number might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/nncf/torch/quantization/layers.py:340: TracerWarning: Converting a tensor to a Python number might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! return self._level_low.item() - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/nncf/torch/quantization/layers.py:348: TracerWarning: Converting a tensor to a Python number might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/nncf/torch/quantization/layers.py:348: TracerWarning: Converting a tensor to a Python number might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! return self._level_high.item() - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/monai/networks/nets/basic_unet.py:168: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/monai/networks/nets/basic_unet.py:168: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! if x_e.shape[-i - 1] != x_0.shape[-i - 1]: - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/torch/jit/_trace.py:1303: TracerWarning: Output nr 1. of the traced function does not match the corresponding output of the Python function. Detailed error: + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/torch/jit/_trace.py:1303: TracerWarning: Output nr 1. of the traced function does not match the corresponding output of the Python function. Detailed error: Tensor-likes are not close! 
- Mismatched elements: 247214 / 262144 (94.3%) - Greatest absolute difference: 4.1846349239349365 at index (0, 0, 379, 430) (up to 1e-05 allowed) - Greatest relative difference: 15984.079041034269 at index (0, 0, 447, 390) (up to 1e-05 allowed) + Mismatched elements: 250458 / 262144 (95.5%) + Greatest absolute difference: 3.8674159049987793 at index (0, 0, 351, 76) (up to 1e-05 allowed) + Greatest relative difference: 12206.866810726728 at index (0, 0, 144, 31) (up to 1e-05 allowed) _check_trace( @@ -663,7 +663,7 @@ be run in the notebook with ``! benchmark_app`` or [ WARNING ] Performance hint was not explicitly specified in command line. Device(AUTO) performance hint will be set to PerformanceMode.LATENCY. [Step 4/11] Reading model files [ INFO ] Loading model files - [ INFO ] Read model took 8.99 ms + [ INFO ] Read model took 8.90 ms [ INFO ] Original model I/O parameters: [ INFO ] Model inputs: [ INFO ] x (node: x) : f32 / [...] / [1,1,512,512] @@ -677,7 +677,7 @@ be run in the notebook with ``! benchmark_app`` or [ INFO ] Model outputs: [ INFO ] ***NO_NAME*** (node: __module.final_conv/aten::_convolution/Add) : f32 / [...] / [1,1,512,512] [Step 7/11] Loading the model to the device - [ INFO ] Compile model took 240.78 ms + [ INFO ] Compile model took 264.91 ms [Step 8/11] Querying optimal runtime parameters [ INFO ] Model: [ INFO ] NETWORK_NAME: Model0 @@ -714,17 +714,17 @@ be run in the notebook with ``! benchmark_app`` or [ INFO ] Fill input 'x' with random values [Step 10/11] Measuring performance (Start inference synchronously, limits: 15000 ms duration) [ INFO ] Benchmarking in inference only mode (inputs filling are not included in measurement loop). - [ INFO ] First inference took 49.70 ms + [ INFO ] First inference took 48.49 ms [Step 11/11] Dumping statistics report [ INFO ] Execution Devices:['CPU'] - [ INFO ] Count: 425 iterations - [ INFO ] Duration: 15023.51 ms + [ INFO ] Count: 431 iterations + [ INFO ] Duration: 15002.34 ms [ INFO ] Latency: - [ INFO ] Median: 34.55 ms - [ INFO ] Average: 35.13 ms - [ INFO ] Min: 34.21 ms - [ INFO ] Max: 47.23 ms - [ INFO ] Throughput: 28.29 FPS + [ INFO ] Median: 34.52 ms + [ INFO ] Average: 34.59 ms + [ INFO ] Min: 34.20 ms + [ INFO ] Max: 36.19 ms + [ INFO ] Throughput: 28.73 FPS .. code:: ipython3 @@ -750,7 +750,7 @@ be run in the notebook with ``! benchmark_app`` or [ WARNING ] Performance hint was not explicitly specified in command line. Device(AUTO) performance hint will be set to PerformanceMode.LATENCY. [Step 4/11] Reading model files [ INFO ] Loading model files - [ INFO ] Read model took 11.10 ms + [ INFO ] Read model took 10.56 ms [ INFO ] Original model I/O parameters: [ INFO ] Model inputs: [ INFO ] x (node: x) : f32 / [...] / [1,1,512,512] @@ -764,7 +764,7 @@ be run in the notebook with ``! benchmark_app`` or [ INFO ] Model outputs: [ INFO ] ***NO_NAME*** (node: __module.final_conv/aten::_convolution/Add) : f32 / [...] / [1,1,512,512] [Step 7/11] Loading the model to the device - [ INFO ] Compile model took 251.41 ms + [ INFO ] Compile model took 248.98 ms [Step 8/11] Querying optimal runtime parameters [ INFO ] Model: [ INFO ] NETWORK_NAME: Model49 @@ -801,17 +801,17 @@ be run in the notebook with ``! benchmark_app`` or [ INFO ] Fill input 'x' with random values [Step 10/11] Measuring performance (Start inference synchronously, limits: 15000 ms duration) [ INFO ] Benchmarking in inference only mode (inputs filling are not included in measurement loop). 
- [ INFO ] First inference took 29.68 ms + [ INFO ] First inference took 29.18 ms [Step 11/11] Dumping statistics report [ INFO ] Execution Devices:['CPU'] - [ INFO ] Count: 911 iterations - [ INFO ] Duration: 15009.49 ms + [ INFO ] Count: 908 iterations + [ INFO ] Duration: 15011.20 ms [ INFO ] Latency: - [ INFO ] Median: 15.73 ms - [ INFO ] Average: 16.27 ms - [ INFO ] Min: 15.41 ms - [ INFO ] Max: 24.40 ms - [ INFO ] Throughput: 60.69 FPS + [ INFO ] Median: 15.48 ms + [ INFO ] Average: 16.33 ms + [ INFO ] Min: 15.15 ms + [ INFO ] Max: 28.31 ms + [ INFO ] Throughput: 60.49 FPS Visually Compare Inference Results @@ -905,7 +905,7 @@ seed is displayed to enable reproducing specific runs of this cell. .. parsed-literal:: - Visualizing results with seed 1732231099 + Visualizing results with seed 1733782265 @@ -989,7 +989,7 @@ performs inference, and displays the results on the frames loaded in .. parsed-literal:: Loaded model to AUTO in 0.15 seconds. - Total time for 68 frames: 2.31 seconds, fps:29.91 + Total time for 68 frames: 2.32 seconds, fps:29.70 References diff --git a/docs/notebooks/ct-segmentation-quantize-nncf-with-output_files/ct-segmentation-quantize-nncf-with-output_37_1.png b/docs/notebooks/ct-segmentation-quantize-nncf-with-output_files/ct-segmentation-quantize-nncf-with-output_37_1.png index a0c854d6dd33f6..fc10c642d8d2a1 100644 --- a/docs/notebooks/ct-segmentation-quantize-nncf-with-output_files/ct-segmentation-quantize-nncf-with-output_37_1.png +++ b/docs/notebooks/ct-segmentation-quantize-nncf-with-output_files/ct-segmentation-quantize-nncf-with-output_37_1.png @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:588fb52eb7dcf0ede69419b9645ad6dc93526e8960af83679e12bac98e6817f6 -size 385527 +oid sha256:52955890ed558e516a361399057b8529ffd5103a7b63ed20a2549062b4d900b5 +size 386283 diff --git a/docs/notebooks/ddcolor-image-colorization-with-output.rst b/docs/notebooks/ddcolor-image-colorization-with-output.rst index cd3bf024065b55..11b92fe4897f5e 100644 --- a/docs/notebooks/ddcolor-image-colorization-with-output.rst +++ b/docs/notebooks/ddcolor-image-colorization-with-output.rst @@ -135,7 +135,7 @@ Prerequisites .. parsed-literal:: - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/timm/models/layers/__init__.py:48: FutureWarning: Importing from timm.models.layers is deprecated, please import via timm.layers + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/timm/models/layers/__init__.py:48: FutureWarning: Importing from timm.models.layers is deprecated, please import via timm.layers warnings.warn(f"Importing from {__name__} is deprecated, please import via timm.layers", FutureWarning) @@ -416,10 +416,10 @@ Perform model quantization .. parsed-literal:: - 2024-11-22 00:20:47.511999: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`. - 2024-11-22 00:20:47.551328: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations. + 2024-12-09 23:13:28.920989: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. 
You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`. + 2024-12-09 23:13:28.960154: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations. To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags. - 2024-11-22 00:20:47.960841: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT + 2024-12-09 23:13:29.365051: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT @@ -527,7 +527,7 @@ Tool + @@ -314,13 +313,13 @@ loading on device using ``core.complie_model``. .. parsed-literal:: - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/notebooks/depth-anything/Depth-Anything-V2/depth_anything_v2/dinov2_layers/patch_embed.py:73: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/notebooks/depth-anything/Depth-Anything-V2/depth_anything_v2/dinov2_layers/patch_embed.py:73: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! assert H % patch_H == 0, f"Input image height {H} is not a multiple of patch height {patch_H}" - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/notebooks/depth-anything/Depth-Anything-V2/depth_anything_v2/dinov2_layers/patch_embed.py:74: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/notebooks/depth-anything/Depth-Anything-V2/depth_anything_v2/dinov2_layers/patch_embed.py:74: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! assert W % patch_W == 0, f"Input image width {W} is not a multiple of patch width: {patch_W}" - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/notebooks/depth-anything/Depth-Anything-V2/depth_anything_v2/dinov2.py:183: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! 
+ /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/notebooks/depth-anything/Depth-Anything-V2/depth_anything_v2/dinov2.py:183: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! if npatch == N and w == h: - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/notebooks/depth-anything/Depth-Anything-V2/depth_anything_v2/dpt.py:147: TracerWarning: Converting a tensor to a Python integer might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/notebooks/depth-anything/Depth-Anything-V2/depth_anything_v2/dpt.py:147: TracerWarning: Converting a tensor to a Python integer might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! out = F.interpolate(out, (int(patch_h * 14), int(patch_w * 14)), mode="bilinear", align_corners=True) @@ -412,7 +411,7 @@ range. .. parsed-literal:: - + @@ -429,11 +428,10 @@ Run inference on video VIDEO_FILE = "./Coco Walking in Berkeley.mp4" - if not Path(VIDEO_FILE).exists(): - download_file( - "https://storage.openvinotoolkit.org/repositories/openvino_notebooks/data/data/video/Coco%20Walking%20in%20Berkeley.mp4", - VIDEO_FILE, - ) + download_file( + "https://storage.openvinotoolkit.org/repositories/openvino_notebooks/data/data/video/Coco%20Walking%20in%20Berkeley.mp4", + VIDEO_FILE, + ) # Number of seconds of input video to process. Set `NUM_SECONDS` to 0 to process # the full video. @@ -636,7 +634,7 @@ Run inference on video .. parsed-literal:: - Processed 60 frames in 13.34 seconds. Total FPS (including video processing): 4.50.Inference FPS: 10.65 + Processed 60 frames in 13.15 seconds. Total FPS (including video processing): 4.56.Inference FPS: 10.69 Video saved to 'output/Coco Walking in Berkeley_depth_anything.mp4'. @@ -663,7 +661,7 @@ Run inference on video .. parsed-literal:: Showing video saved at - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/notebooks/depth-anything/output/Coco Walking in Berkeley_depth_anything.mp4 + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/notebooks/depth-anything/output/Coco Walking in Berkeley_depth_anything.mp4 If you cannot see the video in your browser, please click on the following link to download the video @@ -797,10 +795,10 @@ quantization code below may take some time. .. parsed-literal:: - 2024-11-22 00:29:02.540402: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`. 
- 2024-11-22 00:29:02.574640: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations. + 2024-12-09 23:21:25.394147: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`. + 2024-12-09 23:21:25.427427: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations. To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags. - 2024-11-22 00:29:03.160362: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT + 2024-12-09 23:21:26.001101: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT @@ -926,10 +924,10 @@ data. .. parsed-literal:: - Processed 60 frames in 12.91 seconds. Total FPS (including video processing): 4.65.Inference FPS: 12.59 + Processed 60 frames in 12.60 seconds. Total FPS (including video processing): 4.76.Inference FPS: 13.15 Video saved to 'output/Coco Walking in Berkeley_depth_anything_int8.mp4'. Showing video saved at - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/notebooks/depth-anything/output/Coco Walking in Berkeley_depth_anything.mp4 + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/notebooks/depth-anything/output/Coco Walking in Berkeley_depth_anything.mp4 If you cannot see the video in your browser, please click on the following link to download the video @@ -1009,9 +1007,9 @@ Tool =4.8.0 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from torch) (4.12.2) - Requirement already satisfied: sympy in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from torch) (1.13.3) - Requirement already satisfied: networkx in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from torch) (3.1) - Requirement already satisfied: jinja2 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from torch) (3.1.4) - Requirement already satisfied: fsspec in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from torch) (2024.9.0) - Requirement already satisfied: numpy in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from torchvision) (1.23.5) - Requirement already satisfied: pillow!=8.3.*,>=5.3.0 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from torchvision) (10.4.0) - Requirement already satisfied: MarkupSafe>=2.0 in 
/opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from jinja2->torch) (2.1.5) - Requirement already satisfied: mpmath<1.4,>=1.1.0 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from sympy->torch) (1.3.0) + Requirement already satisfied: torch in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (2.4.1+cpu) + Requirement already satisfied: torchvision in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (0.19.1+cpu) + Requirement already satisfied: opencv-python in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (4.10.0.84) + Requirement already satisfied: wheel in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (0.45.1) + Requirement already satisfied: filelock in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from torch) (3.16.1) + Requirement already satisfied: typing-extensions>=4.8.0 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from torch) (4.12.2) + Requirement already satisfied: sympy in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from torch) (1.13.3) + Requirement already satisfied: networkx in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from torch) (3.1) + Requirement already satisfied: jinja2 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from torch) (3.1.4) + Requirement already satisfied: fsspec in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from torch) (2024.9.0) + Requirement already satisfied: numpy in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from torchvision) (1.23.5) + Requirement already satisfied: pillow!=8.3.*,>=5.3.0 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from torchvision) (10.4.0) + Requirement already satisfied: MarkupSafe>=2.0 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from jinja2->torch) (2.1.5) + Requirement already satisfied: mpmath<1.4,>=1.1.0 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from sympy->torch) (1.3.0) Looking in indexes: https://pypi.org/simple, 
https://download.pytorch.org/whl/cpu Collecting git+https://github.com/facebookresearch/detectron2.git - Cloning https://github.com/facebookresearch/detectron2.git to /tmp/pip-req-build-4klmx21d + Cloning https://github.com/facebookresearch/detectron2.git to /tmp/pip-req-build-89enhchj .. parsed-literal:: - Running command git clone --filter=blob:none --quiet https://github.com/facebookresearch/detectron2.git /tmp/pip-req-build-4klmx21d + Running command git clone --filter=blob:none --quiet https://github.com/facebookresearch/detectron2.git /tmp/pip-req-build-89enhchj .. parsed-literal:: @@ -125,73 +125,73 @@ Install required packages for running model Resolved https://github.com/facebookresearch/detectron2.git to commit c69939aa85460e8135f40bce908a6cddaa73065f Preparing metadata (setup.py): started Preparing metadata (setup.py): finished with status 'done' - Requirement already satisfied: Pillow>=7.1 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from detectron2==0.6) (10.4.0) - Requirement already satisfied: black in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from detectron2==0.6) (24.3.0) - Requirement already satisfied: cloudpickle in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from detectron2==0.6) (3.1.0) - Requirement already satisfied: fvcore<0.1.6,>=0.1.5 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from detectron2==0.6) (0.1.5.post20221221) + Requirement already satisfied: Pillow>=7.1 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from detectron2==0.6) (10.4.0) + Requirement already satisfied: black in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from detectron2==0.6) (24.3.0) + Requirement already satisfied: cloudpickle in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from detectron2==0.6) (3.1.0) + Requirement already satisfied: fvcore<0.1.6,>=0.1.5 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from detectron2==0.6) (0.1.5.post20221221) Collecting hydra-core>=1.1 (from detectron2==0.6) Using cached hydra_core-1.3.2-py3-none-any.whl.metadata (5.5 kB) Collecting iopath<0.1.10,>=0.1.7 (from detectron2==0.6) Using cached https://download.pytorch.org/whl/iopath-0.1.9-py3-none-any.whl (27 kB) - Requirement already satisfied: matplotlib in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from detectron2==0.6) (3.7.5) + Requirement already satisfied: matplotlib in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from detectron2==0.6) (3.7.5) Collecting omegaconf<2.4,>=2.1 (from detectron2==0.6) Using cached 
omegaconf-2.3.0-py3-none-any.whl.metadata (3.9 kB) - Requirement already satisfied: packaging in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from detectron2==0.6) (24.2) - Requirement already satisfied: pycocotools>=2.0.2 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from detectron2==0.6) (2.0.7) - Requirement already satisfied: tabulate in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from detectron2==0.6) (0.9.0) - Requirement already satisfied: tensorboard in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from detectron2==0.6) (2.12.3) - Requirement already satisfied: termcolor>=1.1 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from detectron2==0.6) (2.4.0) - Requirement already satisfied: tqdm>4.29.0 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from detectron2==0.6) (4.67.0) - Requirement already satisfied: yacs>=0.1.8 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from detectron2==0.6) (0.1.8) - Requirement already satisfied: numpy in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from fvcore<0.1.6,>=0.1.5->detectron2==0.6) (1.23.5) - Requirement already satisfied: pyyaml>=5.1 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from fvcore<0.1.6,>=0.1.5->detectron2==0.6) (6.0.2) - Requirement already satisfied: antlr4-python3-runtime==4.9.* in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from hydra-core>=1.1->detectron2==0.6) (4.9.3) - Requirement already satisfied: importlib-resources in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from hydra-core>=1.1->detectron2==0.6) (6.4.5) - Requirement already satisfied: portalocker in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from iopath<0.1.10,>=0.1.7->detectron2==0.6) (3.0.0) - Requirement already satisfied: contourpy>=1.0.1 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from matplotlib->detectron2==0.6) (1.1.1) - Requirement already satisfied: cycler>=0.10 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from matplotlib->detectron2==0.6) (0.12.1) - Requirement already satisfied: fonttools>=4.22.0 in 
/opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from matplotlib->detectron2==0.6) (4.55.0) - Requirement already satisfied: kiwisolver>=1.0.1 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from matplotlib->detectron2==0.6) (1.4.7) - Requirement already satisfied: pyparsing>=2.3.1 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from matplotlib->detectron2==0.6) (3.1.4) - Requirement already satisfied: python-dateutil>=2.7 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from matplotlib->detectron2==0.6) (2.9.0.post0) - Requirement already satisfied: click>=8.0.0 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from black->detectron2==0.6) (8.1.7) - Requirement already satisfied: mypy-extensions>=0.4.3 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from black->detectron2==0.6) (1.0.0) - Requirement already satisfied: pathspec>=0.9.0 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from black->detectron2==0.6) (0.12.1) - Requirement already satisfied: platformdirs>=2 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from black->detectron2==0.6) (4.3.6) - Requirement already satisfied: tomli>=1.1.0 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from black->detectron2==0.6) (2.1.0) - Requirement already satisfied: typing-extensions>=4.0.1 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from black->detectron2==0.6) (4.12.2) - Requirement already satisfied: absl-py>=0.4 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from tensorboard->detectron2==0.6) (1.4.0) - Requirement already satisfied: grpcio>=1.48.2 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from tensorboard->detectron2==0.6) (1.68.0) - Requirement already satisfied: google-auth<3,>=1.6.3 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from tensorboard->detectron2==0.6) (2.36.0) - Requirement already satisfied: google-auth-oauthlib<1.1,>=0.5 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from tensorboard->detectron2==0.6) (1.0.0) - Requirement already satisfied: markdown>=2.6.8 in 
/opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from tensorboard->detectron2==0.6) (3.7) - Requirement already satisfied: protobuf>=3.19.6 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from tensorboard->detectron2==0.6) (3.20.3) - Requirement already satisfied: requests<3,>=2.21.0 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from tensorboard->detectron2==0.6) (2.32.3) - Requirement already satisfied: setuptools>=41.0.0 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from tensorboard->detectron2==0.6) (44.0.0) - Requirement already satisfied: tensorboard-data-server<0.8.0,>=0.7.0 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from tensorboard->detectron2==0.6) (0.7.2) - Requirement already satisfied: werkzeug>=1.0.1 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from tensorboard->detectron2==0.6) (3.0.6) - Requirement already satisfied: wheel>=0.26 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from tensorboard->detectron2==0.6) (0.45.0) - Requirement already satisfied: cachetools<6.0,>=2.0.0 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from google-auth<3,>=1.6.3->tensorboard->detectron2==0.6) (5.5.0) - Requirement already satisfied: pyasn1-modules>=0.2.1 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from google-auth<3,>=1.6.3->tensorboard->detectron2==0.6) (0.4.1) - Requirement already satisfied: rsa<5,>=3.1.4 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from google-auth<3,>=1.6.3->tensorboard->detectron2==0.6) (4.9) - Requirement already satisfied: requests-oauthlib>=0.7.0 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from google-auth-oauthlib<1.1,>=0.5->tensorboard->detectron2==0.6) (2.0.0) - Requirement already satisfied: zipp>=3.1.0 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from importlib-resources->hydra-core>=1.1->detectron2==0.6) (3.20.2) - Requirement already satisfied: importlib-metadata>=4.4 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from markdown>=2.6.8->tensorboard->detectron2==0.6) (8.5.0) - Requirement already satisfied: six>=1.5 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from 
python-dateutil>=2.7->matplotlib->detectron2==0.6) (1.16.0) - Requirement already satisfied: charset-normalizer<4,>=2 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from requests<3,>=2.21.0->tensorboard->detectron2==0.6) (3.4.0) - Requirement already satisfied: idna<4,>=2.5 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from requests<3,>=2.21.0->tensorboard->detectron2==0.6) (3.10) - Requirement already satisfied: urllib3<3,>=1.21.1 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from requests<3,>=2.21.0->tensorboard->detectron2==0.6) (2.2.3) - Requirement already satisfied: certifi>=2017.4.17 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from requests<3,>=2.21.0->tensorboard->detectron2==0.6) (2024.8.30) - Requirement already satisfied: MarkupSafe>=2.1.1 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from werkzeug>=1.0.1->tensorboard->detectron2==0.6) (2.1.5) - Requirement already satisfied: pyasn1<0.7.0,>=0.4.6 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from pyasn1-modules>=0.2.1->google-auth<3,>=1.6.3->tensorboard->detectron2==0.6) (0.6.1) - Requirement already satisfied: oauthlib>=3.0.0 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from requests-oauthlib>=0.7.0->google-auth-oauthlib<1.1,>=0.5->tensorboard->detectron2==0.6) (3.2.2) + Requirement already satisfied: packaging in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from detectron2==0.6) (24.2) + Requirement already satisfied: pycocotools>=2.0.2 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from detectron2==0.6) (2.0.7) + Requirement already satisfied: tabulate in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from detectron2==0.6) (0.9.0) + Requirement already satisfied: tensorboard in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from detectron2==0.6) (2.12.3) + Requirement already satisfied: termcolor>=1.1 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from detectron2==0.6) (2.4.0) + Requirement already satisfied: tqdm>4.29.0 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from detectron2==0.6) (4.67.1) + Requirement already satisfied: yacs>=0.1.8 in 
/opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from detectron2==0.6) (0.1.8) + Requirement already satisfied: numpy in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from fvcore<0.1.6,>=0.1.5->detectron2==0.6) (1.23.5) + Requirement already satisfied: pyyaml>=5.1 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from fvcore<0.1.6,>=0.1.5->detectron2==0.6) (6.0.2) + Requirement already satisfied: antlr4-python3-runtime==4.9.* in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from hydra-core>=1.1->detectron2==0.6) (4.9.3) + Requirement already satisfied: importlib-resources in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from hydra-core>=1.1->detectron2==0.6) (6.4.5) + Requirement already satisfied: portalocker in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from iopath<0.1.10,>=0.1.7->detectron2==0.6) (3.0.0) + Requirement already satisfied: contourpy>=1.0.1 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from matplotlib->detectron2==0.6) (1.1.1) + Requirement already satisfied: cycler>=0.10 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from matplotlib->detectron2==0.6) (0.12.1) + Requirement already satisfied: fonttools>=4.22.0 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from matplotlib->detectron2==0.6) (4.55.2) + Requirement already satisfied: kiwisolver>=1.0.1 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from matplotlib->detectron2==0.6) (1.4.7) + Requirement already satisfied: pyparsing>=2.3.1 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from matplotlib->detectron2==0.6) (3.1.4) + Requirement already satisfied: python-dateutil>=2.7 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from matplotlib->detectron2==0.6) (2.9.0.post0) + Requirement already satisfied: click>=8.0.0 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from black->detectron2==0.6) (8.1.7) + Requirement already satisfied: mypy-extensions>=0.4.3 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from black->detectron2==0.6) (1.0.0) + Requirement already satisfied: pathspec>=0.9.0 in 
/opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from black->detectron2==0.6) (0.12.1) + Requirement already satisfied: platformdirs>=2 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from black->detectron2==0.6) (4.3.6) + Requirement already satisfied: tomli>=1.1.0 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from black->detectron2==0.6) (2.2.1) + Requirement already satisfied: typing-extensions>=4.0.1 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from black->detectron2==0.6) (4.12.2) + Requirement already satisfied: absl-py>=0.4 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from tensorboard->detectron2==0.6) (1.4.0) + Requirement already satisfied: grpcio>=1.48.2 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from tensorboard->detectron2==0.6) (1.68.1) + Requirement already satisfied: google-auth<3,>=1.6.3 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from tensorboard->detectron2==0.6) (2.36.0) + Requirement already satisfied: google-auth-oauthlib<1.1,>=0.5 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from tensorboard->detectron2==0.6) (1.0.0) + Requirement already satisfied: markdown>=2.6.8 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from tensorboard->detectron2==0.6) (3.7) + Requirement already satisfied: protobuf>=3.19.6 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from tensorboard->detectron2==0.6) (3.20.3) + Requirement already satisfied: requests<3,>=2.21.0 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from tensorboard->detectron2==0.6) (2.32.3) + Requirement already satisfied: setuptools>=41.0.0 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from tensorboard->detectron2==0.6) (44.0.0) + Requirement already satisfied: tensorboard-data-server<0.8.0,>=0.7.0 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from tensorboard->detectron2==0.6) (0.7.2) + Requirement already satisfied: werkzeug>=1.0.1 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from tensorboard->detectron2==0.6) (3.0.6) + Requirement already satisfied: wheel>=0.26 in 
/opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from tensorboard->detectron2==0.6) (0.45.1) + Requirement already satisfied: cachetools<6.0,>=2.0.0 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from google-auth<3,>=1.6.3->tensorboard->detectron2==0.6) (5.5.0) + Requirement already satisfied: pyasn1-modules>=0.2.1 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from google-auth<3,>=1.6.3->tensorboard->detectron2==0.6) (0.4.1) + Requirement already satisfied: rsa<5,>=3.1.4 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from google-auth<3,>=1.6.3->tensorboard->detectron2==0.6) (4.9) + Requirement already satisfied: requests-oauthlib>=0.7.0 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from google-auth-oauthlib<1.1,>=0.5->tensorboard->detectron2==0.6) (2.0.0) + Requirement already satisfied: zipp>=3.1.0 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from importlib-resources->hydra-core>=1.1->detectron2==0.6) (3.20.2) + Requirement already satisfied: importlib-metadata>=4.4 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from markdown>=2.6.8->tensorboard->detectron2==0.6) (8.5.0) + Requirement already satisfied: six>=1.5 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from python-dateutil>=2.7->matplotlib->detectron2==0.6) (1.17.0) + Requirement already satisfied: charset-normalizer<4,>=2 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from requests<3,>=2.21.0->tensorboard->detectron2==0.6) (3.4.0) + Requirement already satisfied: idna<4,>=2.5 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from requests<3,>=2.21.0->tensorboard->detectron2==0.6) (3.10) + Requirement already satisfied: urllib3<3,>=1.21.1 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from requests<3,>=2.21.0->tensorboard->detectron2==0.6) (2.2.3) + Requirement already satisfied: certifi>=2017.4.17 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from requests<3,>=2.21.0->tensorboard->detectron2==0.6) (2024.8.30) + Requirement already satisfied: MarkupSafe>=2.1.1 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from werkzeug>=1.0.1->tensorboard->detectron2==0.6) (2.1.5) + Requirement already satisfied: pyasn1<0.7.0,>=0.4.6 in 
/opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from pyasn1-modules>=0.2.1->google-auth<3,>=1.6.3->tensorboard->detectron2==0.6) (0.6.1) + Requirement already satisfied: oauthlib>=3.0.0 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from requests-oauthlib>=0.7.0->google-auth-oauthlib<1.1,>=0.5->tensorboard->detectron2==0.6) (3.2.2) Using cached hydra_core-1.3.2-py3-none-any.whl (154 kB) Using cached omegaconf-2.3.0-py3-none-any.whl (79 kB) Building wheels for collected packages: detectron2 Building wheel for detectron2 (setup.py): started Building wheel for detectron2 (setup.py): finished with status 'done' - Created wheel for detectron2: filename=detectron2-0.6-cp38-cp38-linux_x86_64.whl size=8313367 sha256=4eb79589c47d522c993509a8f16dfbf494af0f494c6a73577d9d3668c1ee4a05 - Stored in directory: /tmp/pip-ephem-wheel-cache-mkdcktsx/wheels/19/ac/65/e48e5e4ec2702274d927c5a6efb75709b24014371d3bb778f2 + Created wheel for detectron2: filename=detectron2-0.6-cp38-cp38-linux_x86_64.whl size=8313251 sha256=a744a8ccf54176a60e63af7e14e6a7f431f5b19935a3c1260a7d39f7a7f84bc8 + Stored in directory: /tmp/pip-ephem-wheel-cache-cb2ga2gq/wheels/19/ac/65/e48e5e4ec2702274d927c5a6efb75709b24014371d3bb778f2 Successfully built detectron2 Installing collected packages: omegaconf, iopath, hydra-core, detectron2 Attempting uninstall: omegaconf @@ -203,10 +203,10 @@ Install required packages for running model Uninstalling iopath-0.1.10: Successfully uninstalled iopath-0.1.10 Successfully installed detectron2-0.6 hydra-core-1.3.2 iopath-0.1.9 omegaconf-2.3.0 - Requirement already satisfied: openvino>=2023.1.0 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (2024.4.0) - Requirement already satisfied: numpy<2.1.0,>=1.16.6 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from openvino>=2023.1.0) (1.23.5) - Requirement already satisfied: openvino-telemetry>=2023.2.1 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from openvino>=2023.1.0) (2024.5.0) - Requirement already satisfied: packaging in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from openvino>=2023.1.0) (24.2) + Requirement already satisfied: openvino>=2023.1.0 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (2024.4.0) + Requirement already satisfied: numpy<2.1.0,>=1.16.6 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from openvino>=2023.1.0) (1.23.5) + Requirement already satisfied: openvino-telemetry>=2023.2.1 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from openvino>=2023.1.0) (2024.5.0) + Requirement already satisfied: packaging in 
/opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from openvino>=2023.1.0) (24.2) Define helpers for PyTorch model initialization and conversion diff --git a/docs/notebooks/detectron2-to-openvino-with-output_files/detectron2-to-openvino-with-output_22_0.jpg b/docs/notebooks/detectron2-to-openvino-with-output_files/detectron2-to-openvino-with-output_22_0.jpg index 2c18ecdc61719a..9ffd8dbc558859 100644 --- a/docs/notebooks/detectron2-to-openvino-with-output_files/detectron2-to-openvino-with-output_22_0.jpg +++ b/docs/notebooks/detectron2-to-openvino-with-output_files/detectron2-to-openvino-with-output_22_0.jpg @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:edc1fd6c9bb94b1ff9dde163988de0d5635f35a9cb918138eb058de89fe36b6c -size 58029 +oid sha256:ec1aaa179217e234b7c93d22f9da2f1ac0281e5bf8e4271c4094c3d680793782 +size 58047 diff --git a/docs/notebooks/detectron2-to-openvino-with-output_files/detectron2-to-openvino-with-output_22_0.png b/docs/notebooks/detectron2-to-openvino-with-output_files/detectron2-to-openvino-with-output_22_0.png index 0890e13959d7b2..0c626a2f115cc3 100644 --- a/docs/notebooks/detectron2-to-openvino-with-output_files/detectron2-to-openvino-with-output_22_0.png +++ b/docs/notebooks/detectron2-to-openvino-with-output_files/detectron2-to-openvino-with-output_22_0.png @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b54cfa3647ce484120c2dac840789885273b1a61d0fdf6fd1fdb93e78753c114 -size 509016 +oid sha256:5ae2588579f79d5d3e23a9fd9870f28f3bd063b9166da901cd639f16f0f04fca +size 508747 diff --git a/docs/notebooks/detectron2-to-openvino-with-output_files/detectron2-to-openvino-with-output_32_0.jpg b/docs/notebooks/detectron2-to-openvino-with-output_files/detectron2-to-openvino-with-output_32_0.jpg index d2b1ec1ee92784..6063ffe4fca6ec 100644 --- a/docs/notebooks/detectron2-to-openvino-with-output_files/detectron2-to-openvino-with-output_32_0.jpg +++ b/docs/notebooks/detectron2-to-openvino-with-output_files/detectron2-to-openvino-with-output_32_0.jpg @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0ffdd1e786238678562e14aa201c2a602b1733bb7db8b1c175f7d86b3c011fa2 -size 54276 +oid sha256:d414af832026a73bf7d8a8165a202c499989ddbc4db0826e6e0ca1951b2b4605 +size 54234 diff --git a/docs/notebooks/detectron2-to-openvino-with-output_files/detectron2-to-openvino-with-output_32_0.png b/docs/notebooks/detectron2-to-openvino-with-output_files/detectron2-to-openvino-with-output_32_0.png index d970f117246904..656018b2fa8884 100644 --- a/docs/notebooks/detectron2-to-openvino-with-output_files/detectron2-to-openvino-with-output_32_0.png +++ b/docs/notebooks/detectron2-to-openvino-with-output_files/detectron2-to-openvino-with-output_32_0.png @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6b8a9ccae3ca190acfaa9ddaa9be7641e02edae972b15c49f21cf9a8de9ae454 -size 456077 +oid sha256:8aa8664400b8c51d604d23e669d6e44f766d4eb6b9958d38f4757d5e1cbefe88 +size 457666 diff --git a/docs/notebooks/dynamicrafter-animating-images-with-output.rst b/docs/notebooks/dynamicrafter-animating-images-with-output.rst index 13b4c9475f7092..fac46c9f3e6cad 100644 --- a/docs/notebooks/dynamicrafter-animating-images-with-output.rst +++ b/docs/notebooks/dynamicrafter-animating-images-with-output.rst @@ -151,6 +151,13 @@ Prerequisites %pip install -q "openvino>=2024.2.0" "nncf>=2.11.0" "datasets>=2.20.0" %pip install -q "gradio>=4.19" omegaconf 
einops pytorch_lightning kornia "open_clip_torch==2.22.0" transformers av opencv-python "torch==2.2.2" --extra-index-url https://download.pytorch.org/whl/cpu + +.. parsed-literal:: + + Note: you may need to restart the kernel to use updated packages. + Note: you may need to restart the kernel to use updated packages. + + .. code:: ipython3 from pathlib import Path @@ -168,6 +175,15 @@ Prerequisites ) open("notebook_utils.py", "w").write(r.text) + + + +.. parsed-literal:: + + 24624 + + + .. code:: ipython3 from cmd_helper import clone_repo @@ -175,6 +191,15 @@ Prerequisites clone_repo("https://github.com/Doubiiu/DynamiCrafter.git", "26e665cd6c174234238d2ded661e2e56f875d360") + + + +.. parsed-literal:: + + PosixPath('DynamiCrafter') + + + Load and run the original pipeline ---------------------------------- @@ -238,7 +263,7 @@ We will use model for 256x256 resolution as example. Also, models for hf_hub_download(repo_id=REPO_ID, filename="model.ckpt", local_dir="./checkpoints/dynamicrafter_256_v1/", local_dir_use_symlinks=False) ckpt_path = "checkpoints/dynamicrafter_256_v1/model.ckpt" - config_file = "dynamicrafter/configs/inference_256_v1.0.yaml" + config_file = "DynamiCrafter/configs/inference_256_v1.0.yaml" config = OmegaConf.load(config_file) model_config = config.pop("model", OmegaConf.create()) model_config["params"]["unet_config"]["params"]["use_checkpoint"] = False @@ -252,11 +277,56 @@ We will use model for 256x256 resolution as example. Also, models for model = download_model() +.. parsed-literal:: + + Note: switching to '26e665cd6c174234238d2ded661e2e56f875d360'. + + You are in 'detached HEAD' state. You can look around, make experimental + changes and commit them, and you can discard any commits you make in this + state without impacting any branches by switching back to a branch. + + If you want to create a new branch to retain commits you create, you may + do so (now or later) by using -c with the switch command. Example: + + git switch -c + + Or undo this operation with: + + git switch - + + Turn off this advice by setting config variable advice.detachedHead to false + + HEAD is now at 26e665c add dataset + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/huggingface_hub/file_download.py:1204: UserWarning: `local_dir_use_symlinks` parameter is deprecated and will be ignored. The process to download files to a local folder has been updated and do not rely on symlinks anymore. You only need to pass a destination folder as`local_dir`. + For more details, check out https://huggingface.co/docs/huggingface_hub/main/en/guides/download#download-files-to-local-folder. + warnings.warn( + + + +.. parsed-literal:: + + model.ckpt: 0%| | 0.00/10.4G [00:00>> model checkpoint loaded. - + Convert the model to OpenVINO IR -------------------------------- @@ -333,6 +403,17 @@ Convert CLIP text encoder del cond_stage_model gc.collect(); + +.. parsed-literal:: + + WARNING:tensorflow:Please fix your imports. Module tensorflow.python.training.tracking.base has been moved to tensorflow.python.trackable.base. The old module will be deleted in version 2.11. + + +.. parsed-literal:: + + WARNING:tensorflow:Please fix your imports. Module tensorflow.python.training.tracking.base has been moved to tensorflow.python.trackable.base. The old module will be deleted in version 2.11. + + Convert CLIP image encoder ~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -355,6 +436,49 @@ resolutions. del model.embedder gc.collect(); + +.. 
parsed-literal:: + + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/kornia/utils/image.py:226: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! + if input.numel() == 0: + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/kornia/geometry/transform/affwarp.py:573: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! + if size == input_size: + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/kornia/geometry/transform/affwarp.py:579: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! + antialias = antialias and (max(factors) > 1) + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/kornia/geometry/transform/affwarp.py:581: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! + if antialias: + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/kornia/geometry/transform/affwarp.py:584: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! + sigmas = (max((factors[0] - 1.0) / 2.0, 0.001), max((factors[1] - 1.0) / 2.0, 0.001)) + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/kornia/geometry/transform/affwarp.py:589: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! + ks = int(max(2.0 * 2 * sigmas[0], 3)), int(max(2.0 * 2 * sigmas[1], 3)) + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/kornia/geometry/transform/affwarp.py:589: TracerWarning: Converting a tensor to a Python integer might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! 
+ ks = int(max(2.0 * 2 * sigmas[0], 3)), int(max(2.0 * 2 * sigmas[1], 3)) + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/kornia/filters/gaussian.py:55: TracerWarning: torch.tensor results are registered as constants in the trace. You can safely ignore this warning if you use this function to create tensors out of constant variables that would be the same every time you call this function. In any other case, this might cause the trace to be incorrect. + sigma = tensor([sigma], device=input.device, dtype=input.dtype) + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/kornia/filters/gaussian.py:55: TracerWarning: Converting a tensor to a Python float might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! + sigma = tensor([sigma], device=input.device, dtype=input.dtype) + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/kornia/core/check.py:78: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! + if x_shape_to_check[i] != dim: + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/kornia/filters/kernels.py:92: TracerWarning: torch.tensor results are registered as constants in the trace. You can safely ignore this warning if you use this function to create tensors out of constant variables that would be the same every time you call this function. In any other case, this might cause the trace to be incorrect. + mean = tensor([[mean]], device=sigma.device, dtype=sigma.dtype) + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/kornia/enhance/normalize.py:101: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! + if len(mean.shape) == 0 or mean.shape[0] == 1: + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/kornia/enhance/normalize.py:103: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! + if len(std.shape) == 0 or std.shape[0] == 1: + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/kornia/enhance/normalize.py:107: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. 
This means that the trace might not generalize to other inputs! + if mean.shape and mean.shape[0] != 1: + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/kornia/enhance/normalize.py:108: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! + if mean.shape[0] != data.shape[1] and mean.shape[:2] != data.shape[:2]: + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/kornia/enhance/normalize.py:112: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! + if std.shape and std.shape[0] != 1: + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/kornia/enhance/normalize.py:113: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! + if std.shape[0] != data.shape[1] and std.shape[:2] != data.shape[:2]: + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/kornia/enhance/normalize.py:116: TracerWarning: torch.as_tensor results are registered as constants in the trace. You can safely ignore this warning if you use this function to create tensors out of constant variables that would be the same every time you call this function. In any other case, this might cause the trace to be incorrect. + mean = torch.as_tensor(mean, device=data.device, dtype=data.dtype) + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/kornia/enhance/normalize.py:117: TracerWarning: torch.as_tensor results are registered as constants in the trace. You can safely ignore this warning if you use this function to create tensors out of constant variables that would be the same every time you call this function. In any other case, this might cause the trace to be incorrect. + std = torch.as_tensor(std, device=data.device, dtype=data.dtype) + + Convert AE encoder ~~~~~~~~~~~~~~~~~~ @@ -377,6 +501,13 @@ Convert AE encoder del model.first_stage_model.encoder gc.collect(); + +.. parsed-literal:: + + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/notebooks/dynamicrafter-animating-images/DynamiCrafter/lvdm/modules/networks/ae_modules.py:67: TracerWarning: Converting a tensor to a Python integer might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! 
+ w_ = w_ * (int(c)**(-0.5)) + + Convert Diffusion U-Net model ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -414,6 +545,21 @@ Convert Diffusion U-Net model del model.model.diffusion_model gc.collect(); + +.. parsed-literal:: + + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/notebooks/dynamicrafter-animating-images/DynamiCrafter/lvdm/modules/networks/openaimodel3d.py:556: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! + if l_context == 77 + t*16: ## !!! HARD CODE here + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/notebooks/dynamicrafter-animating-images/DynamiCrafter/lvdm/modules/networks/openaimodel3d.py:205: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! + if batch_size: + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/notebooks/dynamicrafter-animating-images/DynamiCrafter/lvdm/modules/networks/openaimodel3d.py:232: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! + if self.use_temporal_conv and batch_size: + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/notebooks/dynamicrafter-animating-images/DynamiCrafter/lvdm/modules/networks/openaimodel3d.py:76: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! + assert x.shape[1] == self.channels + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/notebooks/dynamicrafter-animating-images/DynamiCrafter/lvdm/modules/networks/openaimodel3d.py:99: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! + assert x.shape[1] == self.channels + + Convert AE decoder ~~~~~~~~~~~~~~~~~~ @@ -785,7 +931,7 @@ Run OpenVINO pipeline inference .. code:: ipython3 - image_path = "dynamicrafter/prompts/256/art.png" + image_path = "DynamiCrafter/prompts/256/art.png" prompt = "man fishing in a boat at sunset" seed = 234 image = Image.open(image_path) @@ -797,15 +943,15 @@ Run OpenVINO pipeline inference .. parsed-literal:: Seed set to 234 - /tmp/ipykernel_971108/2451984876.py:25: UserWarning: The given NumPy array is not writable, and PyTorch does not support non-writable tensors. This means writing to this tensor will result in undefined behavior. You may want to copy the array to protect its data or make it writable before converting it to a tensor. 
This type of warning will be suppressed for the rest of this program. (Triggered internally at ../torch/csrc/utils/tensor_numpy.cpp:206.) + /tmp/ipykernel_2173449/2451984876.py:25: UserWarning: The given NumPy array is not writable, and PyTorch does not support non-writable tensors. This means writing to this tensor will result in undefined behavior. You may want to copy the array to protect its data or make it writable before converting it to a tensor. This type of warning will be suppressed for the rest of this program. (Triggered internally at ../torch/csrc/utils/tensor_numpy.cpp:206.) img_tensor = torch.from_numpy(image).permute(2, 0, 1).float().to(model.device) - + .. parsed-literal:: - start: man fishing in a boat at sunset 2024-08-06 13:54:24 - Saved in man_fishing_in_a_boat_at_sunset.mp4. Time used: 164.28 seconds - + start: man fishing in a boat at sunset 2024-12-09 23:46:36 + Saved in man_fishing_in_a_boat_at_sunset.mp4. Time used: 194.37 seconds + .. code:: ipython3 @@ -828,7 +974,7 @@ Run OpenVINO pipeline inference - + @@ -1000,6 +1146,19 @@ To collect intermediate model inputs for calibration we should customize 0%| | 0/300 [00:00>> model checkpoint loaded. - + .. code:: ipython3 %%skip not $to_quantize.value - image_path = "dynamicrafter/prompts/256/art.png" + image_path = "DynamiCrafter/prompts/256/art.png" prompt = "man fishing in a boat at sunset" seed = 234 image = Image.open(image_path) @@ -1317,13 +1375,13 @@ Let’s run the optimized pipeline .. parsed-literal:: Seed set to 234 - + .. parsed-literal:: - start: man fishing in a boat at sunset 2024-08-06 15:09:26 - Saved in man_fishing_in_a_boat_at_sunset.mp4. Time used: 81.47 seconds - + start: man fishing in a boat at sunset 2024-12-10 01:17:34 + Saved in man_fishing_in_a_boat_at_sunset.mp4. Time used: 98.80 seconds + .. code:: ipython3 @@ -1345,7 +1403,7 @@ Let’s run the optimized pipeline - + Compare model file sizes @@ -1373,7 +1431,7 @@ Compare model file sizes encoder_first_stage_ir compression rate: 3.986 embedder_ir compression rate: 3.977 model_ir compression rate: 3.981 - + Compare inference time of the FP32 and INT8 models ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -1427,10 +1485,10 @@ models, we use median inference time on calibration subset. .. parsed-literal:: - FP32 latency: 162.304 - INT8 latency: 79.590 - Performance speed up: 2.039 - + FP32 latency: 193.524 + INT8 latency: 97.073 + Performance speed up: 1.994 + Interactive inference --------------------- @@ -1454,6 +1512,15 @@ to launch the interactive demo. use_quantized_models + + + +.. parsed-literal:: + + Checkbox(value=True, description='Use quantized models') + + + .. code:: ipython3 from functools import partial @@ -1472,9 +1539,23 @@ to launch the interactive demo. demo = make_demo(fn=get_image_fn) try: - demo.queue().launch(debug=True) + demo.queue().launch(debug=False) except Exception: - demo.queue().launch(debug=True, share=True) + demo.queue().launch(debug=False, share=True) # if you are launching remotely, specify server_name and server_port # demo.launch(server_name='your server name', server_port='server port in int') # Read more in the docs: https://gradio.app/docs/ + + +.. parsed-literal:: + + Running on local URL: http://127.0.0.1:7860 + + To create a public link, set `share=True` in `launch()`. 
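For reference, a minimal sketch (not taken from the patch above) of how the same Gradio demo could be exposed on a remote host, following the notebook's own hint about ``server_name`` and ``server_port``; the bind address and port below are placeholder assumptions:

.. code:: ipython3

    # Hypothetical remote-launch variant of the cell shown above;
    # make_demo / get_image_fn are the notebook's own helpers, host/port are placeholders.
    demo = make_demo(fn=get_image_fn)
    try:
        # Bind to all interfaces so the demo is reachable from other machines.
        demo.queue().launch(server_name="0.0.0.0", server_port=7860, debug=False)
    except Exception:
        # Fall back to a temporary public link when direct binding is unavailable.
        demo.queue().launch(share=True, debug=False)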
+ + + + + + + diff --git a/docs/notebooks/efficient-sam-with-output.rst b/docs/notebooks/efficient-sam-with-output.rst index 2341db94e22f68..ce83a3675d1d8c 100644 --- a/docs/notebooks/efficient-sam-with-output.rst +++ b/docs/notebooks/efficient-sam-with-output.rst @@ -82,11 +82,22 @@ Prerequisites .. code:: ipython3 - %pip install -q "openvino>=2023.3.0" "nncf>=2.7.0" opencv-python "gradio>=4.13" "matplotlib>=3.4" torch torchvision tqdm --extra-index-url https://download.pytorch.org/whl/cpu + import platform + + %pip install -q "openvino>=2024.5.0" "nncf>=2.14.0" + %pip install -q "torch>=2.2.0" "torchaudio>=2.2.0" "torchvision>=0.17.0" --extra-index-url https://download.pytorch.org/whl/cpu + %pip install -q opencv-python "gradio>=4.13" "matplotlib>=3.4" tqdm + + if platform.system() == "Darwin": + %pip install -q "numpy<2.0.0" .. parsed-literal:: + ERROR: Could not find a version that satisfies the requirement openvino>=2024.5.0 (from versions: 2021.3.0, 2021.4.0, 2021.4.1, 2021.4.2, 2022.1.0, 2022.2.0, 2022.3.0, 2022.3.1, 2022.3.2, 2023.0.0.dev20230119, 2023.0.0.dev20230217, 2023.0.0.dev20230407, 2023.0.0.dev20230427, 2023.0.0, 2023.0.1, 2023.0.2, 2023.1.0.dev20230623, 2023.1.0.dev20230728, 2023.1.0.dev20230811, 2023.1.0, 2023.2.0.dev20230922, 2023.2.0, 2023.3.0, 2024.0.0, 2024.1.0, 2024.2.0, 2024.3.0, 2024.4.0, 2024.4.1.dev20240926) + ERROR: No matching distribution found for openvino>=2024.5.0 + Note: you may need to restart the kernel to use updated packages. + Note: you may need to restart the kernel to use updated packages. Note: you may need to restart the kernel to use updated packages. @@ -130,7 +141,7 @@ Prerequisites .. parsed-literal:: - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/notebooks/efficient-sam/EfficientSAM + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/notebooks/efficient-sam/EfficientSAM Load PyTorch model @@ -203,6 +214,13 @@ build PyTorch model pt_model.eval(); + +.. parsed-literal:: + + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/notebooks/efficient-sam/EfficientSAM/efficient_sam/efficient_sam.py:303: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature. + state_dict = torch.load(f, map_location="cpu") + + Run PyTorch model inference --------------------------- @@ -385,23 +403,23 @@ disk using ``openvino.save_model``. .. 
parsed-literal:: - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/notebooks/efficient-sam/EfficientSAM/efficient_sam/efficient_sam.py:220: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/notebooks/efficient-sam/EfficientSAM/efficient_sam/efficient_sam.py:220: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! if ( - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/notebooks/efficient-sam/EfficientSAM/efficient_sam/efficient_sam_encoder.py:241: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/notebooks/efficient-sam/EfficientSAM/efficient_sam/efficient_sam_encoder.py:241: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! assert ( - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/notebooks/efficient-sam/EfficientSAM/efficient_sam/efficient_sam_encoder.py:163: TracerWarning: Converting a tensor to a Python float might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/notebooks/efficient-sam/EfficientSAM/efficient_sam/efficient_sam_encoder.py:163: TracerWarning: Converting a tensor to a Python float might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! size = int(math.sqrt(xy_num)) - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/notebooks/efficient-sam/EfficientSAM/efficient_sam/efficient_sam_encoder.py:164: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! 
+ /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/notebooks/efficient-sam/EfficientSAM/efficient_sam/efficient_sam_encoder.py:164: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! assert size * size == xy_num - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/notebooks/efficient-sam/EfficientSAM/efficient_sam/efficient_sam_encoder.py:166: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/notebooks/efficient-sam/EfficientSAM/efficient_sam/efficient_sam_encoder.py:166: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! if size != h or size != w: - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/notebooks/efficient-sam/EfficientSAM/efficient_sam/efficient_sam_encoder.py:251: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/notebooks/efficient-sam/EfficientSAM/efficient_sam/efficient_sam_encoder.py:251: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! assert x.shape[2] == num_patches - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/notebooks/efficient-sam/EfficientSAM/efficient_sam/efficient_sam.py:85: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/notebooks/efficient-sam/EfficientSAM/efficient_sam/efficient_sam.py:85: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! 
if num_pts > self.decoder_max_num_input_points: - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/notebooks/efficient-sam/EfficientSAM/efficient_sam/efficient_sam.py:92: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/notebooks/efficient-sam/EfficientSAM/efficient_sam/efficient_sam.py:92: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! elif num_pts < self.decoder_max_num_input_points: - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/notebooks/efficient-sam/EfficientSAM/efficient_sam/efficient_sam.py:126: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/notebooks/efficient-sam/EfficientSAM/efficient_sam/efficient_sam.py:126: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! if output_w > 0 and output_h > 0: @@ -648,10 +666,10 @@ architecture type, we should specify ``transformer`` in ``model_type``. .. parsed-literal:: - 2024-11-22 00:51:57.265752: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`. - 2024-11-22 00:51:57.297997: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations. + 2024-12-10 01:35:21.740526: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`. + 2024-12-10 01:35:21.772231: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations. To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags. - 2024-11-22 00:51:57.938257: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT + 2024-12-10 01:35:22.412391: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT @@ -818,7 +836,7 @@ models, we use ``bencmark_app``. 
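The report below is produced by OpenVINO's ``benchmark_app``; as an illustrative sketch only (the IR path is a placeholder assumption), a run yielding this kind of output looks roughly like:

.. code:: ipython3

    # Hypothetical invocation: measure throughput of the exported IR for ~15 seconds on AUTO.
    !benchmark_app -m ov_model.xml -d AUTO -t 15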
[ WARNING ] Performance hint was not explicitly specified in command line. Device(AUTO) performance hint will be set to PerformanceMode.THROUGHPUT. [Step 4/11] Reading model files [ INFO ] Loading model files - [ INFO ] Read model took 29.71 ms + [ INFO ] Read model took 29.92 ms [ INFO ] Original model I/O parameters: [ INFO ] Model inputs: [ INFO ] batched_images (node: batched_images) : f32 / [...] / [?,?,?,?] @@ -838,7 +856,7 @@ models, we use ``bencmark_app``. [ INFO ] ***NO_NAME*** (node: aten::reshape/Reshape_3) : f32 / [...] / [?,?,?,?,?] [ INFO ] ***NO_NAME*** (node: aten::reshape/Reshape_2) : f32 / [...] / [?,?,?] [Step 7/11] Loading the model to the device - [ INFO ] Compile model took 1398.31 ms + [ INFO ] Compile model took 1396.28 ms [Step 8/11] Querying optimal runtime parameters [ INFO ] Model: [ INFO ] NETWORK_NAME: Model0 @@ -879,17 +897,17 @@ models, we use ``bencmark_app``. [ INFO ] Fill input 'batched_point_labels' with random values [Step 10/11] Measuring performance (Start inference asynchronously, 6 inference requests, limits: 15000 ms duration) [ INFO ] Benchmarking in full mode (inputs filling are included in measurement loop). - [ INFO ] First inference took 793.15 ms + [ INFO ] First inference took 850.98 ms [Step 11/11] Dumping statistics report [ INFO ] Execution Devices:['CPU'] - [ INFO ] Count: 55 iterations - [ INFO ] Duration: 17124.15 ms + [ INFO ] Count: 49 iterations + [ INFO ] Duration: 16117.80 ms [ INFO ] Latency: - [ INFO ] Median: 1829.15 ms - [ INFO ] Average: 1806.67 ms - [ INFO ] Min: 872.57 ms - [ INFO ] Max: 2037.03 ms - [ INFO ] Throughput: 3.21 FPS + [ INFO ] Median: 1890.12 ms + [ INFO ] Average: 1899.68 ms + [ INFO ] Min: 1013.52 ms + [ INFO ] Max: 2315.56 ms + [ INFO ] Throughput: 3.04 FPS .. code:: ipython3 @@ -915,7 +933,7 @@ models, we use ``bencmark_app``. [ WARNING ] Performance hint was not explicitly specified in command line. Device(AUTO) performance hint will be set to PerformanceMode.THROUGHPUT. [Step 4/11] Reading model files [ INFO ] Loading model files - [ INFO ] Read model took 43.85 ms + [ INFO ] Read model took 43.16 ms [ INFO ] Original model I/O parameters: [ INFO ] Model inputs: [ INFO ] batched_images (node: batched_images) : f32 / [...] / [?,?,?,?] @@ -935,7 +953,7 @@ models, we use ``bencmark_app``. [ INFO ] ***NO_NAME*** (node: aten::reshape/Reshape_3) : f32 / [...] / [?,?,?,?,?] [ INFO ] ***NO_NAME*** (node: aten::reshape/Reshape_2) : f32 / [...] / [?,?,?] [Step 7/11] Loading the model to the device - [ INFO ] Compile model took 1631.76 ms + [ INFO ] Compile model took 1639.65 ms [Step 8/11] Querying optimal runtime parameters [ INFO ] Model: [ INFO ] NETWORK_NAME: Model0 @@ -976,17 +994,17 @@ models, we use ``bencmark_app``. [ INFO ] Fill input 'batched_point_labels' with random values [Step 10/11] Measuring performance (Start inference asynchronously, 6 inference requests, limits: 15000 ms duration) [ INFO ] Benchmarking in full mode (inputs filling are included in measurement loop). 
- [ INFO ] First inference took 583.55 ms + [ INFO ] First inference took 586.73 ms [Step 11/11] Dumping statistics report [ INFO ] Execution Devices:['CPU'] - [ INFO ] Count: 56 iterations - [ INFO ] Duration: 16266.69 ms + [ INFO ] Count: 55 iterations + [ INFO ] Duration: 15880.08 ms [ INFO ] Latency: - [ INFO ] Median: 1710.59 ms - [ INFO ] Average: 1692.97 ms - [ INFO ] Min: 713.08 ms - [ INFO ] Max: 1952.47 ms - [ INFO ] Throughput: 3.44 FPS + [ INFO ] Median: 1710.19 ms + [ INFO ] Average: 1694.56 ms + [ INFO ] Min: 569.82 ms + [ INFO ] Max: 1827.81 ms + [ INFO ] Throughput: 3.46 FPS Interactive segmentation demo @@ -1316,7 +1334,7 @@ Interactive segmentation demo .. parsed-literal:: - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/notebooks/efficient-sam + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/notebooks/efficient-sam Running on local URL: http://127.0.0.1:7860 To create a public link, set `share=True` in `launch()`. diff --git a/docs/notebooks/efficient-sam-with-output_files/efficient-sam-with-output_17_1.png b/docs/notebooks/efficient-sam-with-output_files/efficient-sam-with-output_17_1.png index f9dfb53e3b8796..ee488196e09a35 100644 --- a/docs/notebooks/efficient-sam-with-output_files/efficient-sam-with-output_17_1.png +++ b/docs/notebooks/efficient-sam-with-output_files/efficient-sam-with-output_17_1.png @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:cffb9233e156bb558299a8c9bd3931dad6999f9bf7f358b208549949411460d1 -size 1259114 +oid sha256:c724c8a2e1ea229d28fc4828d1e0f8e3709b56e66b4568cd5c300123a6b6990b +size 1259642 diff --git a/docs/notebooks/efficient-sam-with-output_files/efficient-sam-with-output_25_1.png b/docs/notebooks/efficient-sam-with-output_files/efficient-sam-with-output_25_1.png index 108e6e0e4564e0..25a70458403cd0 100644 --- a/docs/notebooks/efficient-sam-with-output_files/efficient-sam-with-output_25_1.png +++ b/docs/notebooks/efficient-sam-with-output_files/efficient-sam-with-output_25_1.png @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5760726cd720e435c5d3a85315e772a741d583553996d8cfe7833f5d941e79f3 -size 1260778 +oid sha256:8086bb37d6a8400d681ce701a0ccd8aca10ef94cbb1d2fd387ae08f06e26342a +size 1262788 diff --git a/docs/notebooks/efficient-sam-with-output_files/efficient-sam-with-output_36_1.png b/docs/notebooks/efficient-sam-with-output_files/efficient-sam-with-output_36_1.png index c767ab3d6193bd..cb5a9e6e89c825 100644 --- a/docs/notebooks/efficient-sam-with-output_files/efficient-sam-with-output_36_1.png +++ b/docs/notebooks/efficient-sam-with-output_files/efficient-sam-with-output_36_1.png @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3909739937c5c50e2b26b3cba0b8b30e98e13fee3eab6c4f382735ec82ae9250 -size 1261525 +oid sha256:a18bb4842ab402d752631d693ed64876b58b8cd3cff35bbb3342ba67b35f2c30 +size 1260902 diff --git a/docs/notebooks/encodec-audio-compression-with-output.rst b/docs/notebooks/encodec-audio-compression-with-output.rst index 4cf2479f638656..4d10def61a4a57 100644 --- a/docs/notebooks/encodec-audio-compression-with-output.rst +++ b/docs/notebooks/encodec-audio-compression-with-output.rst @@ -72,8 +72,6 @@ Install required dependencies: .. parsed-literal:: - ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts. 
- torchvision 0.17.2+cpu requires torch==2.2.2, but you have torch 2.4.1+cpu which is incompatible. Note: you may need to restart the kernel to use updated packages. @@ -142,7 +140,7 @@ bandwidth. .. parsed-literal:: - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/torch/nn/utils/weight_norm.py:134: FutureWarning: `torch.nn.utils.weight_norm` is deprecated in favor of `torch.nn.utils.parametrizations.weight_norm`. + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/torch/nn/utils/weight_norm.py:134: FutureWarning: `torch.nn.utils.weight_norm` is deprecated in favor of `torch.nn.utils.parametrizations.weight_norm`. WeightNorm.apply(module, name, dim) @@ -302,7 +300,7 @@ similar as possible to the original. .. parsed-literal:: - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/torch/nn/utils/weight_norm.py:134: FutureWarning: `torch.nn.utils.weight_norm` is deprecated in favor of `torch.nn.utils.parametrizations.weight_norm`. + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/torch/nn/utils/weight_norm.py:134: FutureWarning: `torch.nn.utils.weight_norm` is deprecated in favor of `torch.nn.utils.parametrizations.weight_norm`. WeightNorm.apply(module, name, dim) @@ -402,13 +400,13 @@ with ``ov.save_model``. .. parsed-literal:: - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/encodec/modules/conv.py:60: TracerWarning: Converting a tensor to a Python float might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/encodec/modules/conv.py:60: TracerWarning: Converting a tensor to a Python float might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! ideal_length = (math.ceil(n_frames) - 1) * stride + (kernel_size - padding_total) - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/encodec/modules/conv.py:85: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/encodec/modules/conv.py:85: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! 
assert padding_left >= 0 and padding_right >= 0, (padding_left, padding_right) - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/encodec/modules/conv.py:87: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/encodec/modules/conv.py:87: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! max_pad = max(padding_left, padding_right) - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/encodec/modules/conv.py:89: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/encodec/modules/conv.py:89: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! if length <= max_pad: @@ -428,11 +426,11 @@ with ``ov.save_model``. .. parsed-literal:: - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/encodec/quantization/core_vq.py:358: TracerWarning: torch.tensor results are registered as constants in the trace. You can safely ignore this warning if you use this function to create tensors out of constant variables that would be the same every time you call this function. In any other case, this might cause the trace to be incorrect. + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/encodec/quantization/core_vq.py:358: TracerWarning: torch.tensor results are registered as constants in the trace. You can safely ignore this warning if you use this function to create tensors out of constant variables that would be the same every time you call this function. In any other case, this might cause the trace to be incorrect. quantized_out = torch.tensor(0.0, device=q_indices.device) - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/encodec/quantization/core_vq.py:359: TracerWarning: Iterating over a tensor might cause the trace to be incorrect. Passing a tensor of different shape won't change the number of iterations executed (and might lead to errors or silently give incorrect results). 
+ /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/encodec/quantization/core_vq.py:359: TracerWarning: Iterating over a tensor might cause the trace to be incorrect. Passing a tensor of different shape won't change the number of iterations executed (and might lead to errors or silently give incorrect results). for i, indices in enumerate(q_indices): - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/encodec/modules/conv.py:103: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/encodec/modules/conv.py:103: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! assert (padding_left + padding_right) <= x.shape[-1] diff --git a/docs/notebooks/fast-segment-anything-with-output.rst b/docs/notebooks/fast-segment-anything-with-output.rst index 9becf2719559bc..0071e2dca60e74 100644 --- a/docs/notebooks/fast-segment-anything-with-output.rst +++ b/docs/notebooks/fast-segment-anything-with-output.rst @@ -85,8 +85,6 @@ Install requirements .. parsed-literal:: - ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts. - torchaudio 2.4.1+cpu requires torch==2.4.1, but you have torch 2.2.2+cpu which is incompatible. Note: you may need to restart the kernel to use updated packages. Note: you may need to restart the kernel to use updated packages. Note: you may need to restart the kernel to use updated packages. @@ -158,7 +156,9 @@ model and generate a segmentation map. .. parsed-literal:: - 100%|██████████| 138M/138M [00:02<00:00, 48.9MB/s] + 100%|██████████| 138M/138M [00:03<00:00, 46.3MB/s] + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/ultralytics/nn/tasks.py:732: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature. 
+ ckpt = torch.load(file, map_location="cpu") @@ -170,8 +170,8 @@ model and generate a segmentation map. .. parsed-literal:: - image 1/1 /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/notebooks/fast-segment-anything/coco_bike.jpg: 768x1024 37 objects, 642.9ms - Speed: 3.9ms preprocess, 642.9ms inference, 771.9ms postprocess per image at shape (1, 3, 768, 1024) + image 1/1 /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/notebooks/fast-segment-anything/coco_bike.jpg: 768x1024 37 objects, 638.3ms + Speed: 3.4ms preprocess, 638.3ms inference, 500.4ms postprocess per image at shape (1, 3, 768, 1024) The model returns segmentation maps for all the objects on the image. @@ -209,15 +209,15 @@ tracing. The FastSAM model itself is based on YOLOv8 model. .. parsed-literal:: - Ultralytics YOLOv8.2.24 🚀 Python-3.8.10 torch-2.2.2+cpu CPU (Intel Core(TM) i9-10920X 3.50GHz) + Ultralytics YOLOv8.2.24 🚀 Python-3.8.10 torch-2.4.1+cpu CPU (Intel Core(TM) i9-10920X 3.50GHz) PyTorch: starting from 'FastSAM-x.pt' with input shape (1, 3, 1024, 1024) BCHW and output shape(s) ((1, 37, 21504), (1, 32, 256, 256)) (138.3 MB) OpenVINO: starting export with openvino 2024.4.0-16579-c3152d32c9c-releases/2024/4... - OpenVINO: export success ✅ 6.1s, saved as 'FastSAM-x_openvino_model/' (276.1 MB) + OpenVINO: export success ✅ 6.2s, saved as 'FastSAM-x_openvino_model/' (276.1 MB) - Export complete (9.1s) - Results saved to /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/notebooks/fast-segment-anything + Export complete (9.2s) + Results saved to /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/notebooks/fast-segment-anything Predict: yolo predict task=segment model=FastSAM-x_openvino_model imgsz=1024 Validate: yolo val task=segment model=FastSAM-x_openvino_model imgsz=1024 data=ultralytics/datasets/sa.yaml Visualize: https://netron.app @@ -321,8 +321,8 @@ pipeline. .. parsed-literal:: - image 1/1 /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/notebooks/fast-segment-anything/coco_bike.jpg: 1024x1024 42 objects, 494.2ms - Speed: 6.6ms preprocess, 494.2ms inference, 30.3ms postprocess per image at shape (1, 3, 1024, 1024) + image 1/1 /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/notebooks/fast-segment-anything/coco_bike.jpg: 1024x1024 42 objects, 498.5ms + Speed: 6.1ms preprocess, 498.5ms inference, 31.6ms postprocess per image at shape (1, 3, 1024, 1024) One can observe the converted model outputs in the next cell, they is @@ -521,6 +521,11 @@ repo <-with-output.html>`__. preset=nncf.QuantizationPreset.MIXED) +.. parsed-literal:: + + :7: TqdmExperimentalWarning: Using `tqdm.autonotebook.tqdm` in notebook mode. Use `tqdm.tqdm` instead to force console mode (e.g. in jupyter console) + + .. parsed-literal:: INFO:nncf:NNCF initialized successfully. Supported frameworks detected: torch, tensorflow, onnx, openvino @@ -615,8 +620,8 @@ calibration dataset to measure the performance. .. parsed-literal:: - Segmented in 72 seconds. - Resulting in 1.78 fps + Segmented in 68 seconds. + Resulting in 1.88 fps .. code:: ipython3 @@ -643,9 +648,9 @@ calibration dataset to measure the performance. .. 
parsed-literal:: - Segmented in 23 seconds - Resulting in 5.57 fps - That is 3.13 times faster! + Segmented in 21 seconds + Resulting in 6.1 fps + That is 3.24 times faster! Try out the converted pipeline diff --git a/docs/notebooks/florence2-with-output.rst b/docs/notebooks/florence2-with-output.rst index e4ab6fbcbd3a3b..7ec9ce6e6557ca 100644 --- a/docs/notebooks/florence2-with-output.rst +++ b/docs/notebooks/florence2-with-output.rst @@ -100,10 +100,10 @@ available model. By default, we will use .. parsed-literal:: - 2024-11-22 01:05:34.426758: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`. - 2024-11-22 01:05:34.462006: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations. + 2024-12-10 01:48:13.363088: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`. + 2024-12-10 01:48:13.396921: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations. To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags. - 2024-11-22 01:05:35.115966: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT + 2024-12-10 01:48:14.055295: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT @@ -193,31 +193,31 @@ pipeline. .. parsed-literal:: - config.json: 0%| | 0.00/2.43k [00:00 1 or self.sliding_window is not None: /opt/home/k8sworker/.cache/huggingface/modules/transformers_modules/chkpt/modeling_florence2.py:1205: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! is_causal = True if self.is_causal and attention_mask is None and tgt_len > 1 else False diff --git a/docs/notebooks/florence2-with-output_files/florence2-with-output_18_0.png b/docs/notebooks/florence2-with-output_files/florence2-with-output_18_0.png index c233468fe95f4e..0ffc56ebd94d65 100644 --- a/docs/notebooks/florence2-with-output_files/florence2-with-output_18_0.png +++ b/docs/notebooks/florence2-with-output_files/florence2-with-output_18_0.png @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d15ed97d6e50919caff2aee785bc4c90f91dcfcc9bb248f70e9d79bb203be64f -size 259663 +oid sha256:552934f1e05cf6d598ce249bb662530388c1f3335dc2a6af6c304825c8aa023a +size 259656 diff --git a/docs/notebooks/freevc-voice-conversion-with-output.rst b/docs/notebooks/freevc-voice-conversion-with-output.rst index eb1dffbcf5da08..69a935f4c4f78d 100644 --- a/docs/notebooks/freevc-voice-conversion-with-output.rst +++ b/docs/notebooks/freevc-voice-conversion-with-output.rst @@ -133,8 +133,8 @@ Install extra requirements Downloading... 
From: https://drive.google.com/uc?id=12-cB34qCTvByWT-QtOcZaqwwO21FLSqU&confirm=t&uuid=a703c43c-ccce-436c-8799-c11b88e9e7e4 - To: /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/notebooks/freevc-voice-conversion/FreeVC/wavlm/WavLM-Large.pt - 100%|██████████| 1.26G/1.26G [00:26<00:00, 47.5MB/s] + To: /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/notebooks/freevc-voice-conversion/FreeVC/wavlm/WavLM-Large.pt + 100%|██████████| 1.26G/1.26G [01:03<00:00, 19.9MB/s] .. code:: ipython3 @@ -153,7 +153,7 @@ Install extra requirements .. parsed-literal:: - checkpoints/freevc.pth: 0%| | 0.00/451M [00:00 - + Your browser does not support the audio element. diff --git a/docs/notebooks/glm-edge-v-with-output.rst b/docs/notebooks/glm-edge-v-with-output.rst new file mode 100644 index 00000000000000..2449d414d82594 --- /dev/null +++ b/docs/notebooks/glm-edge-v-with-output.rst @@ -0,0 +1,516 @@ +Visual-language assistant with GLM-Edge-V and OpenVINO +------------------------------------------------------ + +The +`GLM-Edge `__ +series is `Zhipu `__\ ’s attempt to meet +real-world deployment scenarios for edge devices. It consists of two +sizes of large language dialogue models and multimodal understanding +models (GLM-Edge-1.5B-Chat, GLM-Edge-4B-Chat, GLM-Edge-V-2B, +GLM-Edge-V-5B). Among them, the 1.5B / 2B models are mainly targeted at +platforms like mobile phones and car machines, while the 4B / 5B models +are aimed at platforms like PCs. Based on the technological advancements +of the GLM-4 series, some targeted adjustments have been made to the +model structure and size, balancing model performance, real-world +inference efficiency, and deployment convenience. Through deep +collaboration with partner enterprises and relentless efforts in +inference optimization, the GLM-Edge series models can run at extremely +high speeds on some edge platforms. + +In this tutorial we consider how to launch multimodal model GLM-Edge-V +using OpenVINO for creation multimodal chatbot. Additionally, we +optimize model to low precision using +`NNCF `__ + +**Table of contents:** + +- `Prerequisites <#prerequisites>`__ +- `Select Model <#select-model>`__ +- `Convert and Optimize model <#convert-and-optimize-model>`__ + + - `Compress model weights to + 4-bit <#compress-model-weights-to-4-bit>`__ + +- `Select inference device <#select-inference-device>`__ +- `Run OpenVINO model <#run-openvino-model>`__ +- `Interactive demo <#interactive-demo>`__ + +Installation Instructions +~~~~~~~~~~~~~~~~~~~~~~~~~ + +This is a self-contained example that relies solely on its own code. + +We recommend running the notebook in a virtual environment. You only +need a Jupyter server to start. For details, please refer to +`Installation +Guide `__. + +Prerequisites +------------- + + + +install required packages and setup helper functions. + +.. code:: ipython3 + + %pip install -q "torch>=2.1" "torchvision" "protobuf>=3.20" "gradio>=4.26" "Pillow" "accelerate" "tqdm" --extra-index-url https://download.pytorch.org/whl/cpu + %pip install -q "openvino>=2024.5.0" "nncf>=2.14.0" + + +.. parsed-literal:: + + Note: you may need to restart the kernel to use updated packages. 
+ ERROR: Could not find a version that satisfies the requirement openvino>=2024.5.0 (from versions: 2021.3.0, 2021.4.0, 2021.4.1, 2021.4.2, 2022.1.0, 2022.2.0, 2022.3.0, 2022.3.1, 2022.3.2, 2023.0.0.dev20230119, 2023.0.0.dev20230217, 2023.0.0.dev20230407, 2023.0.0.dev20230427, 2023.0.0, 2023.0.1, 2023.0.2, 2023.1.0.dev20230623, 2023.1.0.dev20230728, 2023.1.0.dev20230811, 2023.1.0, 2023.2.0.dev20230922, 2023.2.0, 2023.3.0, 2024.0.0, 2024.1.0, 2024.2.0, 2024.3.0, 2024.4.0, 2024.4.1.dev20240926) + ERROR: No matching distribution found for openvino>=2024.5.0 + Note: you may need to restart the kernel to use updated packages. + + +.. code:: ipython3 + + %pip install -q "git+https://github.com/huggingface/transformers" + + +.. parsed-literal:: + + error: subprocess-exited-with-error + + × Preparing metadata (pyproject.toml) did not run successfully. + │ exit code: 1 + ╰─> [6 lines of output] + + Cargo, the Rust package manager, is not installed or is not on PATH. + This package requires Rust and Cargo to compile extensions. Install it through + the system's package manager or via https://rustup.rs/ + + Checking for Rust toolchain.... + [end of output] + + note: This error originates from a subprocess, and is likely not a problem with pip. + error: metadata-generation-failed + + × Encountered error while generating package metadata. + ╰─> See above for output. + + note: This is an issue with the package mentioned above, not pip. + hint: See above for details. + Note: you may need to restart the kernel to use updated packages. + + +.. code:: ipython3 + + import requests + from pathlib import Path + + if not Path("glmv_helper.py").exists(): + r = requests.get(url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/notebooks/glm-edge-v/glmv_helper.py") + open("glmv_helper.py", "w").write(r.text) + + + if not Path("gradio_helper.py").exists(): + r = requests.get(url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/notebooks/glm-edge-v/gradio_helper.py") + open("gradio_helper.py", "w").write(r.text) + + if not Path("notebook_utils.py").exists(): + r = requests.get(url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/notebook_utils.py") + open("notebook_utils.py", "w").write(r.text) + +Select Model +------------ + + + +The tutorial supports the following models from GLM-Edge-V model family: + +- `glm-edge-v-2b `__ +- `glm-edge-v-5b `__ + +You can select one from the provided options below. + +.. code:: ipython3 + + import ipywidgets as widgets + + # Select model + model_ids = [ + "THUDM/glm-edge-v-2b", + "THUDM/glm-edge-v-5b", + ] + + model_dropdown = widgets.Dropdown( + options=model_ids, + value=model_ids[0], + description="Model:", + disabled=False, + ) + + model_dropdown + + + + +.. parsed-literal:: + + Dropdown(description='Model:', options=('THUDM/glm-edge-v-2b', 'THUDM/glm-edge-v-5b'), value='THUDM/glm-edge-v… + + + +Convert and Optimize model +-------------------------- + + + +GLM-Edge-V is PyTorch model. OpenVINO supports PyTorch models via +conversion to OpenVINO Intermediate Representation (IR). `OpenVINO model +conversion +API `__ +should be used for these purposes. ``ov.convert_model`` function accepts +original PyTorch model instance and example input for tracing and +returns ``ov.Model`` representing this model in OpenVINO framework. +Converted model can be used for saving on disk using ``ov.save_model`` +function or directly loading on device using ``core.complie_model``. 
+
+The script ``glmv_helper.py`` contains helper functions for model
+conversion; please check its content if you are interested in the
+conversion details.
+
+.. raw:: html
+
+
+Click here for more detailed explanation of conversion steps
+
+GLM-Edge-V is an autoregressive transformer generative model, which means
+that each generation step depends on the model output from the previous
+step. The generation approach is based on the assumption that the
+probability distribution of a word sequence can be decomposed into the
+product of conditional next-word distributions. In other words, the model
+predicts the next token in a loop, guided by previously generated tokens,
+until a stop condition is reached (the generated sequence reaches the
+maximum length or the end-of-string token is obtained). How the next
+token is selected from the predicted probabilities is driven by the
+chosen decoding methodology. You can find more information about the most
+popular decoding methods in this blog. The entry point for the generation
+process for models from the Hugging Face Transformers library is the
+``generate`` method. You can find more information about its parameters
+and configuration in the documentation. To preserve flexibility in
+selecting the decoding methodology, we convert only a single inference
+step of the model.
+
+The GLM-Edge-V model consists of 3 parts:
+
+- **Vision Model** for encoding input images into the embedding space.
+- **Embedding Model** for converting input text tokens into the embedding
+  space.
+- **Language Model** for generating the answer based on input embeddings
+  provided by the Image Encoder and Input Embedding models.
+
+.. raw:: html
+
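+As a rough, self-contained sketch of the conversion pattern described
+above (this is not the helper's actual code; the tiny module below is
+only a stand-in for one of the GLM-Edge-V parts, and the real logic lives
+in ``glmv_helper.convert_glmv_model``), a single part could be traced and
+saved like this:
+
+.. code:: ipython3
+
+    # Minimal sketch: trace a PyTorch module with an example input and
+    # save the result as OpenVINO IR. TinyVisionStub is a placeholder,
+    # not the real GLM-Edge-V vision encoder.
+    import openvino as ov
+    import torch
+
+
+    class TinyVisionStub(torch.nn.Module):
+        def forward(self, pixel_values):
+            # placeholder computation instead of the real vision tower
+            return pixel_values.mean(dim=(2, 3))
+
+
+    example_input = torch.zeros(1, 3, 224, 224)
+    ov_part = ov.convert_model(TinyVisionStub(), example_input=example_input)
+    ov.save_model(ov_part, "tiny_vision_stub.xml")
+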
+
+Compress model weights to 4-bit
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+To reduce memory consumption, weight compression optimization can be
+applied using `NNCF `__.
+
+.. raw:: html
+
+ +Click here for more details about weight compression Weight compression +aims to reduce the memory footprint of a model. It can also lead to +significant performance improvement for large memory-bound models, such +as Large Language Models (LLMs). LLMs and other models, which require +extensive memory to store the weights during inference, can benefit from +weight compression in the following ways: + +- enabling the inference of exceptionally large models that cannot be + accommodated in the memory of the device; + +- improving the inference performance of the models by reducing the + latency of the memory access when computing the operations with + weights, for example, Linear layers. + +`Neural Network Compression Framework +(NNCF) `__ provides 4-bit / +8-bit mixed weight quantization as a compression method primarily +designed to optimize LLMs. The main difference between weights +compression and full model quantization (post-training quantization) is +that activations remain floating-point in the case of weights +compression which leads to a better accuracy. Weight compression for +LLMs provides a solid inference performance improvement which is on par +with the performance of the full model quantization. In addition, weight +compression is data-free and does not require a calibration dataset, +making it easy to use. + +``nncf.compress_weights`` function can be used for performing weights +compression. The function accepts an OpenVINO model and other +compression parameters. Compared to INT8 compression, INT4 compression +improves performance even more, but introduces a minor drop in +prediction quality. + +More details about weights compression, can be found in `OpenVINO +documentation `__. + +.. raw:: html + +
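+As an illustration of the underlying API (the notebook itself passes an
+equivalent configuration to ``convert_glmv_model`` in the next cell
+instead of calling NNCF directly, and the IR path below is hypothetical),
+weight compression of an already converted model could look like this:
+
+.. code:: ipython3
+
+    # Sketch only: compress the weights of a converted OpenVINO model.
+    import nncf
+    import openvino as ov
+
+    core = ov.Core()
+    # hypothetical path to an FP16 IR produced earlier with ov.save_model
+    ov_model = core.read_model("model/language_model_fp16.xml")
+    compressed_model = nncf.compress_weights(
+        ov_model,
+        mode=nncf.CompressWeightsMode.INT4_SYM,
+        group_size=64,
+        ratio=0.6,  # share of weights compressed to 4 bits; the rest stay in 8 bits
+    )
+    ov.save_model(compressed_model, "model/language_model_int4.xml")
+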
+ +.. code:: ipython3 + + from pathlib import Path + import nncf + from glmv_helper import convert_glmv_model + + + model_id = model_dropdown.value + out_dir = Path("model") / Path(model_id).name / "INT4" + compression_configuration = { + "mode": nncf.CompressWeightsMode.INT4_SYM, + "group_size": 64, + "ratio": 0.6, + } + convert_glmv_model(model_id, out_dir, compression_configuration) + + +.. parsed-literal:: + + INFO:nncf:NNCF initialized successfully. Supported frameworks detected: torch, tensorflow, onnx, openvino + + +.. parsed-literal:: + + 2024-12-10 01:51:54.756921: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`. + 2024-12-10 01:51:54.790860: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations. + To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags. + 2024-12-10 01:51:55.339388: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT + + +.. parsed-literal:: + + ⌛ glm-edge-v-2b conversion started. Be patient, it may takes some time. + ⌛ Load Original model + ✅ Original model successfully loaded + ⌛ Convert Input embedding model + WARNING:tensorflow:Please fix your imports. Module tensorflow.python.training.tracking.base has been moved to tensorflow.python.trackable.base. The old module will be deleted in version 2.11. + + +.. parsed-literal:: + + [ WARNING ] Please fix your imports. Module %s has been moved to %s. The old module will be deleted in version %s. + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/transformers/modeling_utils.py:5006: FutureWarning: `_is_quantized_training_enabled` is going to be deprecated in transformers 4.39.0. Please use `model.hf_quantizer.is_trainable` instead + warnings.warn( + `loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`. + + +.. parsed-literal:: + + ✅ Input embedding model successfully converted + ⌛ Convert Image embedding model + + +.. parsed-literal:: + + /opt/home/k8sworker/.cache/huggingface/modules/transformers_modules/THUDM/glm-edge-v-2b/30c2bc691c9d46433abfd450e04441458d503f34/siglip.py:48: TracerWarning: Converting a tensor to a Python integer might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! + grid_size = int(s**0.5) + /opt/home/k8sworker/.cache/huggingface/modules/transformers_modules/THUDM/glm-edge-v-2b/30c2bc691c9d46433abfd450e04441458d503f34/siglip.py:53: TracerWarning: Using len to get tensor shape might cause the trace to be incorrect. Recommended usage would be tensor.shape[0]. Passing a tensor of different shape might lead to errors or silently give incorrect results. + image_emb = torch.cat([self.boi.repeat(len(image_emb), 1, 1), image_emb, self.eoi.repeat(len(image_emb), 1, 1)], dim=1) + + +.. parsed-literal:: + + ✅ Image embedding model successfully converted + ⌛ Convert Language model + + +.. 
parsed-literal:: + + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/transformers/cache_utils.py:458: TracerWarning: Using len to get tensor shape might cause the trace to be incorrect. Recommended usage would be tensor.shape[0]. Passing a tensor of different shape might lead to errors or silently give incorrect results. + or len(self.key_cache[layer_idx]) == 0 # the layer has no cache + /opt/home/k8sworker/.cache/huggingface/modules/transformers_modules/THUDM/glm-edge-v-2b/30c2bc691c9d46433abfd450e04441458d503f34/modeling_glm.py:995: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! + if sequence_length != 1: + /opt/home/k8sworker/.cache/huggingface/modules/transformers_modules/THUDM/glm-edge-v-2b/30c2bc691c9d46433abfd450e04441458d503f34/modeling_glm.py:153: TracerWarning: Converting a tensor to a Python integer might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! + rotary_dim = int(q.shape[-1] * partial_rotary_factor) + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/transformers/cache_utils.py:443: TracerWarning: Using len to get tensor shape might cause the trace to be incorrect. Recommended usage would be tensor.shape[0]. Passing a tensor of different shape might lead to errors or silently give incorrect results. + elif len(self.key_cache[layer_idx]) == 0: # fills previously skipped layers; checking for tensor causes errors + /opt/home/k8sworker/.cache/huggingface/modules/transformers_modules/THUDM/glm-edge-v-2b/30c2bc691c9d46433abfd450e04441458d503f34/modeling_glm.py:249: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! + if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim): + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/torch/jit/_trace.py:168: UserWarning: The .grad attribute of a Tensor that is not a leaf Tensor is being accessed. Its .grad attribute won't be populated during autograd.backward(). If you indeed want the .grad field to be populated for a non-leaf Tensor, use .retain_grad() on the non-leaf Tensor. If you access the non-leaf Tensor by mistake, make sure you access the leaf Tensor instead. See github.com/pytorch/pytorch/pull/30531 for more informations. (Triggered internally at aten/src/ATen/core/TensorBody.h:489.) + if a.grad is not None: + + +.. parsed-literal:: + + ✅ Language model successfully converted + ⌛ Weights compression with int4_sym mode started + + + +.. parsed-literal:: + + Output() + + + + + + + + + +.. 
parsed-literal:: + + INFO:nncf:Statistics of the bitwidth distribution: + ┍━━━━━━━━━━━━━━━━┯━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┯━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┑ + │ Num bits (N) │ % all parameters (layers) │ % ratio-defining parameters (layers) │ + ┝━━━━━━━━━━━━━━━━┿━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┿━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┥ + │ 8 │ 45% (115 / 169) │ 40% (114 / 168) │ + ├────────────────┼─────────────────────────────┼────────────────────────────────────────┤ + │ 4 │ 55% (54 / 169) │ 60% (54 / 168) │ + ┕━━━━━━━━━━━━━━━━┷━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┷━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┙ + + + +.. parsed-literal:: + + Output() + + + + + + + + + +.. parsed-literal:: + + ✅ Weights compression finished + ✅ glm-edge-v-2b model conversion finished. You can find results in model/glm-edge-v-2b/INT4 + + +Select inference device +----------------------- + + + +.. code:: ipython3 + + from notebook_utils import device_widget + + device = device_widget(default="AUTO", exclude=["NPU"]) + + device + + + + +.. parsed-literal:: + + Dropdown(description='Device:', index=1, options=('CPU', 'AUTO'), value='AUTO') + + + +Run OpenVINO model +------------------ + + + +``OvGLMv`` class provides convenient way for running model. It accepts +directory with converted model and inference device as arguments. For +running model we will use ``generate`` method. + +.. code:: ipython3 + + from glmv_helper import OvGLMv + + model = OvGLMv(out_dir, device.value) + +.. code:: ipython3 + + import requests + from PIL import Image + + url = "https://github.com/openvinotoolkit/openvino_notebooks/assets/29454499/d5fbbd1a-d484-415c-88cb-9986625b7b11" + image = Image.open(requests.get(url, stream=True).raw) + + query = "Please describe this picture" + + print(f"Question:\n {query}") + image + + +.. parsed-literal:: + + Question: + Please describe this picture + + + + +.. image:: glm-edge-v-with-output_files/glm-edge-v-with-output_13_1.png + + + +.. code:: ipython3 + + from transformers import TextStreamer, AutoImageProcessor, AutoTokenizer + import torch + + messages = [{"role": "user", "content": [{"type": "image"}, {"type": "text", "text": query}]}] + + processor = AutoImageProcessor.from_pretrained(out_dir, trust_remote_code=True) + tokenizer = AutoTokenizer.from_pretrained(out_dir, trust_remote_code=True) + inputs = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_dict=True, tokenize=True, return_tensors="pt").to("cpu") + generate_kwargs = { + **inputs, + "pixel_values": torch.tensor(processor(image).pixel_values).to("cpu"), + "max_new_tokens": 100, + "do_sample": True, + "top_k": 20, + "streamer": TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True), + } + + print("Answer:") + output = model.generate(**generate_kwargs) + + +.. parsed-literal:: + + Answer: + The image depicts a cat resting inside a cardboard box placed on a soft carpeted floor. The cat is lying with its head towards the bottom of the box, and its front paws are stretched out with the right one slightly forward, while its back and hind legs are positioned in the box. The box appears to be in partial disassembly, with the flaps folded down and one side raised slightly off the ground. The cat's fur is well-groomed and + + +Interactive demo +---------------- + + + +.. 
code:: ipython3 + + from gradio_helper import make_demo + + demo = make_demo(model, processor, tokenizer) + + try: + demo.launch(debug=False, height=600) + except Exception: + demo.launch(debug=False, share=True, height=600) + # if you are launching remotely, specify server_name and server_port + # demo.launch(server_name='your server name', server_port='server port in int') + # Read more in the docs: https://gradio.app/docs/ + + +.. parsed-literal:: + + Running on local URL: http://127.0.0.1:7860 + + To create a public link, set `share=True` in `launch()`. + + + + + + + diff --git a/docs/notebooks/glm-edge-v-with-output_files/glm-edge-v-with-output_13_1.jpg b/docs/notebooks/glm-edge-v-with-output_files/glm-edge-v-with-output_13_1.jpg new file mode 100644 index 00000000000000..c6aeec77cd3cb2 --- /dev/null +++ b/docs/notebooks/glm-edge-v-with-output_files/glm-edge-v-with-output_13_1.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5fc0d22d75f23474fb4f8aec8c0bf0fdf5d9377f3379e82a3887003e6da47e7e +size 60425 diff --git a/docs/notebooks/glm-edge-v-with-output_files/glm-edge-v-with-output_13_1.png b/docs/notebooks/glm-edge-v-with-output_files/glm-edge-v-with-output_13_1.png new file mode 100644 index 00000000000000..c6673a757ab5dc --- /dev/null +++ b/docs/notebooks/glm-edge-v-with-output_files/glm-edge-v-with-output_13_1.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c715d8adee4bf7519690de20b57ef2edaa2f914c86a64d107f99a919dcdad218 +size 854224 diff --git a/docs/notebooks/grounded-segment-anything-with-output.rst b/docs/notebooks/grounded-segment-anything-with-output.rst index a51ce8249239f9..6449fb1a6a9507 100644 --- a/docs/notebooks/grounded-segment-anything-with-output.rst +++ b/docs/notebooks/grounded-segment-anything-with-output.rst @@ -201,7 +201,7 @@ Download checkpoints and load PyTorch models .. parsed-literal:: - checkpoints/groundingdino_swint_ogc.pth: 0%| | 0.00/662M [00:00 + @@ -215,7 +215,7 @@ Do Inference .. parsed-literal:: - + diff --git a/docs/notebooks/hello-segmentation-with-output_files/hello-segmentation-with-output_11_1.png b/docs/notebooks/hello-segmentation-with-output_files/hello-segmentation-with-output_11_1.png new file mode 100644 index 00000000000000..3677caabff4380 --- /dev/null +++ b/docs/notebooks/hello-segmentation-with-output_files/hello-segmentation-with-output_11_1.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:76113c575caa9c8a8aca45d3ec6ebd7a4b513dadffd8e9e63861a7a041d7e5de +size 249032 diff --git a/docs/notebooks/hello-world-with-output.rst b/docs/notebooks/hello-world-with-output.rst index 5bd1216db29701..94d6dca5798876 100644 --- a/docs/notebooks/hello-world-with-output.rst +++ b/docs/notebooks/hello-world-with-output.rst @@ -98,13 +98,13 @@ Download the Model and data samples .. parsed-literal:: - artifacts/v3-small_224_1.0_float.xml: 0%| | 0.00/294k [00:00=4.33.0" "torch>=2.1.0" %pip install -q ipywidgets - %pip install -q "openvino>=2023.1.0" + %pip install -q "openvino>=2023.1.0" "Pillow" .. parsed-literal:: @@ -132,10 +132,10 @@ tutorials `__. from optimum.intel.openvino import OVModelForSequenceClassification - -.. parsed-literal:: - - huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks... 
- To disable this warning, you can either: - - Avoid using `tokenizers` before the fork if possible - - Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false) - - Initialize and Convert the Model Automatically using OVModel class ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -436,7 +427,7 @@ Full list of supported arguments available via ``--help`` .. parsed-literal:: - 2024-11-22 01:15:03.858078: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT + 2024-12-10 01:57:20.152345: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT usage: optimum-cli export openvino [-h] -m MODEL [--task TASK] [--framework {pt,tf}] [--trust-remote-code] [--weight-format {fp32,fp16,int8,int4,mxfp4,nf4}] @@ -445,8 +436,10 @@ Full list of supported arguments available via ``--help`` [--pad-token-id PAD_TOKEN_ID] [--ratio RATIO] [--sym] [--group-size GROUP_SIZE] + [--backup-precision {none,int8_sym,int8_asym}] [--dataset DATASET] [--all-layers] [--awq] [--scale-estimation] [--gptq] + [--lora-correction] [--sensitivity-metric SENSITIVITY_METRIC] [--num-samples NUM_SAMPLES] [--disable-stateful] @@ -467,20 +460,20 @@ Full list of supported arguments available via ``--help`` --task TASK The task to export the model for. If not specified, the task will be auto-inferred based on the model. Available tasks depend on the model, but are among: - ['audio-xvector', 'image-text-to-text', 'mask- - generation', 'text-generation', 'masked-im', 'image- - classification', 'token-classification', 'question- - answering', 'automatic-speech-recognition', 'multiple- - choice', 'image-segmentation', 'semantic- - segmentation', 'text2text-generation', 'feature- - extraction', 'image-to-text', 'text-to-audio', 'text- - to-image', 'zero-shot-object-detection', 'inpainting', - 'zero-shot-image-classification', 'object-detection', - 'text-classification', 'image-to-image', 'sentence- - similarity', 'audio-frame-classification', 'depth- - estimation', 'audio-classification', 'fill-mask']. For - decoder models, use `xxx-with-past` to export the - model using past key values in the decoder. + ['zero-shot-object-detection', 'multiple-choice', + 'audio-xvector', 'masked-im', 'text2text-generation', + 'inpainting', 'image-segmentation', 'semantic- + segmentation', 'question-answering', 'token- + classification', 'audio-frame-classification', + 'feature-extraction', 'text-to-audio', 'image-to- + image', 'fill-mask', 'automatic-speech-recognition', + 'image-classification', 'text-classification', 'zero- + shot-image-classification', 'object-detection', + 'image-to-text', 'audio-classification', 'sentence- + similarity', 'depth-estimation', 'text-to-image', + 'mask-generation', 'text-generation']. For decoder + models, use `xxx-with-past` to export the model using + past key values in the decoder. --framework {pt,tf} The framework to use for the export. If not provided, will attempt to use the local checkpoint's original framework or what is available in the environment. @@ -514,12 +507,27 @@ Full list of supported arguments available via ``--help`` --group-size GROUP_SIZE The group size to use for quantization. Recommended value is 128 and -1 uses per-column quantization. + --backup-precision {none,int8_sym,int8_asym} + Defines a backup precision for mixed-precision weight + compression. Only valid for int4 weight format. If not + provided, backup precision is int8_asym. 
'none' stands + for original floating-point precision of the model + weights, in this case weights are retained in their + original precision without any quantization. + 'int8_sym' stands for 8-bit integer symmetric + quantization without zero point. 'int8_asym' stands + for 8-bit integer asymmetric quantization with zero + points per each quantization group. --dataset DATASET The dataset used for data-aware compression or - quantization with NNCF. You can use the one from the - list ['wikitext2','c4','c4-new'] for language models - or ['conceptual_captions','laion/220k-GPT4Vision- - captions-from-LIVIS','laion/filtered-wit'] for - diffusion models. + quantization with NNCF. For language models you can + use the one from the list + ['auto','wikitext2','c4','c4-new']. With 'auto' the + dataset will be collected from model's generations. + For diffusion models it should be on of + ['conceptual_captions','laion/220k-GPT4Vision- + captions-from-LIVIS','laion/filtered-wit']. For visual + language models the dataset must be set to + 'contextual'. --all-layers Whether embeddings and last MatMul layers should be compressed to INT4. If not provided an weight compression is applied, they are compressed to INT8. @@ -527,7 +535,7 @@ Full list of supported arguments available via ``--help`` generation quality of INT4-compressed LLMs, but requires additional time for tuning weights on a calibration dataset. To run AWQ, please also provide a - dataset argument. Note: it's possible that there will + dataset argument. Note: it is possible that there will be no matching patterns in the model to apply AWQ, in such case it will be skipped. --scale-estimation Indicates whether to apply a scale estimation @@ -541,9 +549,15 @@ Full list of supported arguments available via ``--help`` to minimize the difference between activations of a compressed and original layer. Please note, that applying GPTQ takes additional memory and time. + --lora-correction Indicates whether to apply LoRA Correction algorithm. + When enabled, this algorithm introduces low-rank + adaptation layers in the model that can recover + accuracy after weight compression at some cost of + inference latency. Please note, that applying LoRA + Correction algorithm takes additional memory and time. --sensitivity-metric SENSITIVITY_METRIC The sensitivity metric for assigning quantization - precision to layers. Can be one of the following: + precision to layers. It can be one of the following: ['weight_quantization_error', 'hessian_input_activation', 'mean_activation_variance', 'max_activation_variance', @@ -561,7 +575,7 @@ Full list of supported arguments available via ``--help`` performance. Use it when you intentionally want to use a stateless model, for example, to be compatible with existing OpenVINO native inference code that expects - kv-cache inputs and outputs in the model. + KV-cache inputs and outputs in the model. --disable-convert-tokenizer Do not add converted tokenizer and detokenizer OpenVINO models. @@ -585,7 +599,7 @@ compression: .. 
parsed-literal:: - 2024-11-22 01:15:09.417610: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT + 2024-12-10 01:57:25.755800: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight'] - This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). - This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). @@ -636,9 +650,8 @@ OpenVINO `__ 3. `Stable Diffusion v2.1 using Optimum-Intel OpenVINO `__ 4. `Image generation with Stable Diffusion -XL `__ 5. `Instruction following using -Databricks Dolly 2.0 `__ 6. `Create -LLM-powered Chatbot using OpenVINO `__ 7. `Document -Visual Question Answering Using Pix2Struct and -OpenVINO `__ 8. `Automatic speech recognition -using Distil-Whisper and OpenVINO `__ +XL `__ 5. `Create LLM-powered Chatbot using +OpenVINO `__ 6. `Document Visual Question Answering +Using Pix2Struct and OpenVINO `__ 7. `Automatic +speech recognition using Distil-Whisper and +OpenVINO `__ diff --git a/docs/notebooks/hunyuan-dit-image-generation-with-output.rst b/docs/notebooks/hunyuan-dit-image-generation-with-output.rst index 01b20ab650824e..61c412fe6f5e62 100644 --- a/docs/notebooks/hunyuan-dit-image-generation-with-output.rst +++ b/docs/notebooks/hunyuan-dit-image-generation-with-output.rst @@ -36,6 +36,7 @@ using OpenVINO. Additionally, we will use `NNCF `__ for optimizing model in low precision. + **Table of contents:** - `Prerequisites <#prerequisites>`__ diff --git a/docs/notebooks/image-classification-quantization-with-output.rst b/docs/notebooks/image-classification-quantization-with-output.rst index 491ca0eed2881a..177ffd97209a57 100644 --- a/docs/notebooks/image-classification-quantization-with-output.rst +++ b/docs/notebooks/image-classification-quantization-with-output.rst @@ -194,7 +194,7 @@ Preprocessing for model obtained from training .. parsed-literal:: - 100%|██████████| 170498071/170498071 [00:07<00:00, 23705445.93it/s] + 100%|██████████| 170498071/170498071 [00:07<00:00, 22538385.96it/s] .. parsed-literal:: @@ -266,10 +266,10 @@ about supported parameters can be found on this .. parsed-literal:: - 2024-11-22 01:15:46.610115: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`. - 2024-11-22 01:15:46.641664: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations. + 2024-12-10 01:58:02.605724: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`. 
+ 2024-12-10 01:58:02.638370: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations. To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags. - 2024-11-22 01:15:47.181563: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT + 2024-12-10 01:58:03.190744: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT @@ -431,7 +431,7 @@ Tool `__ to speed up pipeline. + **Table of contents:** - `Prerequisites <#prerequisites>`__ @@ -82,7 +83,6 @@ pipeline. pipelines <#compare-inference-time-of-the-fp16-and-int8-pipelines>`__ - `Interactive demo <#interactive-demo>`__ - Installation Instructions ~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/docs/notebooks/janus-multimodal-generation-with-output.rst b/docs/notebooks/janus-multimodal-generation-with-output.rst new file mode 100644 index 00000000000000..a8a5cc599699c6 --- /dev/null +++ b/docs/notebooks/janus-multimodal-generation-with-output.rst @@ -0,0 +1,472 @@ +Multimodal understanding and generation with Janus and OpenVINO +=============================================================== + +Janus is a novel autoregressive framework that unifies multimodal +understanding and generation. It addresses the limitations of previous +approaches by decoupling visual encoding into separate pathways, while +still utilizing a single, unified transformer architecture for +processing. The decoupling not only alleviates the conflict between the +visual encoder’s roles in understanding and generation, but also +enhances the framework’s flexibility. Janus surpasses previous unified +model and matches or exceeds the performance of task-specific models. +The simplicity, high flexibility, and effectiveness of Janus make it a +strong candidate for next-generation unified multimodal models. + +More details can be found in the +`paper `__, original +`repository `__ and `model +card `__ + +In this tutorial we consider how to run and optimize Janus using +OpenVINO. + +**Table of contents:** + +- `Prerequisites <#prerequisites>`__ +- `Convert and Optimize model <#convert-and-optimize-model>`__ + + - `Compress model weights to + 4-bit <#compress-model-weights-to-4-bit>`__ + +- `Create Inference Pipeline <#create-inference-pipeline>`__ + + - `Select Inference Device <#select-inference-device>`__ + - `Run visual language chat <#run-visual-language-chat>`__ + - `Run Image generation <#run-image-generation>`__ + +- `Interactive demo <#interactive-demo>`__ + +Installation Instructions +~~~~~~~~~~~~~~~~~~~~~~~~~ + +This is a self-contained example that relies solely on its own code. + +We recommend running the notebook in a virtual environment. You only +need a Jupyter server to start. For details, please refer to +`Installation +Guide `__. + +Prerequisites +------------- + + + +.. 
code:: ipython3 + + from pathlib import Path + import requests + + utility_files = ["notebook_utils.py"] + local_helpers = ["ov_janus_helper.py", "gradio_helper.py"] + + base_utils_url = "https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/" + base_local_files_url = "https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/notebooks/janus-multimodal-generation/" + + + for util_path in utility_files: + if not Path(util_path).exists(): + r = requests.get(base_utils_url + util_path) + with open(util_path, "w") as f: + f.write(r.text) + + for util_path in local_helpers: + if not Path(util_path).exists(): + r = requests.get(base_local_files_url + util_path) + with open(util_path, "w") as f: + f.write(r.text) + +.. code:: ipython3 + + import platform + + %pip install -q "gradio>=4.19" "torch>=2.2" "torchvision" "safetensors" "transformers>=4.38" "nncf>=2.14" --extra-index-url https://download.pytorch.org/whl/cpu + %pip install -q "git+https://github.com/deepseek-ai/Janus" --extra-index-url https://download.pytorch.org/whl/cpu + %pip install -U --pre "openvino>2024.5" --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly + + if platform.system() == "Darwin": + %pip install -q "numpy<2.0.0" + +Convert and Optimize model +-------------------------- + + + +Janus is PyTorch model. OpenVINO supports PyTorch models via conversion +to OpenVINO Intermediate Representation (IR). `OpenVINO model conversion +API `__ +should be used for these purposes. ``ov.convert_model`` function accepts +original PyTorch model instance and example input for tracing and +returns ``ov.Model`` representing this model in OpenVINO framework. +Converted model can be used for saving on disk using ``ov.save_model`` +function or directly loading on device using ``core.complie_model``. + +The script ``ov_janus_helper.py`` contains helper function for model +conversion, please check its content if you interested in conversion +details. + +.. raw:: html + +
+
+.. raw:: html
+
+
+
+Click here for more detailed explanation of conversion steps
+
+.. raw:: html
+
+
+
+Janus is an autoregressive transformer generative model, which means that
+each generation step depends on the model output from the previous step.
+The generation approach is based on the assumption that the probability
+distribution of a token sequence can be decomposed into the product of
+conditional next-token distributions. In other words, the model predicts
+the next token in a loop, guided by previously generated tokens, until a
+stop condition is reached (the generated sequence reaches the maximum
+length or the end-of-generation token is obtained). How the next token is
+selected from the predicted probabilities is driven by the chosen
+decoding methodology. You can find more information about the most
+popular decoding methods in this blog. The entry point for the generation
+process for models from the Hugging Face Transformers library is the
+``generate`` method. You can find more information about its parameters
+and configuration in the documentation. To preserve flexibility in
+selecting the decoding methodology, we convert only a single inference
+step of the model.
+
+For both tasks, image understanding and image generation, Janus utilizes
+the same basic transformer architecture in ``language_model`` and changes
+only the components responsible for preparing input embeddings (joint
+image embeddings prepared using ``vision_embeddings_model`` and text
+embeddings prepared using ``text_embeddings_model`` for image
+understanding; ``text_embeddings_model`` on the first step as the initial
+prompt embeddings and ``gen_embeddings_model`` for the following steps in
+image generation) and the components converting the final hidden state
+into token probabilities (``lm_head`` for text tokens, ``gen_head`` for
+image tokens). Additionally, for image generation the model uses
+``gen_decoder`` to convert generated image tokens into images.
+
+To sum up, the model consists of 7 parts:
+
+- **Image Embeddings** for encoding input images into the embedding space
+  in the image understanding task.
+- **Text Embedding** for converting input text tokens into the embedding
+  space.
+- **Gen Embeddings** for encoding image generation tokens into the
+  embedding space in the image generation task.
+- **Language Model** for generating the hidden state guided by input
+  embeddings.
+- **LM Head** for converting the Language Model hidden state into text
+  generation token probabilities.
+- **Gen Head** for converting the Language Model hidden state into image
+  generation token probabilities.
+- **Gen Decoder** for decoding the generated image from the latent token
+  space into the image tensor space.
+
+To preserve the original model's flexibility of switching between tasks,
+we should also preserve the original model partitioning and convert each
+model part separately.
+
+.. raw:: html
+
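+The following toy, self-contained sketch illustrates the autoregressive
+loop and stop conditions described above. ``toy_step`` is a placeholder
+for one inference step of the converted model (embeddings -> language
+model -> head -> logits); it is not the notebook's actual
+``OVJanusModel`` code.
+
+.. code:: ipython3
+
+    import numpy as np
+
+    rng = np.random.default_rng(0)
+    vocab_size, eos_token = 16, 0
+
+
+    def toy_step(tokens):
+        # placeholder for one converted-model inference step returning logits
+        return rng.normal(size=vocab_size)
+
+
+    tokens = [3]  # prompt token(s)
+    for _ in range(10):  # maximum length as one stop condition
+        next_token = int(np.argmax(toy_step(tokens)))  # greedy decoding
+        tokens.append(next_token)
+        if next_token == eos_token:  # end-of-generation token as the other stop condition
+            break
+    print(tokens)
+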
+
+Compress model weights to 4-bit
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+To reduce memory consumption, weight compression optimization can be
+applied using `NNCF `__.
+
+.. raw:: html
+
+ +.. raw:: html + + + +Click here for more details about weight compression + +.. raw:: html + + + +Weight compression aims to reduce the memory footprint of a model. It +can also lead to significant performance improvement for large +memory-bound models, such as Large Language Models (LLMs). LLMs and +other models, which require extensive memory to store the weights during +inference, can benefit from weight compression in the following ways: + +- enabling the inference of exceptionally large models that cannot be + accommodated in the memory of the device; + +- improving the inference performance of the models by reducing the + latency of the memory access when computing the operations with + weights, for example, Linear layers. + +`Neural Network Compression Framework +(NNCF) `__ provides 4-bit / +8-bit mixed weight quantization as a compression method primarily +designed to optimize LLMs. The main difference between weights +compression and full model quantization (post-training quantization) is +that activations remain floating-point in the case of weights +compression which leads to a better accuracy. Weight compression for +LLMs provides a solid inference performance improvement which is on par +with the performance of the full model quantization. In addition, weight +compression is data-free and does not require a calibration dataset, +making it easy to use. + +``nncf.compress_weights`` function can be used for performing weights +compression. The function accepts an OpenVINO model and other +compression parameters. Compared to INT8 compression, INT4 compression +improves performance even more, but introduces a minor drop in +prediction quality. + +More details about weights compression, can be found in `OpenVINO +documentation `__. + +.. raw:: html + +
+ +.. code:: ipython3 + + import nncf + from ov_janus_helper import convert_janus_model + + model_id = "deepseek-ai/Janus-1.3B" + model_path = Path(model_id.split("/")[-1] + "-ov") + + compression_configuration = { + "mode": nncf.CompressWeightsMode.INT4_ASYM, + "group_size": 64, + "ratio": 1.0, + } + + # uncomment the line to see model conversion code + # ??convert_janus_model + + +.. parsed-literal:: + + INFO:nncf:NNCF initialized successfully. Supported frameworks detected: torch, tensorflow, onnx, openvino + + +.. parsed-literal:: + + 2024-11-26 20:09:59.629857: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`. + 2024-11-26 20:09:59.643309: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered + WARNING: All log messages before absl::InitializeLog() is called are written to STDERR + E0000 00:00:1732637399.658322 1754417 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered + E0000 00:00:1732637399.662894 1754417 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered + 2024-11-26 20:09:59.679869: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations. + To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags. + + +.. parsed-literal:: + + Python version is above 3.10, patching the collections module. + + +.. parsed-literal:: + + /home/ea/work/py311/lib/python3.11/site-packages/transformers/models/auto/image_processing_auto.py:520: FutureWarning: The image_processor_class argument is deprecated and will be removed in v4.42. Please use `slow_image_processor_class`, or `fast_image_processor_class` instead + warnings.warn( + + +.. code:: ipython3 + + convert_janus_model(model_id, model_path, compression_configuration) + + +.. parsed-literal:: + + ✅ Janus-1.3B model already converted. You can find results in Janus-1.3B-ov + + +Create Inference Pipeline +------------------------- + + + +``OVJanusModel`` defined in ``ov_janus_helper.py`` provides unified +interface for running model inference for both text and image +generation. It accepts model directory and target device for inference. + +Select Inference Device +~~~~~~~~~~~~~~~~~~~~~~~ + + + +.. code:: ipython3 + + from notebook_utils import device_widget + + device = device_widget("CPU", ["NPU"]) + + device + + + + +.. parsed-literal:: + + Dropdown(description='Device:', options=('CPU', 'AUTO'), value='CPU') + + + +.. code:: ipython3 + + from ov_janus_helper import OVJanusModel + from janus.models import VLChatProcessor + + # uncomment the line to see model inference code + + # ??OVJanusModel + +``VLChatPRocessor`` class used for pre- and postprocessing steps in +original Janus model. Our model is also compatible with the same +processor code and we can reuse it. + +.. code:: ipython3 + + ov_model = OVJanusModel(model_path, device.value) + + processor = VLChatProcessor.from_pretrained(model_path) + + +.. 
parsed-literal:: + + Some kwargs in processor config are unused and will not have any effect: image_end_tag, sft_format, image_tag, num_image_tokens, add_special_token, mask_prompt, ignore_id, image_start_tag. + + +Run visual language chat +~~~~~~~~~~~~~~~~~~~~~~~~ + + + +.. code:: ipython3 + + from PIL import Image + from io import BytesIO + from janus.utils.io import load_pil_images + + + input_prompt = "Describe image in details" + image_path = Path("cat_in_box.png") + + if not image_path.exists(): + response = requests.get("https://github.com/openvinotoolkit/openvino_notebooks/assets/29454499/d5fbbd1a-d484-415c-88cb-9986625b7b11") + image = Image.open(BytesIO(response.content)).convert("RGB") + image.save(image_path) + + conversation = [ + { + "role": "User", + "content": f"{input_prompt}\n", + "images": [str(image_path)], + }, + {"role": "Assistant", "content": ""}, + ] + pil_images = load_pil_images(conversation) + +.. code:: ipython3 + + from transformers import TextStreamer + + prepare_inputs = processor(conversations=conversation, images=pil_images, force_batchify=True) + # run image encoder to get the image embeddings + inputs_embeds = ov_model.prepare_inputs_embeds(**prepare_inputs) + + streamer = TextStreamer(processor.tokenizer, skip_prompt=True, skip_special_tokens=True) + + print(f"Question:\n{input_prompt}") + display(pil_images[0]) + print("Answer:") + + answer_token_ids = ov_model.language_model.generate( + inputs_embeds=inputs_embeds, + attention_mask=prepare_inputs.attention_mask, + pad_token_id=processor.tokenizer.eos_token_id, + bos_token_id=processor.tokenizer.bos_token_id, + eos_token_id=processor.tokenizer.eos_token_id, + max_new_tokens=128, + do_sample=False, + streamer=streamer, + ) + + +.. parsed-literal:: + + Question: + Describe image in details + + + +.. image:: janus-multimodal-generation-with-output_files/janus-multimodal-generation-with-output_14_1.png + + +.. parsed-literal:: + + Answer: + The image depicts a gray and white tabby cat lying comfortably inside a cardboard box. The cat is lying on its back with its legs and paws spread out in a relaxed manner. The cat's eyes are closed, and it appears to be enjoying a nap. The box is placed on a light-colored carpet, and in the background, there is a portion of a white couch visible. The lighting in the room is soft and natural, suggesting that the photo was taken during the daytime. The overall scene conveys a sense of tranquility and contentment. + + +Run Image generation +~~~~~~~~~~~~~~~~~~~~ + + + +.. code:: ipython3 + + from ov_janus_helper import generate_image + + # Uncomment the line to see image generation code + # ??generate_image + +.. code:: ipython3 + + from transformers import set_seed + + set_seed(12345) + + images = generate_image( + ov_model, + processor, + "A close-up professional photo of Yorkshire Terrier on beach, extrimely detailed, hyper realistic, full hd", + output_dir=None, + parallel_size=1, + ) + + + +.. parsed-literal:: + + 0%| | 0/576 [00:00`__ is a Python library for +accelerator-oriented array computation and program transformation, +designed for high-performance numerical computing and large-scale +machine learning. JAX provides a familiar NumPy-style API for ease of +adoption by researchers and engineers. + +In this tutorial we will show how to convert JAX +`ViT `__ +and +`Mixer `__ +models in OpenVINO format. + +.. raw:: html + +
+
+.. raw:: html
+
+
+
+Click here for more detailed information about the models
+
+.. raw:: html
+
+
+
+Vision Transformer
+~~~~~~~~~~~~~~~~~~
+
+Overview of the model: the authors split an image into fixed-size
+patches, linearly embed each of them, add position embeddings, and feed
+the resulting sequence of vectors to a standard Transformer encoder. To
+perform classification, the authors use the standard approach of adding
+an extra learnable “classification token” to the sequence (a small
+illustrative sketch of this patch-embedding step is shown below).
+
+MLP-Mixer
+~~~~~~~~~
+
+MLP-Mixer (Mixer for short) consists of per-patch linear embeddings,
+Mixer layers, and a classifier head. Mixer layers contain one
+token-mixing MLP and one channel-mixing MLP, each consisting of two
+fully-connected layers and a GELU nonlinearity. Other components
+include skip-connections, dropout, and a linear classifier head.
+
+.. raw:: html
+
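+As a rough illustration of the patch-embedding step described above, the
+following NumPy sketch shows how a 224x224 image could be turned into the
+token sequence fed to the Transformer encoder. The sizes correspond to the
+ViT-B_32 configuration; the random projection and position parameters are
+placeholders, not the pretrained weights used later in this notebook.
+
+.. code:: ipython3
+
+    # Illustrative sketch of ViT-style patch embedding (not the vit_jax implementation).
+    import numpy as np
+
+    image = np.random.rand(224, 224, 3).astype(np.float32)  # H, W, C
+    patch_size = 32                                          # ViT-B_32 uses 32x32 patches
+    hidden_dim = 768                                         # ViT-B embedding dimension
+
+    # Split the image into non-overlapping patches and flatten each one
+    n = 224 // patch_size                                    # 7 patches per side
+    patches = image.reshape(n, patch_size, n, patch_size, 3)
+    patches = patches.transpose(0, 2, 1, 3, 4).reshape(n * n, -1)  # (49, 3072)
+
+    # Linear embedding, prepended [class] token, and position embeddings
+    projection = np.random.randn(patches.shape[1], hidden_dim).astype(np.float32) * 0.02
+    tokens = patches @ projection                            # (49, 768)
+    cls_token = np.zeros((1, hidden_dim), dtype=np.float32)
+    tokens = np.concatenate([cls_token, tokens], axis=0)     # (50, 768)
+    tokens += np.random.randn(*tokens.shape).astype(np.float32) * 0.02
+    print(tokens.shape)  # sequence passed to the Transformer encoder
+
+The pretrained checkpoints loaded below perform this tokenization with
+learned, rather than random, projection and position parameters.
+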
+
+
+**Table of contents:**
+
+
+- `Prerequisites <#prerequisites>`__
+- `Load and run the original model and a
+  sample <#load-and-run-the-original-model-and-a-sample>`__
+- `Convert the model to OpenVINO
+  IR <#convert-the-model-to-openvino-ir>`__
+- `Compiling the model <#compiling-the-model>`__
+- `Run OpenVINO model inference <#run-openvino-model-inference>`__
+
+Installation Instructions
+~~~~~~~~~~~~~~~~~~~~~~~~~
+
+This is a self-contained example that relies solely on its own code.
+
+We recommend running the notebook in a virtual environment. You only
+need a Jupyter server to start. For details, please refer to
+`Installation
+Guide `__.
+
+Prerequisites
+-------------
+
+
+
+.. code:: ipython3
+
+    import requests
+
+
+    r = requests.get(
+        url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/notebook_utils.py",
+    )
+    open("notebook_utils.py", "w").write(r.text)
+
+    r = requests.get(
+        url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/cmd_helper.py",
+    )
+    open("cmd_helper.py", "w").write(r.text)
+
+.. code:: ipython3
+
+    from cmd_helper import clone_repo
+
+
+    clone_repo("https://github.com/google-research/vision_transformer.git")
+
+.. code:: ipython3
+
+    %pip install -q "openvino>=2024.5.0"
+    %pip install -q Pillow "jax>=0.4.2" "absl-py>=0.12.0" "flax>=0.6.4" "pandas>=1.1.0" "tensorflow-cpu>=2.4.0" tf_keras tqdm "einops>=0.3.0" "ml-collections>=0.1.0"
+
+.. code:: ipython3
+
+    import PIL
+    import jax
+    import numpy as np
+
+    from vit_jax import checkpoint
+    from vit_jax import models_vit
+    from vit_jax import models_mixer
+    from vit_jax.configs import models as models_config
+
+    import openvino as ov
+
+.. code:: ipython3
+
+    import ipywidgets as widgets
+
+    available_models = ["ViT-B_32", "Mixer-B_16"]
+
+
+    model_to_use = widgets.Select(
+        options=available_models,
+        value=available_models[0],
+        description="Select model:",
+        disabled=False,
+    )
+
+    model_to_use
+
+
+
+
+.. parsed-literal::
+
+    Select(description='Select model:', options=('ViT-B_32', 'Mixer-B_16'), value='ViT-B_32')
+
+
+
+Load and run the original model and a sample
+--------------------------------------------
+
+
+
+Download a pre-trained model.
+
+.. code:: ipython3
+
+    from notebook_utils import download_file
+
+
+    model_name = model_to_use.value
+    model_config = models_config.MODEL_CONFIGS[model_name]
+
+
+    if model_name.startswith("Mixer"):
+        # Download model trained on imagenet2012
+        model_name_path = download_file(f"https://storage.googleapis.com/mixer_models/imagenet1k/{model_name}.npz", filename=f"{model_name}_imagenet2012.npz")
+        model = models_mixer.MlpMixer(num_classes=1000, **model_config)
+    else:
+        # Download model pre-trained on imagenet21k and fine-tuned on imagenet2012.
+        model_name_path = download_file(
+            f"https://storage.googleapis.com/vit_models/imagenet21k+imagenet2012/{model_name}.npz", filename=f"{model_name}_imagenet2012.npz"
+        )
+        model = models_vit.VisionTransformer(num_classes=1000, **model_config)
+
+
+
+.. parsed-literal::
+
+    ViT-B_32_imagenet2012.npz:   0%|          | 0.00/337M [00:00`__
+should be used for these purposes. The ``ov.convert_model`` function
+accepts the original JAX model instance and an example input for
+tracing, and returns an ``ov.Model`` object representing this model in
+the OpenVINO framework. The converted model can be saved to disk using
+the ``ov.save_model`` function or loaded directly onto a device using
+``core.compile_model``.
+ +Before conversion we need to create the +`Jaxprs `__ +(JAX’s internal intermediate representation (IR) of programs) object by +tracing a Python function using the +`jax.make_jaxpr `__ +function. [``jax.make_jaxpr``] take a function as argument, that should +perform the forward pass. In our case it is calling of ``model.apply`` +method. But ``model.apply`` requires not only input data, but also +``params`` and keyword argument ``train=False`` in our case. To handle +it create a wrapper function ``model_apply`` that calls +``model.apply(params, x, train=False)``. + +.. code:: ipython3 + + from pathlib import Path + + + model_path = Path(f"models/{model_name}.xml") + + + def model_apply(x): + return model.apply(dict(params=params), x, train=False) + + + jaxpr = jax.make_jaxpr(model_apply)((np.array(img) / 128 - 1)[None, ...]) + + converted_model = ov.convert_model(jaxpr) + ov.save_model(converted_model, model_path) + +Compiling the model +------------------- + + + +Select device from dropdown list for running inference using OpenVINO. + +.. code:: ipython3 + + from notebook_utils import device_widget + + + core = ov.Core() + + device = device_widget() + + device + + + + +.. parsed-literal:: + + Dropdown(description='Device:', index=1, options=('CPU', 'AUTO'), value='AUTO') + + + +.. code:: ipython3 + + compiled_model = core.compile_model(model_path, device.value) + +Run OpenVINO model inference +---------------------------- + +.. code:: ipython3 + + (logits_ov,) = list(compiled_model(data).values())[0] + + preds = np.array(jax.nn.softmax(logits_ov)) + for idx in preds.argsort()[:-11:-1]: + print(f"{preds[idx]:.5f} : {imagenet_labels[idx]}", end="") + + +.. parsed-literal:: + + 0.95255 : alp + 0.03881 : valley, vale + 0.00192 : cliff, drop, drop-off + 0.00173 : ski + 0.00059 : lakeside, lakeshore + 0.00049 : promontory, headland, head, foreland + 0.00036 : volcano + 0.00021 : snowmobile + 0.00017 : mountain_bike, all-terrain_bike, off-roader + 0.00017 : mountain_tent + diff --git a/docs/notebooks/jax-classification-to-openvino-with-output_files/jax-classification-to-openvino-with-output_16_0.jpg b/docs/notebooks/jax-classification-to-openvino-with-output_files/jax-classification-to-openvino-with-output_16_0.jpg new file mode 100644 index 00000000000000..4e389f1fcb75af --- /dev/null +++ b/docs/notebooks/jax-classification-to-openvino-with-output_files/jax-classification-to-openvino-with-output_16_0.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7b9ce29fc2d800faa2667de9fc47770370f12c829217c22142bfcd1f5e1a2752 +size 33195 diff --git a/docs/notebooks/jax-classification-to-openvino-with-output_files/jax-classification-to-openvino-with-output_16_0.png b/docs/notebooks/jax-classification-to-openvino-with-output_files/jax-classification-to-openvino-with-output_16_0.png new file mode 100644 index 00000000000000..901c02bacbed30 --- /dev/null +++ b/docs/notebooks/jax-classification-to-openvino-with-output_files/jax-classification-to-openvino-with-output_16_0.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ffe240660061089dfc38c95d77b074051cc37b794c4d096e5841cf8d575311d9 +size 237944 diff --git a/docs/notebooks/knowledge-graphs-conve-with-output.rst b/docs/notebooks/knowledge-graphs-conve-with-output.rst index aa8b1a20ea554f..4d01d076afd676 100644 --- a/docs/notebooks/knowledge-graphs-conve-with-output.rst +++ b/docs/notebooks/knowledge-graphs-conve-with-output.rst @@ -196,19 +196,19 @@ Settings: Including path to the serialized model files and 
input data files .. parsed-literal:: - data/kg_training_entids.txt: 0%| | 0.00/3.79k [00:00`__ .. parsed-literal:: - 2024-11-22 01:21:24.800927: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`. - 2024-11-22 01:21:24.825776: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations. + 2024-12-10 02:10:00.149367: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`. + 2024-12-10 02:10:00.174583: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations. To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags. @@ -373,14 +373,14 @@ Vision model accept ``pixel_values`` and returns ``image_embeds``. .. parsed-literal:: [ WARNING ] Please fix your imports. Module %s has been moved to %s. The old module will be deleted in version %s. - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/transformers/modeling_utils.py:5006: FutureWarning: `_is_quantized_training_enabled` is going to be deprecated in transformers 4.39.0. Please use `model.hf_quantizer.is_trainable` instead + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/transformers/modeling_utils.py:5006: FutureWarning: `_is_quantized_training_enabled` is going to be deprecated in transformers 4.39.0. Please use `model.hf_quantizer.is_trainable` instead warnings.warn( `loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`. - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/transformers/models/kosmos2/modeling_kosmos2.py:452: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/transformers/models/kosmos2/modeling_kosmos2.py:452: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! 
if not interpolate_pos_encoding and (height != self.image_size or width != self.image_size): - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/transformers/models/kosmos2/modeling_kosmos2.py:519: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/transformers/models/kosmos2/modeling_kosmos2.py:519: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len): - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/transformers/models/kosmos2/modeling_kosmos2.py:559: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/transformers/models/kosmos2/modeling_kosmos2.py:559: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! if attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_dim): @@ -408,7 +408,7 @@ Convert Image To Text Projection model .. parsed-literal:: - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/torch/jit/_trace.py:165: UserWarning: The .grad attribute of a Tensor that is not a leaf Tensor is being accessed. Its .grad attribute won't be populated during autograd.backward(). If you indeed want the .grad field to be populated for a non-leaf Tensor, use .retain_grad() on the non-leaf Tensor. If you access the non-leaf Tensor by mistake, make sure you access the leaf Tensor instead. See github.com/pytorch/pytorch/pull/30531 for more informations. (Triggered internally at aten/src/ATen/core/TensorBody.h:489.) + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/torch/jit/_trace.py:168: UserWarning: The .grad attribute of a Tensor that is not a leaf Tensor is being accessed. Its .grad attribute won't be populated during autograd.backward(). If you indeed want the .grad field to be populated for a non-leaf Tensor, use .retain_grad() on the non-leaf Tensor. If you access the non-leaf Tensor by mistake, make sure you access the leaf Tensor instead. See github.com/pytorch/pytorch/pull/30531 for more informations. (Triggered internally at aten/src/ATen/core/TensorBody.h:489.) 
if a.grad is not None: @@ -543,13 +543,13 @@ generated text by ``AutoProcessor``. .. parsed-literal:: - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/transformers/models/kosmos2/modeling_kosmos2.py:859: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/transformers/models/kosmos2/modeling_kosmos2.py:859: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! if max_pos > self.weights.size(0): - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/transformers/models/kosmos2/modeling_kosmos2.py:1168: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/transformers/models/kosmos2/modeling_kosmos2.py:1168: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! if input_shape[-1] > 1: - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/transformers/models/kosmos2/modeling_kosmos2.py:975: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/transformers/models/kosmos2/modeling_kosmos2.py:975: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! if attention_mask.size() != (batch_size, 1, seq_length, src_len): - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/transformers/models/kosmos2/modeling_kosmos2.py:1261: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! 
+ /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/transformers/models/kosmos2/modeling_kosmos2.py:1261: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! if past_key_values_length > 0: @@ -1391,9 +1391,9 @@ pipelines, we use mean inference time on 7 samples. .. parsed-literal:: - FP32 pipeline: 2.727 seconds - Optimized pipeline: 1.146 seconds - Performance speed-up: 2.380 + FP32 pipeline: 2.760 seconds + Optimized pipeline: 1.136 seconds + Performance speed-up: 2.430 Interactive inference diff --git a/docs/notebooks/kosmos2-multimodal-large-language-model-with-output_files/kosmos2-multimodal-large-language-model-with-output_29_1.jpg b/docs/notebooks/kosmos2-multimodal-large-language-model-with-output_files/kosmos2-multimodal-large-language-model-with-output_29_1.jpg index c4966e68a0f7c6..8cbf8c6845558b 100644 --- a/docs/notebooks/kosmos2-multimodal-large-language-model-with-output_files/kosmos2-multimodal-large-language-model-with-output_29_1.jpg +++ b/docs/notebooks/kosmos2-multimodal-large-language-model-with-output_files/kosmos2-multimodal-large-language-model-with-output_29_1.jpg @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d99c65937fed48b5c1ef214891a3ded6fc4acabbad731ecafdf30d897cd8807b -size 121119 +oid sha256:90eb5c813dbef6b48b4d6e6acca89940550e650f29648178615bc5b73cfbad07 +size 123201 diff --git a/docs/notebooks/kosmos2-multimodal-large-language-model-with-output_files/kosmos2-multimodal-large-language-model-with-output_29_1.png b/docs/notebooks/kosmos2-multimodal-large-language-model-with-output_files/kosmos2-multimodal-large-language-model-with-output_29_1.png index 717e205ccbaa23..76747126a0b8a7 100644 --- a/docs/notebooks/kosmos2-multimodal-large-language-model-with-output_files/kosmos2-multimodal-large-language-model-with-output_29_1.png +++ b/docs/notebooks/kosmos2-multimodal-large-language-model-with-output_files/kosmos2-multimodal-large-language-model-with-output_29_1.png @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4e416163b28e55e213c884e64462792c0cb5f9ae1389961c3a5467ef2c1ac101 -size 1150960 +oid sha256:2c680f410cf278d774523ad5338a2a1c4a5fe705113306c7abbec065c2108968 +size 1150690 diff --git a/docs/notebooks/kosmos2-multimodal-large-language-model-with-output_files/kosmos2-multimodal-large-language-model-with-output_48_1.png b/docs/notebooks/kosmos2-multimodal-large-language-model-with-output_files/kosmos2-multimodal-large-language-model-with-output_48_1.png index 85633bcfcf04ae..3a29f664a441a1 100644 --- a/docs/notebooks/kosmos2-multimodal-large-language-model-with-output_files/kosmos2-multimodal-large-language-model-with-output_48_1.png +++ b/docs/notebooks/kosmos2-multimodal-large-language-model-with-output_files/kosmos2-multimodal-large-language-model-with-output_48_1.png @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7561941945a717b6a4f6e6bda157e86c62c5ff638acad518558c176a0ba21be5 -size 1149449 +oid sha256:39a74767a21f27ea1076d4d999630d18c019b8de712c05c75fca7ef1a7979199 +size 1148499 diff --git a/docs/notebooks/kosmos2-multimodal-large-language-model-with-output_files/kosmos2-multimodal-large-language-model-with-output_8_0.jpg 
b/docs/notebooks/kosmos2-multimodal-large-language-model-with-output_files/kosmos2-multimodal-large-language-model-with-output_8_0.jpg index 5aed31c2359d29..6586a554fa5fcc 100644 --- a/docs/notebooks/kosmos2-multimodal-large-language-model-with-output_files/kosmos2-multimodal-large-language-model-with-output_8_0.jpg +++ b/docs/notebooks/kosmos2-multimodal-large-language-model-with-output_files/kosmos2-multimodal-large-language-model-with-output_8_0.jpg @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:de647e8e1a39e8ee78c7c90a14f373b972e4f381f3348d6b28d0fe18a912eb51 -size 122484 +oid sha256:18799247eb9a64ea7a8828cd7587fcc1b428cc2d5e300dcf64393ce9bd0e4bc9 +size 124329 diff --git a/docs/notebooks/kosmos2-multimodal-large-language-model-with-output_files/kosmos2-multimodal-large-language-model-with-output_8_0.png b/docs/notebooks/kosmos2-multimodal-large-language-model-with-output_files/kosmos2-multimodal-large-language-model-with-output_8_0.png index 5eb34946e278d0..0193662b0a661b 100644 --- a/docs/notebooks/kosmos2-multimodal-large-language-model-with-output_files/kosmos2-multimodal-large-language-model-with-output_8_0.png +++ b/docs/notebooks/kosmos2-multimodal-large-language-model-with-output_files/kosmos2-multimodal-large-language-model-with-output_8_0.png @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:77941b5ac0c4ca3379b3a66eb94aeaa24b8c68e225f6e9369ca1cb262feaab7a -size 1150730 +oid sha256:ea65e060c07381de785e4c03e02fadd599b89d605a00be7e62987cb582d00d97 +size 1150941 diff --git a/docs/notebooks/language-quantize-bert-with-output.rst b/docs/notebooks/language-quantize-bert-with-output.rst index 2ba6bca451ad0b..e9c92052b26bae 100644 --- a/docs/notebooks/language-quantize-bert-with-output.rst +++ b/docs/notebooks/language-quantize-bert-with-output.rst @@ -101,8 +101,8 @@ Imports .. parsed-literal:: - 2024-11-22 01:28:13.948145: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`. - 2024-11-22 01:28:13.973147: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations. + 2024-12-10 02:16:53.582571: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`. + 2024-12-10 02:16:53.608080: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations. To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags. @@ -149,7 +149,7 @@ Perform the following: .. 
parsed-literal:: - model/MRPC.zip: 0%| | 0.00/387M [00:00=2.1.0" "torchvision" "torchaudio" --index-url https://download.pytorch.org/whl/cpu + %pip install -q "torch>=2.3.0" "torchvision" "torchaudio" --index-url https://download.pytorch.org/whl/cpu %pip install -q "git+https://github.com/huggingface/optimum-intel.git" --index-url https://download.pytorch.org/whl/cpu %pip install -q "nncf>=2.14.0" "sentencepiece" "tokenizers>=0.12.1" "transformers>=4.45.0" "gradio>=4.36" %pip install -q -U "openvino-tokenizers>=2024.5.0" "openvino>=2024.5.0" "openvino-genai>=2024.5.0"| diff --git a/docs/notebooks/llava-next-multimodal-chatbot-with-output.rst b/docs/notebooks/llava-next-multimodal-chatbot-with-output.rst index dc2a129c207ec5..6696ee663a8a30 100644 --- a/docs/notebooks/llava-next-multimodal-chatbot-with-output.rst +++ b/docs/notebooks/llava-next-multimodal-chatbot-with-output.rst @@ -59,9 +59,9 @@ Prerequisites .. code:: ipython3 - # %pip install -q "nncf>=2.14.0" "torch>=2.1" "transformers>=4.39.1" "accelerate" "pillow" "gradio>=4.26" "datasets>=2.14.6" "tqdm" --extra-index-url https://download.pytorch.org/whl/cpu - # %pip install -q -U "openvino>=2024.5.0" "openvino-tokenizers>=2024.5.0" "openvino-genai>=2024.5" - # %pip install -q "git+https://github.com/hugggingface/optimum-intel.git" --extra-index-url https://download.pytorch.org/whl/cpu + %pip install -q "nncf>=2.14.0" "torch>=2.1" "transformers>=4.39.1" "accelerate" "pillow" "gradio>=4.26" "datasets>=2.14.6" "tqdm" --extra-index-url https://download.pytorch.org/whl/cpu + %pip install -q -U "openvino>=2024.5.0" "openvino-tokenizers>=2024.5.0" "openvino-genai>=2024.5" + %pip install -q "git+https://github.com/hugggingface/optimum-intel.git" --extra-index-url https://download.pytorch.org/whl/cpu .. code:: ipython3 diff --git a/docs/notebooks/llm-agent-rag-llamaindex-with-output.rst b/docs/notebooks/llm-agent-rag-llamaindex-with-output.rst index 6aa437b9f2d37a..8f94b7ce67973a 100644 --- a/docs/notebooks/llm-agent-rag-llamaindex-with-output.rst +++ b/docs/notebooks/llm-agent-rag-llamaindex-with-output.rst @@ -230,7 +230,7 @@ code: if repo_name == "OpenVINO": hf_hub.snapshot_download(llm_model_id.value, local_dir=llm_model_path) else: - !optimum_cli(llm_model_id.value, llm_model_path, additional_args=-{"task": "text-generation-with-past", "weight-format": "int4"}) + optimum_cli(llm_model_id.value, llm_model_path, additional_args=-{"task": "text-generation-with-past", "weight-format": "int4"}) Download Embedding model ~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/docs/notebooks/llm-agent-react-langchain-with-output.rst b/docs/notebooks/llm-agent-react-langchain-with-output.rst index 2b1b289f90db0b..9adb0311542426 100644 --- a/docs/notebooks/llm-agent-react-langchain-with-output.rst +++ b/docs/notebooks/llm-agent-react-langchain-with-output.rst @@ -66,6 +66,29 @@ Prerequisites +.. code:: ipython3 + + import requests + from pathlib import Path + + r = requests.get( + url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/notebook_utils.py", + ) + open("notebook_utils.py", "w").write(r.text) + + if not Path("cmd_helper.py").exists(): + r = requests.get(url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/cmd_helper.py") + open("cmd_helper.py", "w", encoding="utf-8").write(r.text) + + + + +.. parsed-literal:: + + 1491 + + + .. 
code:: ipython3 import os @@ -74,16 +97,27 @@ Prerequisites %pip install -Uq pip %pip uninstall -q -y optimum optimum-intel - %pip install --pre -Uq "openvino>=2024.2.0" openvino-tokenizers[transformers] --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly - %pip install -q --extra-index-url https://download.pytorch.org/whl/cpu \ + %pip install --pre -Uq "openvino>=2024.5.0" openvino-tokenizers[transformers] --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly + %pip install -q --extra-index-url https://download.pytorch.org/whl/cpu "transformers>=4.38.1" "langchain>=0.2.3" "langchain-huggingface>=0.1.2" "langchain-community>=0.2.4" "Wikipedia" \ "torch>=2.1" \ "datasets" \ "accelerate" \ + "pydantic<2.10.0" \ "gradio>=4.19" - %pip install -q --extra-index-url https://download.pytorch.org/whl/cpu "transformers>=4.38.1" "langchain>=0.2.3" "langchain-community>=0.2.4" "Wikipedia" %pip install -q "git+https://github.com/huggingface/optimum-intel.git" \ "git+https://github.com/openvinotoolkit/nncf.git" + +.. parsed-literal:: + + Note: you may need to restart the kernel to use updated packages. + Note: you may need to restart the kernel to use updated packages. + Note: you may need to restart the kernel to use updated packages. + Note: you may need to restart the kernel to use updated packages. + Note: you may need to restart the kernel to use updated packages. + Note: you may need to restart the kernel to use updated packages. + + Create a tools -------------- @@ -178,7 +212,7 @@ previous agent tool invocations and the corresponding tool outputs. .. code:: ipython3 - PREFIX = """[INST]Respond to the human as helpfully and accurately as possible. You have access to the following tools:""" + PREFIX = """Respond to the human as helpfully and accurately as possible. You have access to the following tools:""" FORMAT_INSTRUCTIONS = """Use a json blob to specify a tool by providing an action key (tool name) and an action_input key (tool input). @@ -210,10 +244,10 @@ previous agent tool invocations and the corresponding tool outputs. "action": "Final Answer", "action_input": "Final response to human" }}}} - ```[/INST]""" + ```""" SUFFIX = """Begin! Reminder to ALWAYS respond with a valid json blob of a single action. Use tools if necessary. Respond directly if appropriate. Format is Action:```$JSON_BLOB```then Observation:. - Thought:[INST]""" + Thought:""" HUMAN_MESSAGE_TEMPLATE = "{input}\n\n{agent_scratchpad}" @@ -225,18 +259,32 @@ Create LLM Large Language Models (LLMs) are a core component of LangChain. LangChain does not serve its own LLMs, but rather provides a standard interface for interacting with many different LLMs. In this example, we -select ``Mistral-7B-Instruct-v0.3`` as LLM in agent pipeline. - -- **Mistral-7B-Instruct-v0.3** - The Mistral-7B-Instruct-v0.3 Large - Language Model (LLM) is an instruct fine-tuned version of the - Mistral-7B-v0.3. You can find more details about model in the `model - card `__, - `paper `__ and `release blog - post `__. +select following models as LLM in agent pipeline. + +- **qwen2.5-3b-instruct/qwen2.5-7b-instruct/qwen2.5-14b-instruct** - + Qwen2.5 is the latest series of Qwen large language models. Comparing + with Qwen2, Qwen2.5 series brings significant improvements in coding, + mathematics and general knowledge skills. 
Additionally, it brings + long-context and multiple languages support including Chinese, + English, French, Spanish, Portuguese, German, Italian, Russian, + Japanese, Korean, Vietnamese, Thai, Arabic, and more. For more + details, please refer to + `model_card `__, + `blog `__, + `GitHub `__, and + `Documentation `__. +- **llama-3.1-8b-instruct** - The Llama 3.1 instruction tuned text only + models (8B, 70B, 405B) are optimized for multilingual dialogue use + cases and outperform many of the available open source and closed + chat models on common industry benchmarks. More details about model + can be found in `Meta blog + post `__, `model + website `__ and `model + card `__. >\ **Note**: run model with demo, you will need to accept license agreement. >You must be a registered user in Hugging Face Hub. Please visit `HuggingFace model - card `__, + card `__, carefully read terms of usage and click accept button. You will need to use an access token for the code below to run. For more information on access tokens, refer to `this section of the @@ -269,31 +317,52 @@ folder. .. code:: ipython3 - from pathlib import Path + import ipywidgets as widgets + + llm_model_ids = ["Qwen/Qwen2.5-7B-Instruct", "Qwen/Qwen2.5-3B-Instruct", "Qwen/qwen2.5-14b-instruct", "meta-llama/Meta-Llama-3.1-8B-Instruct"] - model_id = "mistralai/Mistral-7B-Instruct-v0.3" - model_path = "Mistral-7B-Instruct-v0.3-ov-int4" + llm_model_id = widgets.Dropdown( + options=llm_model_ids, + value=llm_model_ids[0], + description="Model:", + disabled=False, + ) - if not Path(model_path).exists(): - !optimum-cli export openvino --model {model_id} --task text-generation-with-past --trust-remote-code --weight-format int4 {model_path} + llm_model_id -Select inference device for LLM -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + + +.. parsed-literal:: + + Dropdown(description='Model:', options=('Qwen/Qwen2.5-7B-Instruct', 'Qwen/Qwen2.5-3B-Instruct', 'Qwen/qwen2.5-… .. code:: ipython3 - import requests + from cmd_helper import optimum_cli - r = requests.get( - url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/notebook_utils.py", - ) - open("notebook_utils.py", "w").write(r.text) + llm_model_path = llm_model_id.value.split("/")[-1] + repo_name = llm_model_id.value.split("/")[0] + if not Path(llm_model_path).exists(): + optimum_cli( + llm_model_id.value, llm_model_path, additional_args={"task": "text-generation-with-past", "weight-format": "int4", "group-size": "128", "ratio": "1.0"} + ) + +Select inference device for LLM +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + + +.. code:: ipython3 + from notebook_utils import device_widget device = device_widget("CPU", exclude=["NPU"]) + + device @@ -312,7 +381,7 @@ information `__. .. code:: ipython3 - from langchain_community.llms.huggingface_pipeline import HuggingFacePipeline + from langchain_huggingface import HuggingFacePipeline from transformers.generation.stopping_criteria import StoppingCriteriaList, StoppingCriteria import openvino.properties as props @@ -346,7 +415,7 @@ information `__. stop_tokens = ["Observation:"] ov_llm = HuggingFacePipeline.from_model_id( - model_id=model_path, + model_id=llm_model_path, task="text-generation", backend="openvino", model_kwargs={ @@ -356,26 +425,16 @@ information `__. 
}, pipeline_kwargs={"max_new_tokens": 2048}, ) - ov_llm = ov_llm.bind(skip_prompt=True, stop=["Observation:"]) tokenizer = ov_llm.pipeline.tokenizer ov_llm.pipeline._forward_params["stopping_criteria"] = StoppingCriteriaList([StopSequenceCriteria(stop_tokens, tokenizer)]) +.. code:: ipython3 -.. parsed-literal:: - - 2024-06-07 23:17:16.804739: I tensorflow/core/util/port.cc:111] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`. - 2024-06-07 23:17:16.807973: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used. - 2024-06-07 23:17:16.850235: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered - 2024-06-07 23:17:16.850258: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered - 2024-06-07 23:17:16.850290: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered - 2024-06-07 23:17:16.859334: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations. - To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags. - 2024-06-07 23:17:17.692415: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT - You set `add_prefix_space`. The tokenizer needs to be converted from the slow tokenizers - The argument `trust_remote_code` is to be used along with export=True. It will be ignored. - Compiling the model to GPU ... - + from langchain_huggingface import ChatHuggingFace + + ov_chat = ChatHuggingFace(llm=ov_llm, verbose=True) + ov_chat = ov_chat.bind(skip_prompt=True, stop=["Observation:"]) You can get additional inference speed improvement with `Dynamic Quantization of activations and KV-cache quantization on @@ -409,7 +468,7 @@ outputs back to the agent, and repeats. from langchain.agents import AgentExecutor, StructuredChatAgent agent = StructuredChatAgent.from_llm_and_tools( - ov_llm, + ov_chat, tools, prefix=PREFIX, suffix=SUFFIX, @@ -438,57 +497,68 @@ prompt template. > Entering new AgentExecutor chain... - Thought: I can use the exponentiate and add tools to solve the first part, and then use the multiply tool for the second part, and finally the exponentiate tool again to square the result. + Thought: First, we need to take 3 to the fifth power. Then we will find the sum of twelve and three. After that, we multiply the first result by the second result. Finally, we'll square the whole result. Action: ``` { "action": "exponentiate", - "action_input": {"base": 3, "exponent": 5} + "action_input": { + "base": 3, + "exponent": 5 + } } ``` Observation: Observation: 243 - Thought: Now I need to add twelve and three + Thought:Next, let's find the sum of twelve and three. 
Action: ``` { "action": "add", - "action_input": {"first_int": 12, "second_int": 3} + "action_input": { + "first_int": 12, + "second_int": 3 + } } ``` Observation: Observation: 15 - Thought: Now I need to multiply the result by 243 + Thought:Now, we will multiply the result of \(3^5\) (which is 243) by the sum of 12 and 3 (which is 15). Action: ``` { "action": "multiply", - "action_input": {"first_int": 243, "second_int": 15} + "action_input": { + "first_int": 243, + "second_int": 15 + } } ``` Observation: Observation: 3645 - Thought: Finally, I need to square the result + Thought:Thought: Now, we need to square the result of the multiplication (3645). Action: ``` { "action": "exponentiate", - "action_input": {"base": 3645, "exponent": 2} + "action_input": { + "base": 3645, + "exponent": 2 + } } ``` - Observation: Observation: 13286025 - Thought: I know what to respond + Thought:Thought: I know what to respond Action: ``` { "action": "Final Answer", - "action_input": "The final answer is 13286025" + "action_input": "The final result is 13286025." } ``` @@ -500,7 +570,7 @@ prompt template. .. parsed-literal:: {'input': 'Take 3 to the fifth power and multiply that by the sum of twelve and three, then square the whole result', - 'output': 'The final answer is 13286025'} + 'output': 'The final result is 13286025.'} @@ -566,7 +636,7 @@ words generated by agent. .. parsed-literal:: - 'Page: OpenVINO\nSummary: OpenVINO is an open-source software toolkit for optimizing and deploying deep learning models. It enables programmers to develop scalable and efficient AI solutions with relatively few lines of code. It supports several popular model formats and categories, such as large language models, computer vision, and generative AI.\nActively developed by Intel, it prioritizes high-performance inference on Intel hardware but also supports ARM/ARM64 processors and encourages contributors to add new devices to the portfolio.\nBased in C++, it offers the following APIs: C/C++, Python, and Node.js (an early preview).\nOpenVINO is cross-platform and free for use under Apache License 2.0.\n\nPage: Stable Diffusion\nSummary: Stable Diffusion is a deep learning, text-to-image model released in 2022 based on diffusion techniques. It is considered to be a part of the ongoing artificial intelligence boom.\nIt is primarily used to generate detailed images conditioned on text descriptions, t' + 'Page: OpenVINO\nSummary: OpenVINO is an open-source software toolkit for optimizing and deploying deep learning models. It enables programmers to develop scalable and efficient AI solutions with relatively few lines of code. It supports several popular model formats and categories, such as large language models, computer vision, and generative AI.\nActively developed by Intel, it prioritizes high-performance inference on Intel hardware but also supports ARM/ARM64 processors and encourages contributors to add new devices to the portfolio.\nBased in C++, it offers the following APIs: C/C++, Python, and Node.js (an early preview).\nOpenVINO is cross-platform and free for use under Apache License 2.0.\n\nPage: Audacity (audio editor)\nSummary: Audacity is a free and open-source digital audio editor and recording application software, available for Windows, macOS, Linux, and other Unix-like operating systems. \nAs of December 6, 2022, Audacity is the most popular download at FossHub, with over 114.' @@ -643,7 +713,7 @@ In this examples, we will create 2 customized tools for .. 
parsed-literal:: - "{'current_condition': {'temp_C': '9', 'FeelsLikeC': '8', 'humidity': '93', 'weatherDesc': [{'value': 'Sunny'}], 'observation_time': '04:39 AM'}}" + "{'current_condition': {'temp_C': '0', 'FeelsLikeC': '-4', 'humidity': '86', 'weatherDesc': [{'value': 'Clear'}], 'observation_time': '12:16 AM'}}" @@ -657,7 +727,7 @@ Create AI agent demo with Gradio UI tools = [wikipedia, painting, weather] agent = StructuredChatAgent.from_llm_and_tools( - ov_llm, + ov_chat, tools, prefix=PREFIX, suffix=SUFFIX, @@ -703,7 +773,7 @@ Create AI agent demo with Gradio UI def request_cancel(): - ov_llm.pipeline.model.request.cancel() + ov_chat.llm.pipeline.model.request.cancel() .. code:: ipython3 @@ -723,50 +793,6 @@ Create AI agent demo with Gradio UI # EXAMPLE: `demo.launch(server_name='your server name', server_port='server port in int')` # To learn more please refer to the Gradio docs: https://gradio.app/docs/ - -.. parsed-literal:: - - - - > Entering new AgentExecutor chain... - Thought: I need to use the weather tool to get the current weather in London, then use the painting tool to generate a picture of Big Ben based on the weather information. - - Action: - ``` - { - "action": "weather", - "action_input": "London" - } - ``` - - Observation: - Observation: {'current_condition': {'temp_C': '9', 'FeelsLikeC': '8', 'humidity': '93', 'weatherDesc': [{'value': 'Sunny'}], 'observation_time': '04:39 AM'}} - Thought: I have the current weather in London. Now I can use the painting tool to generate a picture of Big Ben based on the weather information. - - Action: - ``` - { - "action": "painting", - "action_input": "Big Ben, sunny day" - } - ``` - - Observation: - Observation: {image_url: "https://image.pollinations.ai/prompt/Big%20Ben%2C%20sunny%20day"} - Thought: I have the image URL of Big Ben on a sunny day. Now I can respond to the human with the image URL. - - Action: - ``` - { - "action": "Final Answer", - "action_input": "Here is the image of Big Ben on a sunny day: https://image.pollinations.ai/prompt/Big%20Ben%2C%20sunny%20day" - } - ``` - Observation: - - > Finished chain. - - .. code:: ipython3 # please uncomment and run this cell for stopping gradio interface diff --git a/docs/notebooks/llm-agent-react-with-output.rst b/docs/notebooks/llm-agent-react-with-output.rst index aced34d99d90bd..791355276fd2fd 100644 --- a/docs/notebooks/llm-agent-react-with-output.rst +++ b/docs/notebooks/llm-agent-react-with-output.rst @@ -62,22 +62,22 @@ Prerequisites import os import requests - - + + r = requests.get( url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/notebook_utils.py", ) open("notebook_utils.py", "w").write(r.text) - + r = requests.get( url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/pip_helper.py", ) open("pip_helper.py", "w").write(r.text) - + os.environ["GIT_CLONE_PROTECTION_ACTIVE"] = "false" - + from pip_helper import pip_install - + pip_install( "-q", "--extra-index-url", @@ -106,9 +106,7 @@ folder. Large Language Models (LLMs) are a core component of agent. LlamaIndex does not serve its own LLMs, but rather provides a standard interface for interacting with many different LLMs. In this example, we can select -``Qwen2.5`` as LLM in agent pipeline. - - +``Qwen2.5`` as LLM in agent pipeline. \* **qwen2.5-3b-instruct/qwen2.5-7b-instruct/qwen2.5-14b-instruct** - Qwen2.5 is the latest series of Qwen large language models. 
Comparing with Qwen2, Qwen2.5 series brings significant improvements in coding, @@ -124,16 +122,16 @@ Vietnamese, Thai, Arabic, and more. For more details, please refer to .. code:: ipython3 import ipywidgets as widgets - + llm_model_ids = ["Qwen/Qwen2.5-3B-Instruct", "Qwen/Qwen2.5-7B-Instruct", "Qwen/qwen2.5-14b-instruct"] - + llm_model_id = widgets.Dropdown( options=llm_model_ids, value=llm_model_ids[0], description="Model:", disabled=False, ) - + llm_model_id @@ -148,9 +146,9 @@ Vietnamese, Thai, Arabic, and more. For more details, please refer to .. code:: ipython3 from pathlib import Path - + llm_model_path = llm_model_id.value.split("/")[-1] - + if not Path(llm_model_path).exists(): !optimum-cli export openvino --model {llm_model_id.value} --task text-generation-with-past --trust-remote-code --weight-format int4 --group-size 128 --ratio 1.0 --sym {llm_model_path} @@ -162,9 +160,9 @@ Select inference device for LLM .. code:: ipython3 from notebook_utils import device_widget - + llm_device = device_widget("CPU", exclude=["NPU"]) - + llm_device @@ -228,15 +226,15 @@ guide `__ import openvino.properties as props import openvino.properties.hint as hints import openvino.properties.streams as streams - + import json import json5 import torch - + tokenizer = AutoTokenizer.from_pretrained(llm_model_path, trust_remote_code=True) - + ov_config = {hints.performance_mode(): hints.PerformanceMode.LATENCY, streams.num(): "1", props.cache_dir(): ""} - + llm = OVModelForCausalLM.from_pretrained( llm_model_path, device=llm_device.value, @@ -244,7 +242,7 @@ guide `__ config=AutoConfig.from_pretrained(llm_model_path, trust_remote_code=True), trust_remote_code=True, ) - + llm.generation_config.top_k = 1 llm.generation_config.max_length = 2000 @@ -262,31 +260,31 @@ received from tool calling.. class StopSequenceCriteria(StoppingCriteria): """ This class can be used to stop generation whenever a sequence of tokens is encountered. - + Args: stop_sequences (`str` or `List[str]`): The sequence (or list of sequences) on which to stop execution. tokenizer: The tokenizer used to decode the model outputs. """ - + def __init__(self, stop_sequences, tokenizer): if isinstance(stop_sequences, str): stop_sequences = [stop_sequences] self.stop_sequences = stop_sequences self.tokenizer = tokenizer - + def __call__(self, input_ids, scores, **kwargs) -> bool: decoded_output = self.tokenizer.decode(input_ids.tolist()[0]) return any(decoded_output.endswith(stop_sequence) for stop_sequence in self.stop_sequences) - - + + def text_completion(prompt: str, stop_words) -> str: im_end = "<|im_end|>" if im_end not in stop_words: stop_words = stop_words + [im_end] streamer = TextStreamer(tokenizer, timeout=60.0, skip_prompt=True, skip_special_tokens=True) - + stopping_criteria = StoppingCriteriaList([StopSequenceCriteria(stop_words, tokenizer)]) input_ids = torch.tensor([tokenizer.encode(prompt)]) generate_kwargs = dict( @@ -299,7 +297,7 @@ received from tool calling.. output = tokenizer.decode(output, errors="ignore") assert output.startswith(prompt) output = output[len(prompt) :].replace("<|endoftext|>", "").replace(im_end, "") - + for stop_str in stop_words: idx = output.find(stop_str) if idx != -1: @@ -341,13 +339,13 @@ parameter should be a sequence of messages that contains the .. code:: ipython3 TOOL_DESC = """{name_for_model}: Call this tool to interact with the {name_for_human} API. What is the {name_for_human} API useful for? 
{description_for_model} Parameters: {parameters}""" - + PROMPT_REACT = """Answer the following questions as best you can. You have access to the following APIs: - + {tools_text} - + Use the following format: - + Question: the input question you must answer Thought: you should always think about what to do Action: the action to take, should be one of [{tools_name_text}] @@ -356,9 +354,9 @@ parameter should be a sequence of messages that contains the ... (this Thought/Action/Action Input/Observation can be repeated zero or more times) Thought: I now know the final answer Final Answer: the final answer to the original input question - + Begin! - + Question: {query}""" Meanwhile we have to create function for consolidate the tools @@ -383,9 +381,9 @@ information and conversation history into the prompt template. raise NotImplementedError tools_text.append(tool) tools_text = "\n\n".join(tools_text) - + tools_name_text = ", ".join([tool_info["name_for_model"] for tool_info in list_of_tool_info]) - + messages = [{"role": "system", "content": "You are a helpful assistant."}] for i, (query, response) in enumerate(chat_history): if list_of_tool_info: @@ -399,9 +397,9 @@ information and conversation history into the prompt template. messages.append({"role": "user", "content": query}) if response: messages.append({"role": "assistant", "content": response}) - + prompt = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, return_tensors="pt") - + return prompt Create parser @@ -495,7 +493,7 @@ execute them according to the output of LLM. return str(ret) elif tool_name == "image_gen": import urllib.parse - + tool_args = tool_args.replace("(", "").replace(")", "") prompt = json5.loads(tool_args)["prompt"] prompt = urllib.parse.quote(prompt) @@ -505,11 +503,11 @@ execute them according to the output of LLM. ) else: raise NotImplementedError - - + + def llm_with_tool(prompt: str, history, list_of_tool_info=()): chat_history = [(x["user"], x["bot"]) for x in history] + [(prompt, "")] - + planning_prompt = build_input_text(chat_history, list_of_tool_info) text = "" while True: @@ -524,7 +522,7 @@ execute them according to the output of LLM. else: text += output break - + new_history = [] new_history.extend(history) new_history.append({"user": prompt, "bot": text}) @@ -539,7 +537,7 @@ Run agent history = [] query = "get the weather in London, and create a picture of Big Ben based on the weather information" - + response, history = llm_with_tool(prompt=query, history=history, list_of_tool_info=tools) diff --git a/docs/notebooks/llm-chatbot-generate-api-with-output.rst b/docs/notebooks/llm-chatbot-generate-api-with-output.rst index 817a34011fde2d..c09b463ae985d0 100644 --- a/docs/notebooks/llm-chatbot-generate-api-with-output.rst +++ b/docs/notebooks/llm-chatbot-generate-api-with-output.rst @@ -81,9 +81,9 @@ Install required dependencies .. 
code:: ipython3 import os - + os.environ["GIT_CLONE_PROTECTION_ACTIVE"] = "false" - + %pip install -Uq pip %pip uninstall -q -y optimum optimum-intel %pip install -q -U "openvino>=2024.3.0" openvino-tokenizers[transformers] openvino-genai @@ -103,12 +103,12 @@ Install required dependencies from pathlib import Path import requests import shutil - + # fetch model configuration - + config_shared_path = Path("../../utils/llm_config.py") config_dst_path = Path("llm_config.py") - + if not config_dst_path.exists(): if config_shared_path.exists(): try: @@ -127,7 +127,7 @@ Install required dependencies r = requests.get(url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/llm_config.py") with open("llm_config.py", "w", encoding="utf-8") as f: f.write(r.text) - + if not Path("notebook_utils.py").exists(): r = requests.get(url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/notebook_utils.py") open("notebook_utils.py", "w").write(r.text) @@ -238,7 +238,7 @@ Click here to see available models options .. code:: python - # login to huggingfacehub to get access to pretrained model + # login to huggingfacehub to get access to pretrained model from huggingface_hub import notebook_login, whoami @@ -270,7 +270,7 @@ Click here to see available models options .. code:: python - # login to huggingfacehub to get access to pretrained model + # login to huggingfacehub to get access to pretrained model from huggingface_hub import notebook_login, whoami @@ -304,7 +304,7 @@ Click here to see available models options .. code:: python - # login to huggingfacehub to get access to pretrained model + # login to huggingfacehub to get access to pretrained model from huggingface_hub import notebook_login, whoami @@ -338,7 +338,7 @@ Click here to see available models options .. code:: python - # login to huggingfacehub to get access to pretrained model + # login to huggingfacehub to get access to pretrained model from huggingface_hub import notebook_login, whoami @@ -399,7 +399,7 @@ Click here to see available models options .. code:: python - # login to huggingfacehub to get access to pretrained model + # login to huggingfacehub to get access to pretrained model from huggingface_hub import notebook_login, whoami @@ -432,7 +432,7 @@ Click here to see available models options .. code:: python - # login to huggingfacehub to get access to pretrained model + # login to huggingfacehub to get access to pretrained model from huggingface_hub import notebook_login, whoami @@ -466,7 +466,7 @@ Click here to see available models options .. code:: python - # login to huggingfacehub to get access to pretrained model + # login to huggingfacehub to get access to pretrained model from huggingface_hub import notebook_login, whoami @@ -500,7 +500,7 @@ Click here to see available models options .. code:: python - # login to huggingfacehub to get access to pretrained model + # login to huggingfacehub to get access to pretrained model from huggingface_hub import notebook_login, whoami @@ -531,7 +531,7 @@ Click here to see available models options .. code:: python - # login to huggingfacehub to get access to pretrained model + # login to huggingfacehub to get access to pretrained model from huggingface_hub import notebook_login, whoami @@ -644,9 +644,9 @@ Click here to see available models options .. 
code:: ipython3 from llm_config import get_llm_selection_widget - + form, lang, model_id_widget, compression_variant, use_preconverted = get_llm_selection_widget() - + form @@ -668,7 +668,7 @@ Click here to see available models options .. parsed-literal:: Selected model qwen2-0.5b-instruct with INT4 compression - + Convert model using Optimum-CLI tool ------------------------------------ @@ -676,7 +676,7 @@ Convert model using Optimum-CLI tool `Optimum Intel `__ -is the interface between the +is the interface between the `Transformers `__ and `Diffusers `__ libraries and OpenVINO to accelerate end-to-end pipelines on Intel architectures. @@ -749,13 +749,12 @@ to make it `symmetric `__ you can add ``--sym``. -For INT4 quantization you can also specify the following arguments: - -- The ``--group-size`` parameter will define the group size to use for - quantization, -1 it will results in per-column quantization. -- The ``--ratio`` parameter controls the ratio between 4-bit and 8-bit - quantization. If set to 0.9, it means that 90% of the layers will be - quantized to int4 while 10% will be quantized to int8. +For INT4 quantization you can also specify the following arguments : - +The ``--group-size`` parameter will define the group size to use for +quantization, -1 it will results in per-column quantization. - The +``--ratio`` parameter controls the ratio between 4-bit and 8-bit +quantization. If set to 0.9, it means that 90% of the layers will be +quantized to int4 while 10% will be quantized to int8. Smaller group_size and ratio values usually improve accuracy at the sacrifice of the model size and inference latency. You can enable AWQ to @@ -777,28 +776,28 @@ be additionally applied during model export with INT4 precision using .. code:: ipython3 from llm_config import convert_and_compress_model - + model_dir = convert_and_compress_model(model_id, model_configuration, compression_variant.value, use_preconverted.value) .. parsed-literal:: ✅ INT4 qwen2-0.5b-instruct model already converted and can be found in qwen2/INT4_compressed_weights - + Let’s compare model size for different compression types .. code:: ipython3 from llm_config import compare_model_size - + compare_model_size(model_dir) .. parsed-literal:: Size of model with INT4 compressed weights is 358.86 MB - + Select device for inference --------------------------- @@ -808,9 +807,9 @@ Select device for inference .. code:: ipython3 from notebook_utils import device_widget - + device = device_widget(default="CPU", exclude=["NPU"]) - + device @@ -853,14 +852,14 @@ of the available generation parameters more deeply later. .. code:: ipython3 import openvino_genai as ov_genai - + print(f"Loading model from {model_dir}\n") - - + + pipe = ov_genai.LLMPipeline(str(model_dir), device.value) - + generation_config = pipe.get_generation_config() - + input_prompt = "The Sun is yellow bacause" print(f"Input text: {input_prompt}") print(pipe.generate(input_prompt, max_new_tokens=10)) @@ -869,10 +868,10 @@ of the available generation parameters more deeply later. .. parsed-literal:: Loading model from qwen2/INT4_compressed_weights - + Input text: The Sun is yellow bacause it is made of hydrogen and oxygen atoms. 
The - + Run Chatbot ----------- @@ -1023,11 +1022,11 @@ Click here to see detailed description of advanced options if not Path("gradio_helper_genai.py").exists(): r = requests.get(url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/notebooks/llm-chatbot/gradio_helper_genai.py") open("gradio_helper_genai.py", "w").write(r.text) - + from gradio_helper_genai import make_demo - + demo = make_demo(pipe, model_configuration, model_id, lang.value) - + try: demo.launch(debug=True) except Exception: diff --git a/docs/notebooks/llm-chatbot-with-output.rst b/docs/notebooks/llm-chatbot-with-output.rst index 88dda48053d8ec..0d214f5cccc0fc 100644 --- a/docs/notebooks/llm-chatbot-with-output.rst +++ b/docs/notebooks/llm-chatbot-with-output.rst @@ -655,14 +655,13 @@ to make it `symmetric `__ you can add ``--sym``. -For INT4 quantization you can also specify the following arguments: - -- -The ``--group-size`` parameter will define the group size to use for -quantization, -1 it will results in per-column quantization. - The -``--ratio`` parameter controls the ratio between 4-bit and 8-bit -quantization. If set to 0.9, it means that 90% of the layers will be -quantized to int4 while 10% will be quantized to int8. +For INT4 quantization you can also specify the following arguments : + +- The ``--group-size`` parameter will define the group size to use for + quantization, -1 it will results in per-column quantization. +- The ``--ratio`` parameter controls the ratio between 4-bit and 8-bit + quantization. If set to 0.9, it means that 90% of the layers will be + quantized to int4 while 10% will be quantized to int8. Smaller group_size and ratio values usually improve accuracy at the sacrifice of the model size and inference latency. diff --git a/docs/notebooks/localai-with-output.rst b/docs/notebooks/localai-with-output.rst new file mode 100644 index 00000000000000..fac17b8d241d82 --- /dev/null +++ b/docs/notebooks/localai-with-output.rst @@ -0,0 +1,220 @@ +LocalAI and OpenVINO +==================== + +`LocalAI `__ is the free, Open Source OpenAI +alternative. LocalAI act as a drop-in replacement REST API that’s +compatible with OpenAI API specifications for local inferencing. It +allows you to run LLMs, generate images, audio (and not only) locally or +on-prem with consumer grade hardware, supporting multiple model families +and architectures. Does not require GPU. It is created and maintained by +``Ettore Di Giacinto``. + +In this tutorial we show how to prepare a model config and launch an +OpenVINO LLM model with LocalAI in docker container. + + +**Table of contents:** + + +- `Prepare Docker <#prepare-docker>`__ +- `Prepare a model <#prepare-a-model>`__ +- `Run the server <#run-the-server>`__ +- `Send a client request <#send-a-client-request>`__ +- `Stop the server <#stop-the-server>`__ + +Installation Instructions +~~~~~~~~~~~~~~~~~~~~~~~~~ + +This is a self-contained example that relies solely on its own code. + +We recommend running the notebook in a virtual environment. You only +need a Jupyter server to start. For details, please refer to +`Installation +Guide `__. + +Prepare Docker +-------------- + +Install `Docker +Engine `__, including its +`post-installation `__ +steps, on your development system. To verify installation, test it, +using the following command. When it is ready, it will display a test +image and a message. + +.. code:: ipython3 + + !docker run hello-world + + +.. 
parsed-literal:: + + Unable to find image 'hello-world:latest' locally + latest: Pulling from library/hello-world + + Digest: sha256:305243c734571da2d100c8c8b3c3167a098cab6049c9a5b066b6021a60fcb966 + Status: Downloaded newer image for hello-world:latest + + Hello from Docker! + This message shows that your installation appears to be working correctly. + + To generate this message, Docker took the following steps: + 1. The Docker client contacted the Docker daemon. + 2. The Docker daemon pulled the "hello-world" image from the Docker Hub. + (amd64) + 3. The Docker daemon created a new container from that image which runs the + executable that produces the output you are currently reading. + 4. The Docker daemon streamed that output to the Docker client, which sent it + to your terminal. + + To try something more ambitious, you can run an Ubuntu container with: + $ docker run -it ubuntu bash + + Share images, automate workflows, and more with a free Docker ID: + https://hub.docker.com/ + + For more examples and ideas, visit: + https://docs.docker.com/get-started/ + + + +Prepare a model +~~~~~~~~~~~~~~~ + + + +LocalAI allows to use customized models. For more details you can read +the +`instruction `__ +where you can also find the detailed documentation. We will use one of +the OpenVINO optimized LLMs in the collection on the `collection on +🤗Hugging +Face `__. +In this example we will use +`TinyLlama-1.1B-Chat-v1.0-fp16-ov `__. +First of all we should create a model configuration file: + +.. code:: yaml + + name: TinyLlama-1.1B-Chat-v1.0-fp16-ov + backend: transformers + parameters: + model: OpenVINO/TinyLlama-1.1B-Chat-v1.0-fp16-ov + temperature: 0.2 + top_k: 40 + top_p: 0.95 + max_new_tokens: 32 + + type: OVModelForCausalLM + + template: + chat_message: | + <|im_start|>{{if eq .RoleName "assistant"}}assistant{{else if eq .RoleName "system"}}system{{else if eq .RoleName "user"}}user{{end}} + {{if .Content}}{{.Content}}{{end}}<|im_end|> + chat: | + {{.Input}} + <|im_start|>assistant + + completion: | + {{.Input}} + + stopwords: + - <|im_end|> + +The fields ``backend``, ``model``, ``type`` you can find in the code +example on the model page (we added the corresponding comments): + +.. code:: python + + from transformers import AutoTokenizer # backend + from optimum.intel.openvino import OVModelForCausalLM # type + + model_id = "OpenVINO/TinyLlama-1.1B-Chat-v1.0-fp16-ov" # parameters.model + tokenizer = AutoTokenizer.from_pretrained(model_id) + model = OVModelForCausalLM.from_pretrained(model_id) + +The name you can choose by yourself. By this name you will specify what +model to use on the client side. + +You can create a GitHub gist and modify fields: +`ov.yaml `__ + +Description of the parameters used in config YAML file can be found +`here `__. + +The most important: + +- ``name`` - model name, used to identify the model in API calls. +- ``backend`` - backend to use for computation (like llama-cpp, + diffusers, whisper, transformers). +- ``parameters.model`` - relative to the models path. +- ``temperature``, ``top_k``, ``top_p``, ``max_new_tokens`` - + parameters for the model. +- ``type`` - type of configuration, often related to the type of task + or model architecture. +- ``template`` - templates for various types of model interactions. +- ``stopwords`` - Words or phrases that halts processing. + +Run the server +~~~~~~~~~~~~~~ + + + +Everything is ready for launch. Use +``quay.io/go-skynet/local-ai:v2.23.0-ffmpeg`` image that contains all +required dependencies. 
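+The server can be pointed either at a configuration URL, as in the cell
+below, or at a local models directory mounted into the container. A
+minimal sketch of the second option, assuming ``ov.yaml`` is saved in a
+local ``models`` folder and that the image looks for models in its
+default ``/build/models`` path, is kept as a commented line:
+
+.. code:: ipython3
+
+    # Alternative launch (sketch, not executed here): mount a local folder that
+    # contains ov.yaml instead of passing a configuration URL. The /build/models
+    # path is an assumed default models location inside the LocalAI image.
+    # !docker run -d --rm --name="localai" -p 8080:8080 -v $PWD/models:/build/models quay.io/go-skynet/local-ai:v2.23.0-ffmpeg
+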
For more details read `Run with container +images `__. +If you want to see the output remove the ``-d`` flag and send a client +request from a separate notebook. + +.. code:: ipython3 + + !docker run -d --rm --name="localai" -p 8080:8080 quay.io/go-skynet/local-ai:master-sycl-f16-ffmpeg https://gist.githubusercontent.com/aleksandr-mokrov/f007c8fa6036760a856ddc60f605a0b0/raw/9d24ceeb487f9c058a943113bd0290e8ae565b3e/ov.yaml + + +.. parsed-literal:: + + 67e1a2a8123aa15794c027278aed2c258a04e06883663459bbeaca22ff014740 + docker: Error response from daemon: failed to create task for container: failed to create shim task: OCI runtime create failed: runc create failed: unable to start container process: error during container init: error running hook #1: error running hook: exit status 1, stdout: , stderr: Auto-detected mode as 'legacy' + nvidia-container-cli: requirement error: invalid expression: unknown. + + +Check whether the ``localai`` container is running normally: + +.. code:: ipython3 + + !docker ps | grep localai + +Send a client request +~~~~~~~~~~~~~~~~~~~~~ + + + +Now you can send HTTP requests using the model name +``TinyLlama-1.1B-Chat-v1.0-fp16-ov``. More details how to use `OpenAI +API `__. + +.. code:: ipython3 + + !curl http://localhost:8080/v1/completions -H "Content-Type: application/json" -d '{"model": "TinyLlama-1.1B-Chat-v1.0-fp16-ov", "prompt": "What is OpenVINO?"}' + + +.. parsed-literal:: + + curl: (7) Failed to connect to localhost port 8080: Connection refused + + +Stop the server +~~~~~~~~~~~~~~~ + + + +.. code:: ipython3 + + !docker stop localai + + +.. parsed-literal:: + + Error response from daemon: No such container: localai + diff --git a/docs/notebooks/magika-content-type-recognition-with-output.rst b/docs/notebooks/magika-content-type-recognition-with-output.rst index 383fdc6eebf499..f15167eae183b1 100644 --- a/docs/notebooks/magika-content-type-recognition-with-output.rst +++ b/docs/notebooks/magika-content-type-recognition-with-output.rst @@ -41,8 +41,8 @@ post `__ - `Define model loading class <#define-model-loading-class>`__ diff --git a/docs/notebooks/meter-reader-with-output.rst b/docs/notebooks/meter-reader-with-output.rst index 713c4d68edae6a..0ac9308155d4b7 100644 --- a/docs/notebooks/meter-reader-with-output.rst +++ b/docs/notebooks/meter-reader-with-output.rst @@ -135,7 +135,7 @@ DeepLabV3P pre-trained models from PaddlePaddle community. .. 
parsed-literal:: - model/meter_det_model.tar.gz: 0%| | 0.00/192M [00:00 + diff --git a/docs/notebooks/minicpm-v-multimodal-chatbot-with-output.rst b/docs/notebooks/minicpm-v-multimodal-chatbot-with-output.rst index 7f64dd936292c5..c130f9e0c08d67 100644 --- a/docs/notebooks/minicpm-v-multimodal-chatbot-with-output.rst +++ b/docs/notebooks/minicpm-v-multimodal-chatbot-with-output.rst @@ -205,7 +205,7 @@ documentation [68 lines of output] + ╰─> [92 lines of output] Ignoring numpy: markers 'python_version >= "3.9"' don't match your environment Collecting setuptools Using cached setuptools-75.3.0-py3-none-any.whl.metadata (6.9 kB) Collecting cython<3.0,>=0.25 Using cached Cython-0.29.37-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl.metadata (3.1 kB) Collecting cymem<2.1.0,>=2.0.2 - Using cached cymem-2.0.8-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (8.4 kB) + Using cached cymem-2.0.10.tar.gz (10 kB) + Installing build dependencies: started + Installing build dependencies: finished with status 'done' + Getting requirements to build wheel: started + Getting requirements to build wheel: finished with status 'done' + Preparing metadata (pyproject.toml): started + Preparing metadata (pyproject.toml): finished with status 'done' Collecting preshed<3.1.0,>=3.0.2 Using cached preshed-3.0.9-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (2.2 kB) Collecting murmurhash<1.1.0,>=0.28.0 - Using cached murmurhash-1.0.10-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (2.0 kB) + Using cached murmurhash-1.0.11.tar.gz (13 kB) + Installing build dependencies: started + Installing build dependencies: finished with status 'done' + Getting requirements to build wheel: started + Getting requirements to build wheel: finished with status 'done' + Preparing metadata (pyproject.toml): started + Preparing metadata (pyproject.toml): finished with status 'done' Collecting thinc<8.4.0,>=8.3.0 Using cached thinc-8.3.2.tar.gz (193 kB) Installing build dependencies: started @@ -139,16 +151,28 @@ Prerequisites × pip subprocess to install build dependencies did not run successfully. 
│ exit code: 1 - ╰─> [38 lines of output] + ╰─> [50 lines of output] Ignoring numpy: markers 'python_version >= "3.9"' don't match your environment Collecting setuptools Using cached setuptools-75.3.0-py3-none-any.whl.metadata (6.9 kB) Collecting cython<3.0,>=0.25 Using cached Cython-0.29.37-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl.metadata (3.1 kB) Collecting murmurhash<1.1.0,>=1.0.2 - Using cached murmurhash-1.0.10-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (2.0 kB) + Using cached murmurhash-1.0.11.tar.gz (13 kB) + Installing build dependencies: started + Installing build dependencies: finished with status 'done' + Getting requirements to build wheel: started + Getting requirements to build wheel: finished with status 'done' + Preparing metadata (pyproject.toml): started + Preparing metadata (pyproject.toml): finished with status 'done' Collecting cymem<2.1.0,>=2.0.2 - Using cached cymem-2.0.8-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (8.4 kB) + Using cached cymem-2.0.10.tar.gz (10 kB) + Installing build dependencies: started + Installing build dependencies: finished with status 'done' + Getting requirements to build wheel: started + Getting requirements to build wheel: finished with status 'done' + Preparing metadata (pyproject.toml): started + Preparing metadata (pyproject.toml): finished with status 'done' Collecting preshed<3.1.0,>=3.0.2 Using cached preshed-3.0.9-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (2.2 kB) Collecting blis<1.1.0,>=1.0.0 @@ -164,7 +188,7 @@ Prerequisites Using cached setuptools-75.3.0-py3-none-any.whl.metadata (6.9 kB) Collecting cython>=0.25 Using cached Cython-3.0.11-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.2 kB) - ERROR: Ignored the following versions that require a different python version: 1.25.0 Requires-Python >=3.9; 1.25.1 Requires-Python >=3.9; 1.25.2 Requires-Python >=3.9; 1.26.0 Requires-Python <3.13,>=3.9; 1.26.1 Requires-Python <3.13,>=3.9; 1.26.2 Requires-Python >=3.9; 1.26.3 Requires-Python >=3.9; 1.26.4 Requires-Python >=3.9; 2.0.0 Requires-Python >=3.9; 2.0.1 Requires-Python >=3.9; 2.0.2 Requires-Python >=3.9; 2.1.0 Requires-Python >=3.10; 2.1.0rc1 Requires-Python >=3.10; 2.1.1 Requires-Python >=3.10; 2.1.2 Requires-Python >=3.10; 2.1.3 Requires-Python >=3.10; 75.4.0 Requires-Python >=3.9; 75.5.0 Requires-Python >=3.9; 75.6.0 Requires-Python >=3.9 + ERROR: Ignored the following versions that require a different python version: 1.25.0 Requires-Python >=3.9; 1.25.1 Requires-Python >=3.9; 1.25.2 Requires-Python >=3.9; 1.26.0 Requires-Python <3.13,>=3.9; 1.26.1 Requires-Python <3.13,>=3.9; 1.26.2 Requires-Python >=3.9; 1.26.3 Requires-Python >=3.9; 1.26.4 Requires-Python >=3.9; 2.0.0 Requires-Python >=3.9; 2.0.1 Requires-Python >=3.9; 2.0.2 Requires-Python >=3.9; 2.1.0 Requires-Python >=3.10; 2.1.0rc1 Requires-Python >=3.10; 2.1.1 Requires-Python >=3.10; 2.1.2 Requires-Python >=3.10; 2.1.3 Requires-Python >=3.10; 2.2.0 Requires-Python >=3.10; 2.2.0rc1 Requires-Python >=3.10; 75.4.0 Requires-Python >=3.9; 75.5.0 Requires-Python >=3.9; 75.6.0 Requires-Python >=3.9 ERROR: Could not find a version that satisfies the requirement numpy<3.0.0,>=2.0.0 (from versions: 1.3.0, 1.4.1, 1.5.0, 1.5.1, 1.6.0, 1.6.1, 1.6.2, 1.7.0, 1.7.1, 1.7.2, 1.8.0, 1.8.1, 1.8.2, 1.9.0, 1.9.1, 1.9.2, 1.9.3, 1.10.0.post2, 1.10.1, 1.10.2, 1.10.4, 1.11.0, 1.11.1, 1.11.2, 
1.11.3, 1.12.0, 1.12.1, 1.13.0, 1.13.1, 1.13.3, 1.14.0, 1.14.1, 1.14.2, 1.14.3, 1.14.4, 1.14.5, 1.14.6, 1.15.0, 1.15.1, 1.15.2, 1.15.3, 1.15.4, 1.16.0, 1.16.1, 1.16.2, 1.16.3, 1.16.4, 1.16.5, 1.16.6, 1.17.0, 1.17.1, 1.17.2, 1.17.3, 1.17.4, 1.17.5, 1.18.0, 1.18.1, 1.18.2, 1.18.3, 1.18.4, 1.18.5, 1.19.0, 1.19.1, 1.19.2, 1.19.3, 1.19.4, 1.19.5, 1.20.0, 1.20.1, 1.20.2, 1.20.3, 1.21.0, 1.21.1, 1.21.2, 1.21.3, 1.21.4, 1.21.5, 1.21.6, 1.22.0, 1.22.1, 1.22.2, 1.22.3, 1.22.4, 1.23.0, 1.23.1, 1.23.2, 1.23.3, 1.23.4, 1.23.5, 1.24.0, 1.24.1, 1.24.2, 1.24.3, 1.24.4) ERROR: No matching distribution found for numpy<3.0.0,>=2.0.0 @@ -499,25 +523,25 @@ Prepare image gallery .. parsed-literal:: - data/red_panda.png: 0%| | 0.00/50.6k [00:00 1 or self.sliding_window is not None: - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/transformers/modeling_attn_mask_utils.py:164: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/transformers/modeling_attn_mask_utils.py:164: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! if past_key_values_length > 0: - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/transformers/models/marian/modeling_marian.py:166: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/transformers/models/marian/modeling_marian.py:166: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! if ( - Exporting tokenizers to OpenVINO is not supported for tokenizers version > 0.19 and openvino version <= 2024.4. Please downgrade to tokenizers version <= 0.19 to export tokenizers to OpenVINO. + model.safetensors: 0%| | 0.00/312M [00:00 0.19 and openvino version <= 2024.4. Please downgrade to tokenizers version <= 0.19 to export tokenizers to OpenVINO. + model.safetensors: 100%|█████████████████████| 312M/312M [00:04<00:00, 71.1MB/s] .. code:: ipython3 @@ -888,10 +919,10 @@ support searching in Chinese. .. parsed-literal:: - 2024-11-22 01:36:43.187797: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`. 
- 2024-11-22 01:36:43.213112: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations. + 2024-12-10 02:26:01.092495: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`. + 2024-12-10 02:26:01.118195: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations. To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags. - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/transformers/models/marian/tokenization_marian.py:175: UserWarning: Recommended: pip install sacremoses. + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/transformers/models/marian/tokenization_marian.py:175: UserWarning: Recommended: pip install sacremoses. warnings.warn("Recommended: pip install sacremoses.") @@ -1123,13 +1154,13 @@ models can require different optimal threshold for search. .. parsed-literal:: - data/car-detection.mp4: 0%| | 0.00/2.68M [00:00`__ is a +“Model-as-a-Service” (MaaS) platform that seeks to bring together most +advanced machine learning models from the AI community, and to +streamline the process of leveraging AI models in real applications. +Hundreds of models are made publicly available on ModelScope (700+ and +counting), covering the latest development in areas such as NLP, CV, +Audio, Multi-modality, and AI for Science, etc. Many of these models +represent the SOTA in their specific fields, and made their open-sourced +debut on ModelScope. + +This tutorial covers how to use the modelscope ecosystem within +OpenVINO. + +Installation Instructions +~~~~~~~~~~~~~~~~~~~~~~~~~ + +This is a self-contained example that relies solely on its own code. + +We recommend running the notebook in a virtual environment. You only +need a Jupyter server to start. For details, please refer to +`Installation +Guide `__. + + +**Table of contents:** + + +- `Prerequisites <#prerequisites>`__ +- `Convert models from ModelScope using OpenVINO Model Conversion + API <#convert-models-from-modelscope-using-openvino-model-conversion-api>`__ + + - `Select inference device for image + classification <#select-inference-device-for-image-classification>`__ + - `Run Image classification <#run-image-classification>`__ + +- `Convert ModelScope models using Optimum + Intel <#convert-modelscope-models-using-optimum-intel>`__ + + - `Select inference device for text + classification <#select-inference-device-for-text-classification>`__ + - `Perform text classification <#perform-text-classification>`__ + +- `Convert ModelScope models for usage with OpenVINO + GenAI <#convert-modelscope-models-for-usage-with-openvino-genai>`__ + + - `Select inference device for text + generation <#select-inference-device-for-text-generation>`__ + - `Run OpenVINO GenAI pipeline <#run-openvino-genai-pipeline>`__ + +Prerequisites +------------- + + + +.. 
code:: ipython3 + + import platform + + %pip install -q "torch>=2.1.1" "torchvision" --extra-index-url https://download.pytorch.org/whl/cpu + %pip install -q modelscope addict oss2 simplejson sortedcontainers pillow opencv-python "datasets<=3.0.0" + %pip install -q "transformers>=4.45" "git+https://github.com/huggingface/optimum-intel.git" --extra-index-url https://download.pytorch.org/whl/cpu + %pip install -qU "openvino>=2024.5.0" "openvino-tokenizers>=2024.5.0" "openvino-genai>=2024.5.0" "nncf>=2.14.0" + + if platform.system() == "Darwin": + %pip install -q "numpy<2.0.0" + +.. code:: ipython3 + + import requests + from pathlib import Path + + if not Path("notebook_utils.py").exists(): + r = requests.get( + url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/notebook_utils.py", + ) + open("notebook_utils.py", "w").write(r.text) + +Convert models from ModelScope using OpenVINO Model Conversion API +------------------------------------------------------------------ + + + +Modelscope package provides API for initializing a model and loading a +set of pre-trained weights using the model text handle. Discovering a +desired model name is straightforward with `Modelscope models web +page `__, one can choose a model +solving a particular machine learning problem and even sort the models +by popularity and novelty. + +OpenVINO supports various types of models and frameworks via conversion +to OpenVINO Intermediate Representation (IR). `OpenVINO model conversion +API `__ +should be used for these purposes. ``ov.convert_model`` function accepts +original model instance and example input for tracing and returns +``ov.Model`` representing this model in OpenVINO framework. Converted +model can be used for saving on disk using ``ov.save_model`` function or +directly loading on device using ``core.complie_model``. + +As example, we will use +`tinynas `__ +image classification model. The code bellow demonstrates how to load +this model using Modelscope pipelines interface, convert it to OpenVINO +IR and then perform image classification on specified device. + +.. code:: ipython3 + + from pathlib import Path + + from modelscope.pipelines import pipeline + from modelscope.utils.constant import Tasks + import openvino as ov + import torch + import gc + + + cls_model_id = "iic/cv_tinynas_classification" + cls_model_path = Path(cls_model_id.split("/")[-1]) / "openvino_model.xml" + + if not cls_model_path.exists(): + # load Modelcope pipeline with model + image_classification = pipeline(Tasks.image_classification, model=cls_model_id) + # convert model to OpenVINO + ov_model = ov.convert_model(image_classification.model, example_input=torch.zeros((1, 3, 224, 224)), input=[1, 3, 224, 224]) + # save OpenVINO model on disk for next usage + ov.save_model(ov_model, cls_model_path) + del ov_model + del image_classification + gc.collect(); + + +.. parsed-literal:: + + 2024-11-12 19:08:10.199148: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`. 
+ 2024-11-12 19:08:10.212253: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered + WARNING: All log messages before absl::InitializeLog() is called are written to STDERR + E0000 00:00:1731424090.226654 1605757 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered + E0000 00:00:1731424090.230976 1605757 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered + 2024-11-12 19:08:10.246563: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations. + To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags. + + +Select inference device for image classification +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + + +.. code:: ipython3 + + from notebook_utils import device_widget + + cv_cls_device = device_widget("CPU") + + cv_cls_device + + + + +.. parsed-literal:: + + Dropdown(description='Device:', options=('CPU', 'AUTO'), value='CPU') + + + +Run Image classification +~~~~~~~~~~~~~~~~~~~~~~~~ + + + +Model inference interface remains compatible with pipeline preprocessing +and postprocessing, so you can reuse these part of pipeline, but for +providing standalone experience, we will demonstrate how to use model +without pipeline. The code bellow defines utilities for image +preprocessing and postprocessing. + +.. code:: ipython3 + + from notebook_utils import download_file + from PIL import Image + from torchvision import transforms + + # prepare input data and output lables + img_url = "https://pailitao-image-recog.oss-cn-zhangjiakou.aliyuncs.com/mufan/img_data/maas_test_data/dog.png" + img_path = Path("dog.png") + + labels_url = "https://raw.githubusercontent.com/openvinotoolkit/open_model_zoo/master/data/dataset_classes/imagenet_2012.txt" + + labels_path = Path("imagenet_2012.txt") + + if not img_path.exists(): + download_file(img_url) + + if not labels_path.exists(): + download_file(labels_url) + + image = Image.open(img_path) + imagenet_classes = labels_path.open("r").read().splitlines() + + + # prepare image preprocessing + transforms_normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) + transform_list = [ + transforms.Resize(256, interpolation=transforms.InterpolationMode.BICUBIC), + transforms.CenterCrop(224), + transforms.ToTensor(), + transforms_normalize, + ] + transformer = transforms.Compose(transform_list) + + # compile model + core = ov.Core() + + ov_model = core.compile_model(cls_model_path, cv_cls_device.value) + +Now, when we make all necessary preparations, we can run model +inference. + +.. code:: ipython3 + + import numpy as np + + # preprocess input + image_tensor = transformer(image) + + # run model inference + result = ov_model(image_tensor.unsqueeze(0))[0] + + # postprocess results + label_id = np.argmax(result[0]) + score = result[0][label_id] + + label = imagenet_classes[label_id] + + # visualize results + display(image) + print(f"Predicted label: {label}, score {score}") + + + +.. image:: modelscope-to-openvino-with-output_files/modelscope-to-openvino-with-output_12_0.png + + +.. 
parsed-literal:: + + Predicted label: n02099601 golden retriever, score 8.060977935791016 + + +Convert ModelScope models using Optimum Intel +--------------------------------------------- + + + +For models compatible with the `HuggingFace +Transformers `__ +library, we can use `Optimum +Intel `__ integration +to convert and run model. Optimum Intel is the interface between the +Transformers and Diffusers libraries and the different tools and +libraries provided by Intel to accelerate end-to-end pipelines on Intel +architectures. + +Optimum Intel provides a simple interface for optimizing your +Transformers and Diffusers models, converting them to the OpenVINO +Intermediate Representation (IR) format, and running inference using +OpenVINO Runtime, among other use cases. For running ModelScope models +using this interface we should download model from hub first. There are +several ways how to download models from Modelscope Hub, one of them is +usage of ``modelscope.snapshot_download`` function. This function +accepts model id from hub and optionally local directory (if not +provided, model will be downloaded to cache directory). + +After that, we can load model to Optimum Intel interface replacing the +``AutoModelForXxx`` class from transformers with the corresponding +``OVModelForXxx``. Model conversion will be performed on the fly. For +avoiding next time conversion, we can save model on disk using +``save_pretrained`` method and in the next time pass directory with +already converted model as argument in ``from_pretrained`` method. We +also specified ``device`` parameter for compiling the model on the +specific device, if not provided, the default device will be used. The +device can be changed later in runtime using ``model.to(device)``, +please note that it may require some time for model compilation on a +newly selected device. In some cases, it can be useful to separate model +initialization and compilation, for example, if you want to reshape the +model using ``reshape`` method, you can postpone compilation, providing +the parameter ``compile=False`` into ``from_pretrained`` method, +compilation can be performed manually using ``compile`` method or will +be performed automatically during first inference run. + +As example, we will use +`nlp_bert_sentiment-analysis_english-base `__. +This model was trained for classification input text on 3 sentiment +categories: negative, positive and neutral. In transformers, +``AutoModelForSequenceClassification`` should be used for model +initialization, so for usage model with OpenVINO, it is enough just +replace ``AutoModelForSequenceClassification`` to +``OVModelForSequenceClassification``. + +.. code:: ipython3 + + from modelscope import snapshot_download + + text_model_id = "iic/nlp_bert_sentiment-analysis_english-base" + text_model_path = Path(text_model_id.split("/")[-1]) + ov_text_model_path = text_model_path / "ov" + + + if not text_model_path.exists(): + snapshot_download(text_model_id, local_dir=text_model_path) + +Select inference device for text classification +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + + +.. code:: ipython3 + + from notebook_utils import device_widget + + text_cls_device = device_widget("CPU", "NPU") + + text_cls_device + + + + +.. parsed-literal:: + + Dropdown(description='Device:', options=('CPU', 'AUTO'), value='CPU') + + + +Perform text classification +~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + + +.. 
code:: ipython3 + + from transformers import AutoTokenizer + from optimum.intel.openvino import OVModelForSequenceClassification + + + tokenizer = AutoTokenizer.from_pretrained(text_model_path) + + if not ov_text_model_path.exists(): + # model will be automatically exported to OpenVINO format during loading + ov_model = OVModelForSequenceClassification.from_pretrained(text_model_path, text_cls_device.value) + ov_model.save_pretrained(ov_text_model_path) + # save converted model using save_pretrained for avoid conversion in next time + tokenizer.save_pretrained(ov_text_model_path) + else: + # load converted model directly if availa ble + ov_model = OVModelForSequenceClassification.from_pretrained(ov_text_model_path, device=text_cls_device.value) + + # prepare input + input_text = "Good night." + input_data = tokenizer(input_text, return_tensors="pt") + + # run model inference + output = ov_model(**input_data) + # postprocess results + predicted_label_id = output.logits[0].argmax().item() + + predicted_label = ov_model.config.id2label[predicted_label_id] + + print(f"predicted label: {predicted_label}") + + +.. parsed-literal:: + + predicted label: Positive + + +Convert ModelScope models for usage with OpenVINO GenAI +------------------------------------------------------- + + + +OpenVINO™ GenAI is a library of the most popular Generative AI model +pipelines, optimized execution methods, and samples that run on top of +highly performant `OpenVINO +Runtime `__. + +This library is friendly to PC and laptop execution, and optimized for +resource consumption. It requires no external dependencies to run +generative models as it already includes all the core functionality +(e.g. tokenization via openvino-tokenizers). + +You can also load and run models from ModelScope with OpenVINO GenAI +`supported +pipelines `__. + +This inference approach is also based on model representation obtained +using Optimum Intel and also requires to download ModelScope model +first. As example we will be +`qwen2.5-1.5b-instruct `__ +model for text generation, that is part of powerful Qwen2 LLMs family. +If in previous chapter we are focused with usage python API for +downloading and converting models, in this one - we are also considering +CLI usage for the same actions. + +Downloading ModelScope models using CLI can be performed using following +command: + +.. code:: bash + + modelscope download --local_dir + +where ```` is model id from Hub and ```` is +output directory for model saving. + +``optimum-cli`` provides command line interface for exporting models +using Optimum. General OpenVINO export command format: + +.. code:: bash + + optimum-cli export openvino --model --task + +where task is task to export the model for. Available tasks depend on +the model, but are among: [‘default’, ‘fill-mask’, ‘text-generation’, +‘text2text-generation’, ‘text-classification’, ‘token-classification’, +‘multiple-choice’, ‘object-detection’, ‘question-answering’, +‘image-classification’, ‘image-segmentation’, ‘masked-im’, +‘semantic-segmentation’, ‘automatic-speech-recognition’, +‘audio-classification’, ‘audio-frame-classification’, +‘automatic-speech-recognition’, ‘audio-xvector’, ‘image-to-text’, +‘stable-diffusion’, ‘zero-shot-object-detection’]. + +You can find a mapping between tasks and model classes in Optimum +TaskManager +`documentation `__. + +Additionally, you can specify weights compression using +``--weight-format`` argument with one of following options: ``fp32``, +``fp16``, ``int8`` and ``int4``. 
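+
+The same compression settings are also exposed through the Optimum Intel
+Python API. The cell below is only an illustrative sketch (this tutorial
+itself relies on the CLI); the 4-bit configuration mirrors the
+``--weight-format int4`` option, and the ``group_size`` and ``ratio``
+values are example settings rather than tuned ones:
+
+.. code:: ipython3
+
+    # Illustrative sketch: export a causal LM with 4-bit weight compression
+    # using the Optimum Intel Python API instead of optimum-cli.
+    from optimum.intel import OVModelForCausalLM, OVWeightQuantizationConfig
+
+    # Example 4-bit settings (assumed, not tuned for this model)
+    q_config = OVWeightQuantizationConfig(bits=4, group_size=128, ratio=0.8)
+
+    ov_model = OVModelForCausalLM.from_pretrained(
+        "Qwen/Qwen2.5-1.5B-Instruct", export=True, quantization_config=q_config
+    )
+    ov_model.save_pretrained("Qwen2.5-1.5B-Instruct-int4-ov")
+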
Fro int8 and int4 nncf will be used for +weight compression. For models that required remote code execution, +``--trust-remote-code`` flag should be provided. + +Full list of supported arguments available via ``--help`` + +.. code:: ipython3 + + from IPython.display import Markdown, display + + model_id = "Qwen/Qwen2.5-1.5B-Instruct" + + llm_path = Path("Qwen2.5-1.5B-Instruct") + ov_llm_path = llm_path / "ov" + download_command = f"modelscope download {model_id} --local_dir {llm_path}" + display(Markdown("**Download command:**")) + display(Markdown(f"`{download_command}`")) + + if not llm_path.exists(): + !{download_command} + + + +**Download command:** + + + +``modelscope download Qwen/Qwen2.5-1.5B-Instruct --local_dir Qwen2.5-1.5B-Instruct`` + + +.. code:: ipython3 + + export_command = f"optimum-cli export openvino -m {llm_path} --task text-generation-with-past --weight-format int4 {ov_llm_path}" + display(Markdown("**Export command:**")) + display(Markdown(f"`{export_command}`")) + + if not ov_llm_path.exists(): + !{export_command} + + + +**Export command:** + + + +``optimum-cli export openvino -m Qwen2.5-1.5B-Instruct --task text-generation-with-past --weight-format int4 Qwen2.5-1.5B-Instruct/ov`` + + +Select inference device for text generation +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + + +.. code:: ipython3 + + from notebook_utils import device_widget + + llm_device = device_widget("CPU") + + llm_device + + + + +.. parsed-literal:: + + Dropdown(description='Device:', options=('CPU', 'AUTO'), value='CPU') + + + +Run OpenVINO GenAI pipeline +~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + + +For running text generation using OpenVINO GenAI, we should use +``LLMPipeline`` class initialized with providing converted model +directory and inference device. You can find more detailed example how +to use OpenVINO GenAI ``LLMPipeline`` for chatbot scenario in this +`tutorial `__. + +.. code:: ipython3 + + import openvino_genai as ov_genai + + + def streamer(subword): + print(subword, end="", flush=True) + # Return flag corresponds whether generation should be stopped. + # False means continue generation. + return False + + + llm_pipe = ov_genai.LLMPipeline(ov_llm_path, llm_device.value) + + llm_pipe.generate("The Sun is yellow because", max_new_tokens=200, streamer=streamer) + + +.. parsed-literal:: + + it has a spectrum of colors, and you are also looking at it. What color would the sun be if you could see its light without being able to see any other objects? If we imagine that someone had never seen or heard about the sun before, what would they expect to see? + + 1. **Color of the Sun**: The sun appears yellow when viewed from Earth due to the way our atmosphere scatters sunlight. This phenomenon occurs as follows: + + - **Sunlight Scattering**: When sunlight passes through the Earth's atmosphere, different wavelengths (colors) of light travel at slightly different speeds due to their varying energies. + - **Air Mass Height**: At higher altitudes where air density decreases with altitude, shorter wavelength (blue) photons have more energy and thus escape faster into space compared to longer wavelength (red) photons which remain in the atmosphere longer. + - **Sky Color**: As a result, blue light is scattered more than red light by molecules in the upper layers of the atmosphere + + + +.. parsed-literal:: + + " it has a spectrum of colors, and you are also looking at it. What color would the sun be if you could see its light without being able to see any other objects? 
If we imagine that someone had never seen or heard about the sun before, what would they expect to see?\n\n1. **Color of the Sun**: The sun appears yellow when viewed from Earth due to the way our atmosphere scatters sunlight. This phenomenon occurs as follows:\n\n - **Sunlight Scattering**: When sunlight passes through the Earth's atmosphere, different wavelengths (colors) of light travel at slightly different speeds due to their varying energies.\n - **Air Mass Height**: At higher altitudes where air density decreases with altitude, shorter wavelength (blue) photons have more energy and thus escape faster into space compared to longer wavelength (red) photons which remain in the atmosphere longer.\n - **Sky Color**: As a result, blue light is scattered more than red light by molecules in the upper layers of the atmosphere" + + + +.. code:: ipython3 + + import gc + + del llm_pipe + gc.collect(); diff --git a/docs/notebooks/modelscope-to-openvino-with-output_files/modelscope-to-openvino-with-output_12_0.jpg b/docs/notebooks/modelscope-to-openvino-with-output_files/modelscope-to-openvino-with-output_12_0.jpg new file mode 100644 index 00000000000000..97ae56df8a8721 --- /dev/null +++ b/docs/notebooks/modelscope-to-openvino-with-output_files/modelscope-to-openvino-with-output_12_0.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1745fd9f64ac9914621f7eee3668e86daa8121bc83d1a2c7f27963c85026f104 +size 66633 diff --git a/docs/notebooks/modelscope-to-openvino-with-output_files/modelscope-to-openvino-with-output_12_0.png b/docs/notebooks/modelscope-to-openvino-with-output_files/modelscope-to-openvino-with-output_12_0.png new file mode 100644 index 00000000000000..d1c0d309736c1a --- /dev/null +++ b/docs/notebooks/modelscope-to-openvino-with-output_files/modelscope-to-openvino-with-output_12_0.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6235ab7dd2cb4318435320004320ffc6de773044c51cadcd581a7996faca313a +size 636558 diff --git a/docs/notebooks/music-generation-with-output.rst b/docs/notebooks/music-generation-with-output.rst index a5bdcbd8049318..2d63515872694f 100644 --- a/docs/notebooks/music-generation-with-output.rst +++ b/docs/notebooks/music-generation-with-output.rst @@ -124,8 +124,8 @@ Imports .. parsed-literal:: - 2024-11-22 01:43:50.913766: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`. - 2024-11-22 01:43:50.938403: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations. + 2024-12-10 02:28:39.145741: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`. + 2024-12-10 02:28:39.170431: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations. To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags. @@ -165,7 +165,7 @@ generate a text-conditioned music sample. .. 
parsed-literal:: - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/transformers/models/encodec/modeling_encodec.py:124: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor). + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/transformers/models/encodec/modeling_encodec.py:124: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor). self.register_buffer("padding_total", torch.tensor(kernel_size - stride, dtype=torch.int64), persistent=False) Config of the text_encoder: is overwritten by shared text_encoder config: T5Config { "_name_or_path": "t5-base", @@ -346,7 +346,7 @@ vocabulary. It helps the model understand the context of a sentence. @@ -431,7 +431,7 @@ runtime .. parsed-literal:: [ WARNING ] Please fix your imports. Module %s has been moved to %s. The old module will be deleted in version %s. - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/transformers/modeling_utils.py:5006: FutureWarning: `_is_quantized_training_enabled` is going to be deprecated in transformers 4.39.0. Please use `model.hf_quantizer.is_trainable` instead + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/transformers/modeling_utils.py:5006: FutureWarning: `_is_quantized_training_enabled` is going to be deprecated in transformers 4.39.0. Please use `model.hf_quantizer.is_trainable` instead warnings.warn( `loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`. @@ -775,7 +775,7 @@ We can now infer the pipeline backed by OpenVINO models. diff --git a/docs/notebooks/nano-llava-multimodal-chatbot-with-output.rst b/docs/notebooks/nano-llava-multimodal-chatbot-with-output.rst index 0bac7af3f39c32..9cefe7216f2076 100644 --- a/docs/notebooks/nano-llava-multimodal-chatbot-with-output.rst +++ b/docs/notebooks/nano-llava-multimodal-chatbot-with-output.rst @@ -204,8 +204,8 @@ documentation 1 or self.sliding_window is not None) and self.is_causal: - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/optimum/exporters/onnx/model_patcher.py:306: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/optimum/exporters/onnx/model_patcher.py:306: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! 
if past_key_values_length > 0: /opt/home/k8sworker/.cache/huggingface/modules/transformers_modules/qnguyen3/nanoLLaVA/13d60cec183a86755afed64da495fcc2c382ea80/modeling_llava_qwen2.py:939: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! if seq_len > self.max_seq_len_cached: - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/transformers/cache_utils.py:443: TracerWarning: Using len to get tensor shape might cause the trace to be incorrect. Recommended usage would be tensor.shape[0]. Passing a tensor of different shape might lead to errors or silently give incorrect results. + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/transformers/cache_utils.py:443: TracerWarning: Using len to get tensor shape might cause the trace to be incorrect. Recommended usage would be tensor.shape[0]. Passing a tensor of different shape might lead to errors or silently give incorrect results. elif len(self.key_cache[layer_idx]) == 0: # fills previously skipped layers; checking for tensor causes errors /opt/home/k8sworker/.cache/huggingface/modules/transformers_modules/qnguyen3/nanoLLaVA/13d60cec183a86755afed64da495fcc2c382ea80/modeling_llava_qwen2.py:1499: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! if attention_mask.size() != (bsz, 1, q_len, kv_seq_len): @@ -530,10 +530,10 @@ image encoder model. .. parsed-literal:: - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/nncf/quantization/quantize_model.py:432: FutureWarning: `CompressWeightsMode.INT8` is deprecated. Please, use `CompressWeightsMode.INT8_ASYM` as value instead. + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/nncf/quantization/quantize_model.py:432: FutureWarning: `CompressWeightsMode.INT8` is deprecated. Please, use `CompressWeightsMode.INT8_ASYM` as value instead. warning_deprecated( - 2024-11-22 01:48:49.764790: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`. - 2024-11-22 01:48:49.789684: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations. + 2024-12-10 02:33:42.983675: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`. 
+ 2024-12-10 02:33:43.008813: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations. To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags. @@ -742,8 +742,7 @@ can use the same tokenizer and image processor that provided with model. Question: Describe this image in detail Answer: - This image features a cute, white lama, possibly a llama, which is depicted in a playful pose. The llama is surrounded by a fire, indicating it's being set on a burner. The flame appears to be a bright, bright yellow, and there are several tiny flames, possibly from the llama's actions. - The llama itself is quite detailed. It has a small brown nose and dark eyes that are expressive. The face of the llama is quite detailed as well, with a pair of ears that are also light brown. The llama's mouth is open, revealing its pink lips. There are also small pink spots on its face, + The image features a white, fluffy lamb with a big, bright smile, standing next to a fire. The lamb's face is detailed, with black eyes that are slightly squinty, and a mouth that's slightly open. It seems to be enjoying the heat from the fire, as it is seen looking down. The lamb's legs are also visible, and they appear to be furry. The lamb's tail is long and appears to be fluffy as well. The lamb's ears are also visible and are pink. The lamb's face is depicted in detail, with small black eyes and black nostrils. The lamb's nose is also Interactive demo diff --git a/docs/notebooks/notebooks_with_binder_buttons.txt b/docs/notebooks/notebooks_with_binder_buttons.txt index ce9cb50da47907..58f31aaae508c8 100644 --- a/docs/notebooks/notebooks_with_binder_buttons.txt +++ b/docs/notebooks/notebooks_with_binder_buttons.txt @@ -7,7 +7,6 @@ convert-to-openvino cross-lingual-books-alignment depth-anything detectron2-to-openvino -distilbert-sequence-classification fast-segment-anything handwritten-ocr hello-detection diff --git a/docs/notebooks/notebooks_with_colab_buttons.txt b/docs/notebooks/notebooks_with_colab_buttons.txt index 59b3348a4c90f7..2361fbe9a19c69 100644 --- a/docs/notebooks/notebooks_with_colab_buttons.txt +++ b/docs/notebooks/notebooks_with_colab_buttons.txt @@ -1,5 +1,4 @@ 3D-segmentation-point-clouds -amused-lightweight-text-to-image async-api auto-device clip-language-saliency-map @@ -8,7 +7,6 @@ cross-lingual-books-alignment depth-anything depth-anything-v2 detectron2-to-openvino -distilbert-sequence-classification explainable-ai-1-basic explainable-ai-2-deep-dive explainable-ai-3-map-interpretation diff --git a/docs/notebooks/object-detection-with-output.rst b/docs/notebooks/object-detection-with-output.rst index 5debc4e7ed88d4..fc055f6e7ae63e 100644 --- a/docs/notebooks/object-detection-with-output.rst +++ b/docs/notebooks/object-detection-with-output.rst @@ -84,7 +84,7 @@ Install requirements .. parsed-literal:: - 24717 + 24624 @@ -136,21 +136,21 @@ Download and convert the Model .. parsed-literal:: - 100%|██████████| 6.25M/6.25M [00:00<00:00, 26.9MB/s] + 100%|██████████| 6.25M/6.25M [00:00<00:00, 26.8MB/s] .. 
parsed-literal:: - Ultralytics 8.3.0 🚀 Python-3.8.10 torch-2.2.2+cpu CPU (Intel Core(TM) i9-10920X 3.50GHz) + Ultralytics 8.3.0 🚀 Python-3.8.10 torch-2.4.1+cpu CPU (Intel Core(TM) i9-10920X 3.50GHz) YOLOv8n summary (fused): 168 layers, 3,151,904 parameters, 0 gradients, 8.7 GFLOPs PyTorch: starting from 'yolov8n.pt' with input shape (1, 3, 640, 640) BCHW and output shape(s) (1, 84, 8400) (6.2 MB) OpenVINO: starting export with openvino 2024.4.0-16579-c3152d32c9c-releases/2024/4... - OpenVINO: export success ✅ 1.4s, saved as 'yolov8n_openvino_model/' (6.4 MB) + OpenVINO: export success ✅ 1.3s, saved as 'yolov8n_openvino_model/' (6.4 MB) - Export complete (1.6s) - Results saved to /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/notebooks/object-detection-webcam + Export complete (1.5s) + Results saved to /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/notebooks/object-detection-webcam Predict: yolo predict task=detect model=yolov8n_openvino_model imgsz=640 half Validate: yolo val task=detect model=yolov8n_openvino_model imgsz=640 data=coco.yaml half Visualize: https://netron.app @@ -222,7 +222,7 @@ best performance. For that purpose, just use ``AUTO``. .. parsed-literal:: - Ultralytics 8.3.0 🚀 Python-3.8.10 torch-2.2.2+cpu CPU (Intel Core(TM) i9-10920X 3.50GHz) + Ultralytics 8.3.0 🚀 Python-3.8.10 torch-2.4.1+cpu CPU (Intel Core(TM) i9-10920X 3.50GHz) Loading yolov8n_openvino_model for OpenVINO inference... Using OpenVINO LATENCY mode for batch=1 inference... diff --git a/docs/notebooks/omniparser-with-output.rst b/docs/notebooks/omniparser-with-output.rst index 28676a03a84ba7..e22ce49105f78d 100644 --- a/docs/notebooks/omniparser-with-output.rst +++ b/docs/notebooks/omniparser-with-output.rst @@ -20,7 +20,6 @@ repo `__ and `model card `__. In this tutorial we consider how to run OmniParser using OpenVINO. - **Table of contents:** - `Prerequisites <#prerequisites>`__ @@ -72,9 +71,14 @@ Prerequisites .. code:: ipython3 - %pip install -q "torch>=2.1" easyocr torchvision accelerate "supervision==0.18.0" accelerate timm "einops==0.8.0" "ultralytics==8.1.24" pillow opencv-python "gradio>=4.19" --extra-index-url https://download.pytorch.org/whl/cpu + import platform + + %pip install -q "torch>=2.1" easyocr torchvision accelerate "supervision==0.18.0" "transformers>=4.45" timm "einops==0.8.0" "ultralytics==8.1.24" pillow opencv-python "gradio>=4.19" --extra-index-url https://download.pytorch.org/whl/cpu %pip install -q "openvino>=2024.4.0" + if platform.system() == "Darwin": + %pip install -q "numpy<2.0" + .. 
parsed-literal:: @@ -89,16 +93,21 @@ Prerequisites notebook_utils_path = Path("notebook_utils.py") florence_helper_path = Path("ov_florence2_helper.py") + omniparser_helper_path = Path("ov_omniparser_helper.py") if not notebook_utils_path.exists(): r = requests.get( url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/notebook_utils.py", ) - notebook_utils_path.open("w").write(r.text) + notebook_utils_path.open("w", encoding="utf-8").write(r.text) if not florence_helper_path.exists(): r = requests.get(url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/notebooks/florence2/ov_florence2_helper.py") - florence_helper_path.open("w").write(r.text) + florence_helper_path.open("w", encoding="utf-8").write(r.text) + + if not omniparser_helper_path.exists(): + r = requests.get(url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/notebooks/omniparser/ov_omniparser_helper.py") + omniparser_helper_path.open("w", encoding="utf-8").write(r.text) Prepare models -------------- @@ -155,21 +164,21 @@ API. You can find more examples of this API usage in these .. parsed-literal:: - 2024-11-22 01:51:07.385705: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`. - 2024-11-22 01:51:07.410345: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations. + 2024-12-10 02:35:42.631431: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`. + 2024-12-10 02:35:42.657651: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations. To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags. .. parsed-literal:: - weights/icon_detect/best.pt: 0%| | 0.00/11.7M [00:00=1.4.0, which is not installed. - mobileclip 0.1.0 requires torchvision==0.14.1, but you have torchvision 0.17.2+cpu which is incompatible. tensorflow 2.12.0 requires keras<2.13,>=2.12.0, but you have keras 2.13.1 which is incompatible. tensorflow 2.12.0 requires numpy<1.24,>=1.22, but you have numpy 1.24.4 which is incompatible. tensorflow 2.12.0 requires tensorboard<2.13,>=2.12, but you have tensorboard 2.13.0 which is incompatible. tensorflow 2.12.0 requires tensorflow-estimator<2.13,>=2.12.0, but you have tensorflow-estimator 2.13.0 which is incompatible. tensorflow-cpu 2.13.1 requires numpy<=1.24.3,>=1.22, but you have numpy 1.24.4 which is incompatible. tensorflow-cpu 2.13.1 requires typing-extensions<4.6.0,>=3.6.6, but you have typing-extensions 4.12.2 which is incompatible. - torchvision 0.17.2+cpu requires torch==2.2.2, but you have torch 2.4.1 which is incompatible. Note: you may need to restart the kernel to use updated packages. @@ -250,9 +247,9 @@ True .. 
parsed-literal:: - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/torch/nn/utils/weight_norm.py:134: FutureWarning: `torch.nn.utils.weight_norm` is deprecated in favor of `torch.nn.utils.parametrizations.weight_norm`. + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/torch/nn/utils/weight_norm.py:134: FutureWarning: `torch.nn.utils.weight_norm` is deprecated in favor of `torch.nn.utils.parametrizations.weight_norm`. WeightNorm.apply(module, name, dim) - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/notebooks/openvoice/OpenVoice/openvoice/api.py:36: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature. + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/notebooks/openvoice/OpenVoice/openvoice/api.py:36: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature. checkpoint_dict = torch.load(ckpt_path, map_location=torch.device(self.device)) @@ -266,9 +263,9 @@ True .. parsed-literal:: - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/wavmark/__init__.py:16: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. 
Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature. + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/wavmark/__init__.py:16: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature. checkpoint = torch.load(resume_path, map_location=torch.device('cpu')) - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/notebooks/openvoice/OpenVoice/openvoice/api.py:36: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature. + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/notebooks/openvoice/OpenVoice/openvoice/api.py:36: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. 
Please open an issue on GitHub for any issues related to this experimental feature. checkpoint_dict = torch.load(ckpt_path, map_location=torch.device(self.device)) @@ -418,38 +415,40 @@ documentation 0 - No CUDA runtime is found, using CUDA_HOME='/usr/local/cuda' - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/notebooks/openvoice/OpenVoice/openvoice/attentions.py:283: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/notebooks/openvoice/OpenVoice/openvoice/attentions.py:283: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! assert ( - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/notebooks/openvoice/OpenVoice/openvoice/attentions.py:346: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/notebooks/openvoice/OpenVoice/openvoice/attentions.py:346: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! pad_length = max(length - (self.window_size + 1), 0) - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/notebooks/openvoice/OpenVoice/openvoice/attentions.py:347: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/notebooks/openvoice/OpenVoice/openvoice/attentions.py:347: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! slice_start_position = max((self.window_size + 1) - length, 0) - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/notebooks/openvoice/OpenVoice/openvoice/attentions.py:349: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! 
+ /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/notebooks/openvoice/OpenVoice/openvoice/attentions.py:349: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! if pad_length > 0: - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/notebooks/openvoice/OpenVoice/openvoice/transforms.py:114: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/notebooks/openvoice/OpenVoice/openvoice/transforms.py:114: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! if torch.min(inputs) < left or torch.max(inputs) > right: - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/notebooks/openvoice/OpenVoice/openvoice/transforms.py:119: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/notebooks/openvoice/OpenVoice/openvoice/transforms.py:119: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! if min_bin_width * num_bins > 1.0: - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/notebooks/openvoice/OpenVoice/openvoice/transforms.py:121: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/notebooks/openvoice/OpenVoice/openvoice/transforms.py:121: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! if min_bin_height * num_bins > 1.0: - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/notebooks/openvoice/OpenVoice/openvoice/transforms.py:171: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. 
We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/notebooks/openvoice/OpenVoice/openvoice/transforms.py:171: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! assert (discriminant >= 0).all() - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/torch/jit/_trace.py:1303: TracerWarning: Trace had nondeterministic nodes. Did you forget call .eval() on your model? Nodes: - %3293 : Float(1, 2, 43, strides=[86, 43, 1], requires_grad=0, device=cpu) = aten::randn(%3288, %3289, %3290, %3291, %3292) # /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/nncf/torch/dynamic_graph/wrappers.py:86:0 - %5559 : Float(1, 192, 153, strides=[29376, 1, 192], requires_grad=0, device=cpu) = aten::randn_like(%m_p, %5554, %5555, %5556, %5557, %5558) # /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/nncf/torch/dynamic_graph/wrappers.py:86:0 + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/torch/jit/_trace.py:1303: TracerWarning: Trace had nondeterministic nodes. Did you forget call .eval() on your model? Nodes: + %3293 : Float(1, 2, 43, strides=[86, 43, 1], requires_grad=0, device=cpu) = aten::randn(%3288, %3289, %3290, %3291, %3292) # /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/nncf/torch/dynamic_graph/wrappers.py:86:0 + %5559 : Float(1, 192, 150, strides=[28800, 1, 192], requires_grad=0, device=cpu) = aten::randn_like(%m_p, %5554, %5555, %5556, %5557, %5558) # /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/nncf/torch/dynamic_graph/wrappers.py:86:0 This may cause errors in trace checking. To disable trace checking, pass check_trace=False to torch.jit.trace() _check_trace( - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/torch/jit/_trace.py:1303: TracerWarning: Output nr 1. of the traced function does not match the corresponding output of the Python function. Detailed error: - The values for attribute 'shape' do not match: torch.Size([1, 1, 39680]) != torch.Size([1, 1, 38400]). - _check_trace( - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/torch/jit/_trace.py:1303: TracerWarning: Output nr 2. of the traced function does not match the corresponding output of the Python function. Detailed error: - The values for attribute 'shape' do not match: torch.Size([1, 1, 155, 43]) != torch.Size([1, 1, 150, 43]). 
+ /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/torch/jit/_trace.py:1303: TracerWarning: Output nr 1. of the traced function does not match the corresponding output of the Python function. Detailed error: + Tensor-likes are not close! + + Mismatched elements: 38094 / 39424 (96.6%) + Greatest absolute difference: 0.7026380896568298 at index (0, 0, 4174) (up to 1e-05 allowed) + Greatest relative difference: 43899.56701030928 at index (0, 0, 2529) (up to 1e-05 allowed) _check_trace( - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/torch/jit/_trace.py:1303: TracerWarning: Output nr 3. of the traced function does not match the corresponding output of the Python function. Detailed error: - The values for attribute 'shape' do not match: torch.Size([1, 1, 155]) != torch.Size([1, 1, 150]). + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/torch/jit/_trace.py:1303: TracerWarning: Output nr 2. of the traced function does not match the corresponding output of the Python function. Detailed error: + Tensor-likes are not close! + + Mismatched elements: 42 / 6622 (0.6%) + Greatest absolute difference: 1.0 at index (0, 0, 7, 1) (up to 1e-05 allowed) + Greatest relative difference: inf at index (0, 0, 7, 2) (up to 1e-05 allowed) _check_trace( @@ -483,16 +482,16 @@ documentation )`. (Triggered internally at ../aten/src/ATen/native/SpectralOps.cpp:836.) + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/torch/nn/modules/module.py:1562: UserWarning: A window was not provided. A rectangular window will be applied,which is known to cause spectral leakage. Other windows such as torch.hann_window or torch.hamming_window can are recommended to reduce spectral leakage.To suppress this warning and use a rectangular window, explicitly set `window=torch.ones(n_fft, device=)`. (Triggered internally at ../aten/src/ATen/native/SpectralOps.cpp:836.) return forward_call(\*args, \*\*kwargs) @@ -720,7 +719,7 @@ Load speaker embeddings .. parsed-literal:: - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/torch/functional.py:666: UserWarning: stft with return_complex=False is deprecated. In a future pytorch release, stft will return complex tensors for all inputs, and return_complex=False will raise an error. + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/torch/functional.py:666: UserWarning: stft with return_complex=False is deprecated. In a future pytorch release, stft will return complex tensors for all inputs, and return_complex=False will raise an error. Note: you can still call torch.view_as_real on the complex output to recover the old return format. (Triggered internally at ../aten/src/ATen/native/SpectralOps.cpp:873.) 
return _VF.stft(input, n_fft, hop_length, win_length, window, # type: ignore[attr-defined] @@ -875,7 +874,7 @@ And finally, run voice tone conversion with OpenVINO optimized model @@ -893,7 +892,7 @@ And finally, run voice tone conversion with OpenVINO optimized model @@ -1082,7 +1081,7 @@ voice tone conversion online. .. parsed-literal:: - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/gradio/components/dropdown.py:100: UserWarning: The `max_choices` parameter is ignored when `multiselect` is False. + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/gradio/components/dropdown.py:100: UserWarning: The `max_choices` parameter is ignored when `multiselect` is False. warnings.warn( diff --git a/docs/notebooks/optical-character-recognition-with-output.rst b/docs/notebooks/optical-character-recognition-with-output.rst index 7dae2290312e68..764bad414c61e9 100644 --- a/docs/notebooks/optical-character-recognition-with-output.rst +++ b/docs/notebooks/optical-character-recognition-with-output.rst @@ -131,13 +131,13 @@ again. .. parsed-literal:: - model/horizontal-text-detection-0001/FP16/horizontal-text-detection-0001.bin: 0%| | 0.00/3.70M [00:… + horizontal-text-detection-0001.bin: 0%| | 0.00/3.70M [00:00 + @@ -375,7 +375,7 @@ may be specified is input data .. parsed-literal:: - + @@ -413,7 +413,7 @@ then such conversion will be added explicitly. .. parsed-literal:: - + @@ -575,7 +575,7 @@ Compare results on one image .. parsed-literal:: - data/imagenet_2012.txt: 0%| | 0.00/30.9k [00:00= 3.10. Please make + sure that your environment fulfill to this requirement before running + it + +`OuteTTS-0.1-350M `__ is +a novel text-to-speech synthesis model that leverages pure language +modeling without external adapters or complex architectures, built upon +the LLaMa architecture. It demonstrates that high-quality speech +synthesis is achievable through a straightforward approach using crafted +prompts and audio tokens. + +More details about model can be found in `original +repo `__. + +In this tutorial we consider how to run OuteTTS pipeline using OpenVINO. + + +**Table of contents:** + + +- `Prerequisites <#prerequisites>`__ +- `Convert model <#convert-model>`__ +- `Run model inference <#run-model-inference>`__ + + - `Text-to-Speech generation <#text-to-speech-generation>`__ + - `Text-to-Speech generation with Voice + Cloning <#text-to-speech-generation-with-voice-cloning>`__ + +- `Interactive demo <#interactive-demo>`__ + +Installation Instructions +~~~~~~~~~~~~~~~~~~~~~~~~~ + +This is a self-contained example that relies solely on its own code. + +We recommend running the notebook in a virtual environment. You only +need a Jupyter server to start. For details, please refer to +`Installation +Guide `__. + +Prerequisites +------------- + + + +.. code:: ipython3 + + import platform + + %pip install -q "torch>=2.1" "torchaudio" "einops" "transformers>=4.46.1" "loguru" "inflect" "pesq" "torchcrepe" "natsort" "polars" uroman mecab-python3 unidic-lite --extra-index-url https://download.pytorch.org/whl/cpu + %pip install -q "gradio>=4.19" "openvino>=2024.4.0" "tqdm" "pyyaml" "librosa" "soundfile" + %pip install -q "git+https://github.com/huggingface/optimum-intel.git" --extra-index-url https://download.pytorch.org/whl/cpu + + if platform.system() == "Darwin": + %pip install -q "numpy<2.0.0" + +.. 
code:: ipython3 + + import requests + from pathlib import Path + + utility_files = ["cmd_helper.py", "notebook_utils.py"] + base_utility_url = "https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/" + + for utility_file in utility_files: + if not Path(utility_file).exists(): + r = requests.get(base_utility_url + utility_file) + with Path(utility_file).open("w") as f: + f.write(r.text) + + + helper_files = ["gradio_helper.py", "ov_outetts_helper.py"] + base_helper_url = "https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/notebooks/outetts-text-to-speech" + + for helper_file in helper_files: + if not Path(helper_file).exists(): + r = requests.get(base_helper_url + helper_file) + with Path(helper_file).open("w") as f: + f.write(r.text) + +.. code:: ipython3 + + from cmd_helper import clone_repo + + repo_path = clone_repo("https://github.com/edwko/OuteTTS.git") + + interface_path = repo_path / "outetts/version/v1/interface.py" + + updated_version = interface_path.exists() + + if not updated_version: + interface_pth = repo_path / "outetts/v0_1/interface.py" + orig_interface_path = interface_path.parent / "_orig_interface.py" + + if not updated_version and not orig_interface_path.exists(): + interface_path.rename(orig_interface_path) + # sounddevice requires to install manually additional libraries, as we do not plan to use it for audio playing + # move it closer to its usage for avoid errors + with orig_interface_path.open("r") as in_file: + content = in_file.read() + upd_content = content.replace("import sounddevice as sd", "") + upd_content = upd_content.replace("sd.play", "import sounddevice as sd\n sd.play") + with interface_path.open("w") as out_file: + out_file.write(upd_content) + + %pip install -q {repo_path} --extra-index-url https://download.pytorch.org/whl/cpu + +Convert model +------------- + + + +OpenVINO supports PyTorch models via conversion to OpenVINO Intermediate +Representation format. For convenience, we will use OpenVINO integration +with HuggingFace Optimum. `Optimum +Intel `__ is the +interface between the Transformers and Diffusers libraries and the +different tools and libraries provided by Intel to accelerate end-to-end +pipelines on Intel architectures. + +Among other use cases, Optimum Intel provides a simple interface to +optimize your Transformers and Diffusers models, convert them to the +OpenVINO Intermediate Representation (IR) format and run inference using +OpenVINO Runtime. ``optimum-cli`` provides command line interface for +model conversion and optimization. + +General command format: + +.. code:: bash + + optimum-cli export openvino --model --task + +where task is task to export the model for, if not specified, the task +will be auto-inferred based on the model. You can find a mapping between +tasks and model classes in Optimum TaskManager +`documentation `__. +Additionally, you can specify weights compression using +``--weight-format`` argument with one of following options: ``fp32``, +``fp16``, ``int8`` and ``int4``. Fro int8 and int4 +`nncf `__ will be used for +weight compression. More details about model export provided in `Optimum +Intel +documentation `__. + +As OuteTTS utilizes pure language modeling approach, model conversion +process remains the same like conversion LLaMa models family for text +generation purposes. + +.. 
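note:: + + As an illustration only: the cell below drives the export through the ``optimum_cli`` helper, which is equivalent to invoking the command line tool directly. The model id, task and output directory in this sketch mirror the values used in that cell; the commented ``--weight-format int8`` line shows the optional weight compression mentioned above and is not part of the notebook flow. + + .. code:: bash + + optimum-cli export openvino --model OuteAI/OuteTTS-0.1-350M --task text-generation-with-past OuteTTS-0.1-350M-ov + + # optional: compress weights to int8 during export + # optimum-cli export openvino --model OuteAI/OuteTTS-0.1-350M --task text-generation-with-past --weight-format int8 OuteTTS-0.1-350M-ov + +.. 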
code:: ipython3 + + from cmd_helper import optimum_cli + + model_id = "OuteAI/OuteTTS-0.1-350M" + model_dir = Path(model_id.split("/")[-1] + "-ov") + + if not model_dir.exists(): + optimum_cli(model_id, model_dir, additional_args={"task": "text-generation-with-past"}) + +Run model inference +------------------- + + + +OpenVINO integration with Optimum Intel provides ready-to-use API for +model inference that can be used for smooth integration with +transformers-based solutions. For loading model, we will use +``OVModelForCausalLM`` class that have compatible interface with +Transformers LLaMa implementation. For loading a model, +``from_pretrained`` method should be used. It accepts path to the model +directory or model_id from HuggingFace hub (if model is not converted to +OpenVINO format, conversion will be triggered automatically). +Additionally, we can provide an inference device, quantization config +(if model has not been quantized yet) and device-specific OpenVINO +Runtime configuration. More details about model inference with Optimum +Intel can be found in +`documentation `__. +We will use ``OVModelForCausalLM`` as replacement of original +``AutoModelForCausalLM`` in ``InterfaceHF``. + +.. code:: ipython3 + + from notebook_utils import device_widget + + device = device_widget(exclude=["NPU"]) + + device + + + + +.. parsed-literal:: + + Dropdown(description='Device:', index=1, options=('CPU', 'AUTO'), value='AUTO') + + + +.. code:: ipython3 + + from ov_outetts_helper import InterfaceOV, OVHFModel # noqa: F401 + + # Uncomment these lines to see pipeline details + # ??InterfaceOV + # ??OVHFModel + + +.. parsed-literal:: + + 2024-11-29 11:48:51.975233: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`. + 2024-11-29 11:48:51.989550: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered + WARNING: All log messages before absl::InitializeLog() is called are written to STDERR + E0000 00:00:1732866532.005718 2314480 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered + E0000 00:00:1732866532.010517 2314480 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered + 2024-11-29 11:48:52.027376: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations. + To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags. + + +.. code:: ipython3 + + interface = InterfaceOV(model_dir, device.value) + + +.. parsed-literal:: + + making attention of type 'vanilla' with 768 in_channels + + +Text-to-Speech generation +~~~~~~~~~~~~~~~~~~~~~~~~~ + + + +Now let’s see model in action. Providing input text to ``generate`` +method of interface, model returns tensor that represents output audio +with random speaker characteristics. + +.. code:: ipython3 + + output = interface.generate(text="Hello, I'm working!", temperature=0.1, repetition_penalty=1.1, max_length=4096) + + +.. 
parsed-literal:: + + The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results. + Setting `pad_token_id` to `eos_token_id`:None for open-end generation. + The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results. + + +.. code:: ipython3 + + import IPython.display as ipd + + ipd.Audio(output.audio[0].numpy(), rate=output.sr) + + + + +.. raw:: html + + + + + + + +Text-to-Speech generation with Voice Cloning +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + + +Additionally, we can specify reference voice for generation by providing +reference audio and transcript for it. ``interface.create_speaker`` +processes reference audio and text to set of features used for audio +description. + +.. code:: ipython3 + + from notebook_utils import download_file + + ref_audio_url = "https://huggingface.co/OuteAI/OuteTTS-0.1-350M/resolve/main/samples/2.wav" + + file_path = download_file(ref_audio_url) + + +.. parsed-literal:: + + '2.wav' already exists. + + +.. code:: ipython3 + + ipd.Audio(file_path) + + + + +.. raw:: html + + + + + + + +.. code:: ipython3 + + speaker = interface.create_speaker(file_path, "Hello, I can speak pretty well, but sometimes I make some mistakes.") + + # Save the speaker to a file + interface.save_speaker(speaker, "speaker.pkl") + + # Load the speaker from a file + speaker = interface.load_speaker("speaker.pkl") + + # Generate TTS with the custom voice + output = interface.generate(text="This is a cloned voice speaking", speaker=speaker, temperature=0.1, repetition_penalty=1.1, max_length=4096) + + +.. parsed-literal:: + + The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results. + Setting `pad_token_id` to `eos_token_id`:None for open-end generation. + + +.. code:: ipython3 + + ipd.Audio(output.audio[0].numpy(), rate=output.sr) + + + + +.. raw:: html + + + + + + + +Interactive demo +---------------- + + + +.. code:: ipython3 + + from gradio_helper import make_demo + + demo = make_demo(interface) + + try: + demo.launch(debug=True) + except Exception: + demo.launch(share=True, debug=True) diff --git a/docs/notebooks/paddle-ocr-webcam-with-output.rst b/docs/notebooks/paddle-ocr-webcam-with-output.rst index 3fae2e47d99b24..aa054a40e73a07 100644 --- a/docs/notebooks/paddle-ocr-webcam-with-output.rst +++ b/docs/notebooks/paddle-ocr-webcam-with-output.rst @@ -214,7 +214,7 @@ Download the Model for Text **Detection** .. parsed-literal:: - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-no… + ch_PP-OCRv3_det_infer.tar: 0%| | 0.00/3.65M [00:00 + @@ -439,7 +439,7 @@ Note that many optimizations are possible to improve the performance. .. parsed-literal:: - PaddlePaddle model on CPU: 0.0069 seconds per image, FPS: 144.32 + PaddlePaddle model on CPU: 0.0071 seconds per image, FPS: 141.67 PaddlePaddle result: Labrador retriever, 0.75138 @@ -500,7 +500,7 @@ select device from dropdown list for running inference using OpenVINO .. 
parsed-literal:: - OpenVINO IR model in OpenVINO Runtime (AUTO): 0.0026 seconds per image, FPS: 380.57 + OpenVINO IR model in OpenVINO Runtime (AUTO): 0.0027 seconds per image, FPS: 376.00 OpenVINO result: Labrador retriever, 0.74909 diff --git a/docs/notebooks/parler-tts-text-to-speech-with-output.rst b/docs/notebooks/parler-tts-text-to-speech-with-output.rst index 323959aa17e8ef..2be3c2a4a2c7ed 100644 --- a/docs/notebooks/parler-tts-text-to-speech-with-output.rst +++ b/docs/notebooks/parler-tts-text-to-speech-with-output.rst @@ -9,7 +9,7 @@ with synthetic annotations `__ by Dan Lyth and Simon King, from Stability AI and Edinburgh University respectively. -.. image:: https://images.squarespace-cdn.com/content/v1/657816dfbefe0533e8a69d9a/30c96e25-acc5-4019-acdd-648da6142c4c/architecture_v3.png?format=2500w +|image0| Text-to-speech models trained on large-scale datasets have demonstrated impressive in-context learning capabilities and naturalness. However, @@ -53,6 +53,8 @@ need a Jupyter server to start. For details, please refer to `Installation Guide `__. +.. |image0| image:: https://images.squarespace-cdn.com/content/v1/657816dfbefe0533e8a69d9a/30c96e25-acc5-4019-acdd-648da6142c4c/architecture_v3.png?format=2500w + Prerequisites ------------- @@ -64,8 +66,32 @@ Prerequisites os.environ["GIT_CLONE_PROTECTION_ACTIVE"] = "false" + %pip uninstall -q -y torch torchvision torchaudio %pip install -q "openvino>=2024.2.0" - %pip install -q git+https://github.com/huggingface/parler-tts.git "gradio>=4.19" transformers "torch>=2.2" --extra-index-url https://download.pytorch.org/whl/cpu + %pip install -q git+https://github.com/huggingface/parler-tts.git "gradio>=4.19" transformers "torch>=2.2" "torchaudio" --extra-index-url https://download.pytorch.org/whl/cpu + + +.. parsed-literal:: + + Note: you may need to restart the kernel to use updated packages. + Note: you may need to restart the kernel to use updated packages. + ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts. + easyocr 1.7.2 requires torchvision>=0.5, which is not installed. + mobileclip 0.1.0 requires clip-benchmark>=1.4.0, which is not installed. + mobileclip 0.1.0 requires torchvision==0.14.1, which is not installed. + open-clip-torch 2.22.0 requires torchvision, which is not installed. + timm 1.0.12 requires torchvision, which is not installed. + ultralytics 8.1.24 requires torchvision>=0.9.0, which is not installed. + open-clip-torch 2.22.0 requires protobuf<4, but you have protobuf 4.25.5 which is incompatible. + tensorflow 2.12.0 requires keras<2.13,>=2.12.0, but you have keras 2.13.1 which is incompatible. + tensorflow 2.12.0 requires numpy<1.24,>=1.22, but you have numpy 1.24.4 which is incompatible. + tensorflow 2.12.0 requires tensorboard<2.13,>=2.12, but you have tensorboard 2.13.0 which is incompatible. + tensorflow 2.12.0 requires tensorflow-estimator<2.13,>=2.12.0, but you have tensorflow-estimator 2.13.0 which is incompatible. + tensorflow-cpu 2.13.1 requires numpy<=1.24.3,>=1.22, but you have numpy 1.24.4 which is incompatible. + tensorflow-cpu 2.13.1 requires typing-extensions<4.6.0,>=3.6.6, but you have typing-extensions 4.12.2 which is incompatible. + tensorflow-metadata 1.14.0 requires protobuf<4.21,>=3.20.3, but you have protobuf 4.25.5 which is incompatible. + Note: you may need to restart the kernel to use updated packages. 
+ Load the original model and inference ------------------------------------- @@ -95,6 +121,135 @@ Load the original model and inference audio_arr = generation.cpu().numpy().squeeze() sf.write("parler_tts_out.wav", audio_arr, model.config.sampling_rate) + +.. parsed-literal:: + + 2024-12-10 02:43:30.030324: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`. + 2024-12-10 02:43:30.055592: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations. + To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags. + Flash attention 2 is not installed + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/torch/nn/utils/weight_norm.py:134: FutureWarning: `torch.nn.utils.weight_norm` is deprecated in favor of `torch.nn.utils.parametrizations.weight_norm`. + WeightNorm.apply(module, name, dim) + Config of the text_encoder: is overwritten by shared text_encoder config: T5Config { + "_name_or_path": "google/flan-t5-base", + "architectures": [ + "T5ForConditionalGeneration" + ], + "classifier_dropout": 0.0, + "d_ff": 2048, + "d_kv": 64, + "d_model": 768, + "decoder_start_token_id": 0, + "dense_act_fn": "gelu_new", + "dropout_rate": 0.1, + "eos_token_id": 1, + "feed_forward_proj": "gated-gelu", + "initializer_factor": 1.0, + "is_encoder_decoder": true, + "is_gated_act": true, + "layer_norm_epsilon": 1e-06, + "model_type": "t5", + "n_positions": 512, + "num_decoder_layers": 12, + "num_heads": 12, + "num_layers": 12, + "output_past": true, + "pad_token_id": 0, + "relative_attention_max_distance": 128, + "relative_attention_num_buckets": 32, + "task_specific_params": { + "summarization": { + "early_stopping": true, + "length_penalty": 2.0, + "max_length": 200, + "min_length": 30, + "no_repeat_ngram_size": 3, + "num_beams": 4, + "prefix": "summarize: " + }, + "translation_en_to_de": { + "early_stopping": true, + "max_length": 300, + "num_beams": 4, + "prefix": "translate English to German: " + }, + "translation_en_to_fr": { + "early_stopping": true, + "max_length": 300, + "num_beams": 4, + "prefix": "translate English to French: " + }, + "translation_en_to_ro": { + "early_stopping": true, + "max_length": 300, + "num_beams": 4, + "prefix": "translate English to Romanian: " + } + }, + "tie_word_embeddings": false, + "transformers_version": "4.46.1", + "use_cache": true, + "vocab_size": 32128 + } + + Config of the audio_encoder: is overwritten by shared audio_encoder config: DACConfig { + "_name_or_path": "ylacombe/dac_44khZ_8kbps", + "architectures": [ + "DACModel" + ], + "codebook_size": 1024, + "frame_rate": 86, + "latent_dim": 1024, + "model_bitrate": 8, + "model_type": "dac_on_the_hub", + "num_codebooks": 9, + "sampling_rate": 44100, + "torch_dtype": "float32", + "transformers_version": "4.46.1" + } + + Config of the decoder: is overwritten by shared decoder config: ParlerTTSDecoderConfig { + "_name_or_path": "/fsx/yoach/tmp/artefacts/decoder_400M/", + "activation_dropout": 0.0, + "activation_function": "gelu", + "add_cross_attention": true, + "architectures": [ + "ParlerTTSForCausalLM" + ], + "attention_dropout": 0.0, 
+ "bos_token_id": 1025, + "codebook_weights": null, + "cross_attention_implementation_strategy": null, + "dropout": 0.1, + "eos_token_id": 1024, + "ffn_dim": 4096, + "hidden_size": 1024, + "initializer_factor": 0.02, + "is_decoder": true, + "layerdrop": 0.0, + "max_position_embeddings": 4096, + "model_type": "parler_tts_decoder", + "num_attention_heads": 16, + "num_codebooks": 9, + "num_cross_attention_key_value_heads": 16, + "num_hidden_layers": 24, + "num_key_value_heads": 16, + "pad_token_id": 1024, + "rope_embeddings": false, + "rope_theta": 10000.0, + "scale_embedding": false, + "tie_word_embeddings": false, + "torch_dtype": "float32", + "transformers_version": "4.46.1", + "use_cache": true, + "use_fused_lm_heads": false, + "vocab_size": 1088 + } + + You set `add_prefix_space`. The tokenizer needs to be converted from the slow tokenizers + The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results. + + .. code:: ipython3 import IPython.display as ipd @@ -108,10 +263,10 @@ Load the original model and inference - + @@ -159,6 +314,20 @@ and Decoder (``ParlerTTSDecoder``). Lets convert them one by one. text_encoder_ov_model = convert(model.text_encoder, TEXT_ENCODER_OV_PATH, example_input) + +.. parsed-literal:: + + WARNING:tensorflow:Please fix your imports. Module tensorflow.python.training.tracking.base has been moved to tensorflow.python.trackable.base. The old module will be deleted in version 2.11. + + +.. parsed-literal:: + + [ WARNING ] Please fix your imports. Module %s has been moved to %s. The old module will be deleted in version %s. + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/transformers/modeling_utils.py:5006: FutureWarning: `_is_quantized_training_enabled` is going to be deprecated in transformers 4.39.0. Please use `model.hf_quantizer.is_trainable` instead + warnings.warn( + `loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`. + + The Decoder Model performs in generation pipeline and we can separate it into two stage. In the first stage the model generates ``past_key_values`` into output for the second stage. In the second @@ -193,6 +362,17 @@ stage the model produces tokens during several runs. decoder_1_ov_model = convert(DecoderStage1Wrapper(model.decoder.model.decoder), DECODER_STAGE_1_OV_PATH, example_input) + +.. parsed-literal:: + + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/parler_tts/modeling_parler_tts.py:367: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! + if seq_len > self.weights.size(0): + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/parler_tts/modeling_parler_tts.py:1713: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. 
This means that the trace might not generalize to other inputs! + if sequence_length != 1: + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/parler_tts/modeling_parler_tts.py:916: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! + if attn_output.size() != (bsz, self.num_heads, tgt_len, self.head_dim): + + .. code:: ipython3 DECODER_STAGE_2_OV_PATH = Path("models/decoder_stage_2_ir.xml") @@ -231,6 +411,15 @@ stage the model produces tokens during several runs. decoder_2_ov_model = convert(DecoderStage2Wrapper(model.decoder.model.decoder), DECODER_STAGE_2_OV_PATH, example_input) + +.. parsed-literal:: + + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/transformers/cache_utils.py:458: TracerWarning: Using len to get tensor shape might cause the trace to be incorrect. Recommended usage would be tensor.shape[0]. Passing a tensor of different shape might lead to errors or silently give incorrect results. + or len(self.key_cache[layer_idx]) == 0 # the layer has no cache + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/transformers/cache_utils.py:443: TracerWarning: Using len to get tensor shape might cause the trace to be incorrect. Recommended usage would be tensor.shape[0]. Passing a tensor of different shape might lead to errors or silently give incorrect results. + elif len(self.key_cache[layer_idx]) == 0: # fills previously skipped layers; checking for tensor causes errors + + Compiling models and inference ------------------------------ @@ -258,7 +447,7 @@ Select device from dropdown list for running inference using OpenVINO. .. parsed-literal:: - Dropdown(description='Device:', index=4, options=('CPU', 'GPU.0', 'GPU.1', 'GPU.2', 'AUTO'), value='AUTO') + Dropdown(description='Device:', index=1, options=('CPU', 'AUTO'), value='AUTO') @@ -360,10 +549,10 @@ and run inference. - + @@ -406,13 +595,27 @@ Interactive inference demo = make_demo(fn=infer) try: - demo.queue().launch(debug=True) + demo.queue().launch(debug=False) except Exception: - demo.queue().launch(share=True, debug=True) + demo.queue().launch(share=True, debug=False) # if you are launching remotely, specify server_name and server_port # demo.launch(server_name='your server name', server_port='server port in int') # Read more in the docs: https://gradio.app/docs/ + +.. parsed-literal:: + + Running on local URL: http://127.0.0.1:7860 + + To create a public link, set `share=True` in `launch()`. + + + + + + + + .. code:: ipython3 # please uncomment and run this cell for stopping gradio interface diff --git a/docs/notebooks/person-tracking-with-output.rst b/docs/notebooks/person-tracking-with-output.rst index 653a9b376edf7e..6ac8ff43e05ab2 100644 --- a/docs/notebooks/person-tracking-with-output.rst +++ b/docs/notebooks/person-tracking-with-output.rst @@ -148,7 +148,7 @@ Imports import collections from pathlib import Path import time - + import numpy as np import cv2 from IPython import display @@ -158,17 +158,17 @@ Imports .. 
code:: ipython3 # Import local modules - + if not Path("./notebook_utils.py").exists(): # Fetch `notebook_utils` module import requests - + r = requests.get( url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/notebook_utils.py", ) - + open("notebook_utils.py", "w").write(r.text) - + import notebook_utils as utils from deepsort_utils.tracker import Tracker from deepsort_utils.nn_matching import NearestNeighborDistanceMetric @@ -200,36 +200,36 @@ by the cosine distance. .. code:: ipython3 from notebook_utils import download_ir_model - + # A directory where the model will be downloaded. base_model_dir = "model" precision = "FP16" # The name of the model from Open Model Zoo detection_model_name = "person-detection-0202" - - + + download_det_model_url = ( f"https://storage.openvinotoolkit.org/repositories/open_model_zoo/2023.0/models_bin/1/{detection_model_name}/{precision}/{detection_model_name}.xml" ) - + detection_model_path = download_ir_model(download_det_model_url, Path(base_model_dir) / detection_model_name / precision) - + reidentification_model_name = "person-reidentification-retail-0287" download_reid_model_url = f"https://storage.openvinotoolkit.org/repositories/open_model_zoo/2023.0/models_bin/1/{reidentification_model_name}/{precision}/{reidentification_model_name}.xml" - + reidentification_model_path = download_ir_model(download_reid_model_url, Path(base_model_dir) / reidentification_model_name / precision) .. parsed-literal:: - model/person-detection-0202/FP16/person-detection-0202.bin: 0%| | 0.00/3.47M [00:00 200: processing_times.popleft() - + _, f_width = frame.shape[:2] # Mean processing time [ms]. processing_time = np.mean(processing_times) * 1100 fps = 1000 / processing_time - + # Get poses from detection results. bbox_xywh, score, label = process_results(h, w, results=output) - + img_crops = [] for box in bbox_xywh: x1, y1, x2, y2 = xywh_to_xyxy(box, h, w) img = frame[y1:y2, x1:x2] img_crops.append(img) - + # Get reidentification feature of each person. if img_crops: # preprocess @@ -615,17 +614,17 @@ video file. features = extractor.predict(img_batch) else: features = np.array([]) - + # Wrap the detection and reidentification results together bbox_tlwh = xywh_to_tlwh(bbox_xywh) detections = [Detection(bbox_tlwh[i], features[i]) for i in range(features.shape[0])] - + # predict the position of tracking target tracker.predict() - + # update tracker tracker.update(detections) - + # update bbox identities outputs = [] for track in tracker.tracks: @@ -637,14 +636,14 @@ video file. outputs.append(np.array([x1, y1, x2, y2, track_id], dtype=np.int32)) if len(outputs) > 0: outputs = np.stack(outputs, axis=0) - + # draw box for visualization if len(outputs) > 0: bbox_tlwh = [] bbox_xyxy = outputs[:, :4] identities = outputs[:, -1] frame = draw_boxes(frame, bbox_xyxy, identities) - + cv2.putText( img=frame, text=f"Inference time: {processing_time:.1f}ms ({fps:.1f} FPS)", @@ -655,7 +654,7 @@ video file. thickness=1, lineType=cv2.LINE_AA, ) - + if use_popup: cv2.imshow(winname=title, mat=frame) key = cv2.waitKey(1) @@ -670,7 +669,7 @@ video file. # Display the image in this notebook. display.clear_output(wait=True) display.display(i) - + # ctrl-c except KeyboardInterrupt: print("Interrupted") @@ -724,11 +723,11 @@ will work. .. 
code:: ipython3 USE_WEBCAM = False - + cam_id = 0 video_file = "https://storage.openvinotoolkit.org/repositories/openvino_notebooks/data/data/video/people.mp4" source = cam_id if USE_WEBCAM else video_file - + run_person_tracking(source=source, flip=USE_WEBCAM, use_popup=False) diff --git a/docs/notebooks/person-tracking-with-output_files/person-tracking-with-output_25_0.png b/docs/notebooks/person-tracking-with-output_files/person-tracking-with-output_25_0.png index f827c9c1094e46..972cc9e5977684 100644 --- a/docs/notebooks/person-tracking-with-output_files/person-tracking-with-output_25_0.png +++ b/docs/notebooks/person-tracking-with-output_files/person-tracking-with-output_25_0.png @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5dffde5665ae619cc99fddef72befb32d1002becce56dfccf50e7577f1fab020 -size 218904 +oid sha256:1c04ed0e53cb210bd7853d3daa7f77a0a087b8e08099b837d3237b025c223b5d +size 218593 diff --git a/docs/notebooks/phi-3-vision-with-output.rst b/docs/notebooks/phi-3-vision-with-output.rst index 71981daac13be4..dc588206768c93 100644 --- a/docs/notebooks/phi-3-vision-with-output.rst +++ b/docs/notebooks/phi-3-vision-with-output.rst @@ -260,8 +260,8 @@ documentation 1 or self.sliding_window is not None) and self.is_causal: - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/transformers/modeling_attn_mask_utils.py:164: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/transformers/modeling_attn_mask_utils.py:164: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! if past_key_values_length > 0: /opt/home/k8sworker/.cache/huggingface/modules/transformers_modules/microsoft/Phi-3.5-vision-instruct/4a0d683eba9f1d0cbfb6151705d1ee73c25a80ca/modeling_phi3_v.py:444: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! seq_len = seq_len or torch.max(position_ids) + 1 /opt/home/k8sworker/.cache/huggingface/modules/transformers_modules/microsoft/Phi-3.5-vision-instruct/4a0d683eba9f1d0cbfb6151705d1ee73c25a80ca/modeling_phi3_v.py:445: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! if seq_len > self.original_max_position_embeddings: - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/nncf/torch/dynamic_graph/wrappers.py:86: TracerWarning: torch.tensor results are registered as constants in the trace. 
You can safely ignore this warning if you use this function to create tensors out of constant variables that would be the same every time you call this function. In any other case, this might cause the trace to be incorrect. + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/nncf/torch/dynamic_graph/wrappers.py:86: TracerWarning: torch.tensor results are registered as constants in the trace. You can safely ignore this warning if you use this function to create tensors out of constant variables that would be the same every time you call this function. In any other case, this might cause the trace to be incorrect. op1 = operator(\*args, \*\*kwargs) - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/transformers/cache_utils.py:443: TracerWarning: Using len to get tensor shape might cause the trace to be incorrect. Recommended usage would be tensor.shape[0]. Passing a tensor of different shape might lead to errors or silently give incorrect results. + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/transformers/cache_utils.py:443: TracerWarning: Using len to get tensor shape might cause the trace to be incorrect. Recommended usage would be tensor.shape[0]. Passing a tensor of different shape might lead to errors or silently give incorrect results. elif len(self.key_cache[layer_idx]) == 0: # fills previously skipped layers; checking for tensor causes errors /opt/home/k8sworker/.cache/huggingface/modules/transformers_modules/microsoft/Phi-3.5-vision-instruct/4a0d683eba9f1d0cbfb6151705d1ee73c25a80ca/modeling_phi3_v.py:683: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len): @@ -374,7 +365,7 @@ documentation =4.0.0, but you have protobuf 3.20.3 which is incompatible. + parler-tts 0.2.2 requires protobuf>=4.0.0, but you have protobuf 3.20.3 which is incompatible. tensorflow 2.12.0 requires keras<2.13,>=2.12.0, but you have keras 2.13.1 which is incompatible. tensorflow 2.12.0 requires numpy<1.24,>=1.22, but you have numpy 1.24.4 which is incompatible. tensorflow 2.12.0 requires tensorboard<2.13,>=2.12, but you have tensorboard 2.13.0 which is incompatible. @@ -210,8 +210,8 @@ PhotoMaker to generate the original PhotoMaker pipeline. .. parsed-literal:: - 2024-11-22 02:03:50.933677: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`. - 2024-11-22 02:03:50.958255: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations. + 2024-12-10 02:49:18.726948: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`. 
+ 2024-12-10 02:49:18.751780: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations. To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags. @@ -230,6 +230,12 @@ PhotoMaker to generate the original PhotoMaker pipeline. Loading pipeline components...: 0%| | 0/7 [00:00 1 or self.sliding_window is not None: - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/transformers/modeling_attn_mask_utils.py:164: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/transformers/modeling_attn_mask_utils.py:164: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! if past_key_values_length > 0: @@ -587,15 +584,15 @@ original Stable Diffusion XL model. .. parsed-literal:: - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/diffusers/models/unets/unet_2d_condition.py:1103: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/diffusers/models/unets/unet_2d_condition.py:1103: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! if dim % default_overall_up_factor != 0: - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/diffusers/models/downsampling.py:136: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/diffusers/models/downsampling.py:136: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! 
assert hidden_states.shape[1] == self.channels - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/diffusers/models/downsampling.py:145: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/diffusers/models/downsampling.py:145: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! assert hidden_states.shape[1] == self.channels - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/diffusers/models/upsampling.py:146: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/diffusers/models/upsampling.py:146: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! assert hidden_states.shape[1] == self.channels - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/diffusers/models/upsampling.py:162: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/diffusers/models/upsampling.py:162: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! if hidden_states.shape[0] >= 64: diff --git a/docs/notebooks/pixart-with-output.rst b/docs/notebooks/pixart-with-output.rst index 517191e17501ef..fed1f6b3dada41 100644 --- a/docs/notebooks/pixart-with-output.rst +++ b/docs/notebooks/pixart-with-output.rst @@ -118,8 +118,8 @@ directly in latent space, achieving super fast inference with few steps. .. parsed-literal:: - 2024-11-22 02:11:50.540718: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`. 
- 2024-11-22 02:11:50.565755: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations. + 2024-12-10 02:57:23.724286: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`. + 2024-12-10 02:57:23.749610: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations. To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags. @@ -132,6 +132,8 @@ directly in latent space, achieving super fast inference with few steps. .. parsed-literal:: You are using the default legacy behaviour of the . This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565 + Some weights of the model checkpoint were not used when initializing PixArtTransformer2DModel: + ['caption_projection.y_embedding'] @@ -140,12 +142,6 @@ directly in latent space, achieving super fast inference with few steps. Loading checkpoint shards: 0%| | 0/4 [00:00= 64: @@ -452,7 +448,7 @@ And insert wrappers instances in the pipeline: .. parsed-literal:: - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/diffusers/configuration_utils.py:140: FutureWarning: Accessing config attribute `_execution_device` directly via 'PixArtAlphaPipeline' object attribute is deprecated. Please access '_execution_device' over 'PixArtAlphaPipeline's config object instead, e.g. 'scheduler.config._execution_device'. + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/diffusers/configuration_utils.py:140: FutureWarning: Accessing config attribute `_execution_device` directly via 'PixArtAlphaPipeline' object attribute is deprecated. Please access '_execution_device' over 'PixArtAlphaPipeline's config object instead, e.g. 'scheduler.config._execution_device'. deprecate("direct config name access", "1.0.0", deprecation_message, standard_warn=False) @@ -567,7 +563,7 @@ To collect intermediate model inputs for calibration we should customize .. parsed-literal:: - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/diffusers/configuration_utils.py:140: FutureWarning: Accessing config attribute `_execution_device` directly via 'PixArtAlphaPipeline' object attribute is deprecated. Please access '_execution_device' over 'PixArtAlphaPipeline's config object instead, e.g. 'scheduler.config._execution_device'. 
+ /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/diffusers/configuration_utils.py:140: FutureWarning: Accessing config attribute `_execution_device` directly via 'PixArtAlphaPipeline' object attribute is deprecated. Please access '_execution_device' over 'PixArtAlphaPipeline's config object instead, e.g. 'scheduler.config._execution_device'. deprecate("direct config name access", "1.0.0", deprecation_message, standard_warn=False) @@ -1625,16 +1621,16 @@ pipelines. Loading pipeline components...: 0%| | 0/5 [00:00 0.19 and openvino version <= 2024.4. Please downgrade to tokenizers version <= 0.19 to export tokenizers to OpenVINO. - INFO:nncf:Statistics of the bitwidth distribution: - ┍━━━━━━━━━━━━━━━━┯━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┯━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┑ - │ Num bits (N) │ % all parameters (layers) │ % ratio-defining parameters (layers) │ - ┝━━━━━━━━━━━━━━━━┿━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┿━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┥ - │ 8 │ 6% (1 / 281) │ 0% (0 / 280) │ - ├────────────────┼─────────────────────────────┼────────────────────────────────────────┤ - │ 4 │ 94% (280 / 281) │ 100% (280 / 280) │ - ┕━━━━━━━━━━━━━━━━┷━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┷━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┙ - Applying Weight Compression ━━━━━━━━━━━━━━━━━━━━━━━━━━━ 100% • 0:05:31 • 0:00:00 - INFO:nncf:Statistics of the bitwidth distribution: - ┍━━━━━━━━━━━━━━━━┯━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┯━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┑ - │ Num bits (N) │ % all parameters (layers) │ % ratio-defining parameters (layers) │ - ┝━━━━━━━━━━━━━━━━┿━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┿━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┥ - │ 8 │ 6% (3 / 172) │ 0% (0 / 169) │ - ├────────────────┼─────────────────────────────┼────────────────────────────────────────┤ - │ 4 │ 94% (169 / 172) │ 100% (169 / 169) │ - ┕━━━━━━━━━━━━━━━━┷━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┷━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┙ - Applying Weight Compression ━━━━━━━━━━━━━━━━━━━━━━━━━━━ 100% • 0:00:12 • 0:00:00 - INFO:nncf:Statistics of the bitwidth distribution: - ┍━━━━━━━━━━━━━━━━┯━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┯━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┑ - │ Num bits (N) │ % all parameters (layers) │ % ratio-defining parameters (layers) │ - ┝━━━━━━━━━━━━━━━━┿━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┿━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┥ - │ 8 │ 100% (1 / 1) │ 0% (0 / 0) │ - ┕━━━━━━━━━━━━━━━━┷━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┷━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┙ - Applying Weight Compression ━━━━━━━━━━━━━━━━━━━━━━━━━━━ 100% • 0:00:05 • 0:00:00 - + Traceback (most recent call last): + File "/opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/bin/optimum-cli", line 10, in + sys.exit(main()) + File "/opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/optimum/commands/optimum_cli.py", line 208, in main + service.run() + File "/opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/optimum/commands/export/openvino.py", line 390, in run + main_export( + File "/opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/optimum/exporters/openvino/__main__.py", line 476, in main_export + 
_weight_only_quantization(submodel, quantization_config) + File "/opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/optimum/intel/openvino/quantization.py", line 938, in _weight_only_quantization + return nncf.compress_weights( + TypeError: compress_weights() got an unexpected keyword argument 'backup_mode' + Run model inference ------------------- @@ -541,8 +528,8 @@ Intel can be found in .. parsed-literal:: - 2024-11-22 03:06:17.214277: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`. - 2024-11-22 03:06:17.240005: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations. + 2024-12-10 03:48:41.700649: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`. + 2024-12-10 03:48:41.726260: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations. To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags. @@ -587,7 +574,7 @@ Intel can be found in .. parsed-literal:: - The unusual aspect of this image is that the cat is lying inside a cardboard box, which is not a typical setting for a cat. Cats are often known for their affinity for boxes, but it is still considered unusual to see a cat comfortably resting inside a box in a living room setting. The cat appears relaxed and content, which adds to the charm of the scene. The presence of a sofa in the background further emphasizes the domestic and cozy atmosphere of the image. + The unusual aspect of this image is that the cat is lying on its back inside a cardboard box. This is not a typical position for a cat, as they usually prefer to curl up or lie on their sides when resting. Additionally, cats are known for their love of small, enclosed spaces, but it is less common to see a cat lying on its back in such a setting. The image captures a playful and relaxed moment, highlighting the cat's comfort and curiosity. Interactive demo diff --git a/docs/notebooks/pose-estimation-with-output.rst b/docs/notebooks/pose-estimation-with-output.rst index e827bd19acfd34..112b6037d4907f 100644 --- a/docs/notebooks/pose-estimation-with-output.rst +++ b/docs/notebooks/pose-estimation-with-output.rst @@ -126,13 +126,13 @@ precision in the code below. .. parsed-literal:: - model/intel/human-pose-estimation-0001/FP16-INT8/human-pose-estimation-0001.xml: 0%| | 0.00/474k [0… + human-pose-estimation-0001.xml: 0%| | 0.00/474k [00:00 target_length: - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/transformers/cache_utils.py:443: TracerWarning: Using len to get tensor shape might cause the trace to be incorrect. Recommended usage would be tensor.shape[0]. Passing a tensor of different shape might lead to errors or silently give incorrect results. 
+ /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/transformers/cache_utils.py:443: TracerWarning: Using len to get tensor shape might cause the trace to be incorrect. Recommended usage would be tensor.shape[0]. Passing a tensor of different shape might lead to errors or silently give incorrect results. elif len(self.key_cache[layer_idx]) == 0: # fills previously skipped layers; checking for tensor causes errors diff --git a/docs/notebooks/qwen2-vl-with-output.rst b/docs/notebooks/qwen2-vl-with-output.rst index d9c51a151e5926..ea0541fe1610a7 100644 --- a/docs/notebooks/qwen2-vl-with-output.rst +++ b/docs/notebooks/qwen2-vl-with-output.rst @@ -55,10 +55,8 @@ In this tutorial we consider how to convert and optimize Qwen2VL model for creating multimodal chatbot. Additionally, we demonstrate how to apply stateful transformation on LLM part and model optimization techniques like weights compression using -`NNCF `__ - - -**Table of contents:** +`NNCF `__ #### Table of +contents: - `Prerequisites <#prerequisites>`__ - `Select model <#select-model>`__ @@ -106,11 +104,11 @@ Prerequisites from pathlib import Path import requests - + if not Path("ov_qwen2_vl.py").exists(): r = requests.get(url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/notebooks/qwen2-vl/ov_qwen2_vl.py") open("ov_qwen2_vl.py", "w").write(r.text) - + if not Path("notebook_utils.py").exists(): r = requests.get(url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/notebook_utils.py") open("notebook_utils.py", "w").write(r.text) @@ -128,9 +126,9 @@ using widget bellow: .. code:: ipython3 from ov_qwen2_vl import model_selector - + model_id = model_selector() - + model_id @@ -141,8 +139,8 @@ using widget bellow: .. parsed-literal:: - 2024-11-22 04:16:41.832996: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`. - 2024-11-22 04:16:41.858520: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations. + 2024-12-10 05:00:06.245590: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`. + 2024-12-10 05:00:06.272261: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations. To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags. @@ -287,20 +285,20 @@ documentation target_length: - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/transformers/cache_utils.py:443: TracerWarning: Using len to get tensor shape might cause the trace to be incorrect. Recommended usage would be tensor.shape[0]. Passing a tensor of different shape might lead to errors or silently give incorrect results. 
+ /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/transformers/cache_utils.py:443: TracerWarning: Using len to get tensor shape might cause the trace to be incorrect. Recommended usage would be tensor.shape[0]. Passing a tensor of different shape might lead to errors or silently give incorrect results. elif len(self.key_cache[layer_idx]) == 0: # fills previously skipped layers; checking for tensor causes errors @@ -442,7 +431,7 @@ Intel `__ .. code:: ipython3 from ov_qwen2_vl import OVQwen2VLModel - + # Uncomment below lines to see the model inference class code # OVQwen2VLModel?? @@ -454,9 +443,9 @@ Select inference device .. code:: ipython3 from notebook_utils import device_widget - + device = device_widget(default="AUTO", exclude=["NPU"]) - + device @@ -483,25 +472,25 @@ Run model inference from transformers import AutoProcessor, AutoTokenizer from qwen_vl_utils import process_vision_info from transformers import TextStreamer - - + + min_pixels = 256 * 28 * 28 max_pixels = 1280 * 28 * 28 processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels) - + if processor.chat_template is None: tok = AutoTokenizer.from_pretrained(model_dir) processor.chat_template = tok.chat_template - + example_image_url = "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg" example_image_path = Path("demo.jpeg") - + if not example_image_path.exists(): Image.open(requests.get(example_image_url, stream=True).raw).save(example_image_path) - + image = Image.open(example_image_path) question = "Describe this image." - + messages = [ { "role": "user", @@ -514,7 +503,7 @@ Run model inference ], } ] - + # Preparation for inference text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True) image_inputs, video_inputs = process_vision_info(messages) @@ -525,12 +514,12 @@ Run model inference padding=True, return_tensors="pt", ) - + display(image) print("Question:") print(question) print("Answer:") - + generated_ids = model.generate(**inputs, max_new_tokens=100, streamer=TextStreamer(processor.tokenizer, skip_prompt=True, skip_special_tokens=True)) @@ -573,10 +562,10 @@ click ``Submit`` to start communication. .. code:: ipython3 from gradio_helper import make_demo - - + + demo = make_demo(model, processor) - + try: demo.launch(debug=False) except Exception: @@ -589,9 +578,9 @@ click ``Submit`` to start communication. .. parsed-literal:: Running on local URL: http://127.0.0.1:7860 - + Thanks for being a Gradio user! If you have questions or feedback, please join our Discord server and chat with us: https://discord.gg/feTf9x3ZSB - + To create a public link, set `share=True` in `launch()`. diff --git a/docs/notebooks/rmbg-background-removal-with-output.rst b/docs/notebooks/rmbg-background-removal-with-output.rst index c2e7286cc35cb4..0961afb2bf1ef5 100644 --- a/docs/notebooks/rmbg-background-removal-with-output.rst +++ b/docs/notebooks/rmbg-background-removal-with-output.rst @@ -112,8 +112,8 @@ it may take some time. .. parsed-literal:: - 2024-11-22 04:19:11.305790: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`. 
- 2024-11-22 04:19:11.330949: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations. + 2024-12-10 05:02:42.657474: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`. + 2024-12-10 05:02:42.682685: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations. To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags. @@ -240,7 +240,7 @@ function or directly loading on device using ``core.complie_model``. .. parsed-literal:: [ WARNING ] Please fix your imports. Module %s has been moved to %s. The old module will be deleted in version %s. - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/transformers/modeling_utils.py:5006: FutureWarning: `_is_quantized_training_enabled` is going to be deprecated in transformers 4.39.0. Please use `model.hf_quantizer.is_trainable` instead + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/transformers/modeling_utils.py:5006: FutureWarning: `_is_quantized_training_enabled` is going to be deprecated in transformers 4.39.0. Please use `model.hf_quantizer.is_trainable` instead warnings.warn( `loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`. diff --git a/docs/notebooks/segment-anything-2-image-with-output.rst b/docs/notebooks/segment-anything-2-image-with-output.rst index 1e938df4a9763a..d9b24bf720325b 100644 --- a/docs/notebooks/segment-anything-2-image-with-output.rst +++ b/docs/notebooks/segment-anything-2-image-with-output.rst @@ -120,24 +120,20 @@ Prerequisites .. parsed-literal:: - ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts. - mobileclip 0.1.0 requires clip-benchmark>=1.4.0, which is not installed. - mobileclip 0.1.0 requires torchvision==0.14.1, but you have torchvision 0.19.1+cpu which is incompatible. - parler-tts 0.2.1 requires protobuf>=4.0.0, but you have protobuf 3.20.3 which is incompatible. Note: you may need to restart the kernel to use updated packages. 
Collecting iopath>=0.1.10 Using cached iopath-0.1.10-py3-none-any.whl - Requirement already satisfied: pillow>=9.4.0 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (10.4.0) - Requirement already satisfied: hydra-core>=1.3.2 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (1.3.2) - Requirement already satisfied: tqdm in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from iopath>=0.1.10) (4.67.0) - Requirement already satisfied: typing-extensions in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from iopath>=0.1.10) (4.12.2) - Requirement already satisfied: portalocker in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from iopath>=0.1.10) (3.0.0) - Requirement already satisfied: omegaconf<2.4,>=2.2 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from hydra-core>=1.3.2) (2.3.0) - Requirement already satisfied: antlr4-python3-runtime==4.9.* in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from hydra-core>=1.3.2) (4.9.3) - Requirement already satisfied: packaging in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from hydra-core>=1.3.2) (24.2) - Requirement already satisfied: importlib-resources in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from hydra-core>=1.3.2) (6.4.5) - Requirement already satisfied: PyYAML>=5.1.0 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from omegaconf<2.4,>=2.2->hydra-core>=1.3.2) (6.0.2) - Requirement already satisfied: zipp>=3.1.0 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from importlib-resources->hydra-core>=1.3.2) (3.20.2) + Requirement already satisfied: pillow>=9.4.0 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (10.4.0) + Requirement already satisfied: hydra-core>=1.3.2 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (1.3.2) + Requirement already satisfied: tqdm in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from iopath>=0.1.10) (4.67.1) + Requirement already satisfied: typing-extensions in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from iopath>=0.1.10) (4.12.2) + Requirement already 
satisfied: portalocker in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from iopath>=0.1.10) (3.0.0) + Requirement already satisfied: omegaconf<2.4,>=2.2 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from hydra-core>=1.3.2) (2.3.0) + Requirement already satisfied: antlr4-python3-runtime==4.9.* in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from hydra-core>=1.3.2) (4.9.3) + Requirement already satisfied: packaging in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from hydra-core>=1.3.2) (24.2) + Requirement already satisfied: importlib-resources in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from hydra-core>=1.3.2) (6.4.5) + Requirement already satisfied: PyYAML>=5.1.0 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from omegaconf<2.4,>=2.2->hydra-core>=1.3.2) (6.0.2) + Requirement already satisfied: zipp>=3.1.0 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from importlib-resources->hydra-core>=1.3.2) (3.20.2) Installing collected packages: iopath Attempting uninstall: iopath Found existing installation: iopath 0.1.9 @@ -190,10 +186,10 @@ Clone and install segment-anything-2 .. parsed-literal:: env: SAM2_BUILD_CUDA=0 - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/notebooks/sam2-image-segmentation/sam2 + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/notebooks/sam2-image-segmentation/sam2 ERROR: Package 'sam-2' requires a different Python: 3.8.10 not in '>=3.10.0' Note: you may need to restart the kernel to use updated packages. - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/notebooks/sam2-image-segmentation + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/notebooks/sam2-image-segmentation .. code:: ipython3 @@ -397,7 +393,8 @@ Mask prediction will be includes two models: * **Prompt Encoder** - Encoder for segmentation condition. As a condition can be used points, boxes or segmentation mask. -* **Mask Decoder** - The mask decoder efficiently maps the image embedding, prompt embeddings, and an output +* **Mask Decoder** - The mask decoder + efficiently maps the image embedding, prompt embeddings, and an output token to a mask. Combined prompt encoder and mask decoder model has following list of @@ -488,12 +485,6 @@ Example Image image = cv2.imread("truck.jpg") image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) - -.. parsed-literal:: - - 'truck.jpg' already exists. - - .. 
code:: ipython3 plt.figure(figsize=(10, 10)) diff --git a/docs/notebooks/segment-anything-2-image-with-output_files/segment-anything-2-image-with-output_92_0.png b/docs/notebooks/segment-anything-2-image-with-output_files/segment-anything-2-image-with-output_92_0.png new file mode 100644 index 00000000000000..343e5ecc49fc50 --- /dev/null +++ b/docs/notebooks/segment-anything-2-image-with-output_files/segment-anything-2-image-with-output_92_0.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:520c7390de98864c4ae6b24b940230e83f2b5fc0b1723d58ed9941cc2d9bc70f +size 469439 diff --git a/docs/notebooks/segment-anything-2-video-with-output.rst b/docs/notebooks/segment-anything-2-video-with-output.rst index 20aae9f8a5e3f9..dec5f3d63f341e 100644 --- a/docs/notebooks/segment-anything-2-video-with-output.rst +++ b/docs/notebooks/segment-anything-2-video-with-output.rst @@ -110,18 +110,18 @@ Prerequisites .. parsed-literal:: Note: you may need to restart the kernel to use updated packages. - Requirement already satisfied: iopath>=0.1.10 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (0.1.10) - Requirement already satisfied: pillow>=9.4.0 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (10.4.0) - Requirement already satisfied: hydra-core>=1.3.2 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (1.3.2) - Requirement already satisfied: tqdm in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from iopath>=0.1.10) (4.67.0) - Requirement already satisfied: typing-extensions in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from iopath>=0.1.10) (4.12.2) - Requirement already satisfied: portalocker in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from iopath>=0.1.10) (3.0.0) - Requirement already satisfied: omegaconf<2.4,>=2.2 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from hydra-core>=1.3.2) (2.3.0) - Requirement already satisfied: antlr4-python3-runtime==4.9.* in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from hydra-core>=1.3.2) (4.9.3) - Requirement already satisfied: packaging in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from hydra-core>=1.3.2) (24.2) - Requirement already satisfied: importlib-resources in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from hydra-core>=1.3.2) (6.4.5) - Requirement already satisfied: PyYAML>=5.1.0 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from omegaconf<2.4,>=2.2->hydra-core>=1.3.2) (6.0.2) - 
Requirement already satisfied: zipp>=3.1.0 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from importlib-resources->hydra-core>=1.3.2) (3.20.2) + Requirement already satisfied: iopath>=0.1.10 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (0.1.10) + Requirement already satisfied: pillow>=9.4.0 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (10.4.0) + Requirement already satisfied: hydra-core>=1.3.2 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (1.3.2) + Requirement already satisfied: tqdm in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from iopath>=0.1.10) (4.67.1) + Requirement already satisfied: typing-extensions in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from iopath>=0.1.10) (4.12.2) + Requirement already satisfied: portalocker in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from iopath>=0.1.10) (3.0.0) + Requirement already satisfied: omegaconf<2.4,>=2.2 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from hydra-core>=1.3.2) (2.3.0) + Requirement already satisfied: antlr4-python3-runtime==4.9.* in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from hydra-core>=1.3.2) (4.9.3) + Requirement already satisfied: packaging in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from hydra-core>=1.3.2) (24.2) + Requirement already satisfied: importlib-resources in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from hydra-core>=1.3.2) (6.4.5) + Requirement already satisfied: PyYAML>=5.1.0 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from omegaconf<2.4,>=2.2->hydra-core>=1.3.2) (6.0.2) + Requirement already satisfied: zipp>=3.1.0 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from importlib-resources->hydra-core>=1.3.2) (3.20.2) Note: you may need to restart the kernel to use updated packages. Note: you may need to restart the kernel to use updated packages. @@ -174,7 +174,7 @@ Clone and install segment-anything-2 .. 
parsed-literal:: env: SAM2_BUILD_CUDA=0 - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/notebooks/sam2-video-segmentation/sam2 + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/notebooks/sam2-video-segmentation/sam2 .. parsed-literal:: @@ -203,7 +203,7 @@ Clone and install segment-anything-2 ERROR: Package 'sam-2' requires a different Python: 3.8.10 not in '>=3.10.0' Note: you may need to restart the kernel to use updated packages. - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/notebooks/sam2-video-segmentation + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/notebooks/sam2-video-segmentation .. code:: ipython3 @@ -601,14 +601,14 @@ Prepare data .. parsed-literal:: - data/coco.mp4: 0%| | 0.00/877k [00:00 - + Your browser does not support the video tag. @@ -840,7 +840,7 @@ Example with box .. parsed-literal:: - frame loading (JPEG): 100%|██████████| 50/50 [00:00<00:00, 52.72it/s] + frame loading (JPEG): 100%|██████████| 25/25 [00:00<00:00, 54.66it/s] @@ -877,7 +877,7 @@ Example with box .. parsed-literal:: - propagate in video: 100%|██████████| 50/50 [07:47<00:00, 9.35s/it] + propagate in video: 100%|██████████| 25/25 [03:37<00:00, 8.71s/it] .. code:: ipython3 @@ -894,7 +894,7 @@ Example with box .. raw:: html @@ -927,7 +927,7 @@ Run Interactive For Video Segmentation with Gradio .. parsed-literal:: - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/gradio/analytics.py:106: UserWarning: IMPORTANT: You are using gradio version 4.40.0, however version 4.44.1 is available, please upgrade. + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/gradio/analytics.py:106: UserWarning: IMPORTANT: You are using gradio version 4.40.0, however version 4.44.1 is available, please upgrade. 
-------- warnings.warn( diff --git a/docs/notebooks/segment-anything-2-video-with-output_files/segment-anything-2-video-with-output_40_1.png b/docs/notebooks/segment-anything-2-video-with-output_files/segment-anything-2-video-with-output_40_1.png index 5721f78113b9a5..8b2efbd6f030df 100644 --- a/docs/notebooks/segment-anything-2-video-with-output_files/segment-anything-2-video-with-output_40_1.png +++ b/docs/notebooks/segment-anything-2-video-with-output_files/segment-anything-2-video-with-output_40_1.png @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:dce79554325cf25434872511d2c96b1361ab4a3e14f23a936d227177ee98836f -size 193590 +oid sha256:bdf6f36d230ce5b74e070f0abb2e3672a1ae3f31094c2444a0e0623b95f1bf35 +size 193591 diff --git a/docs/notebooks/segment-anything-2-video-with-output_files/segment-anything-2-video-with-output_46_0.png b/docs/notebooks/segment-anything-2-video-with-output_files/segment-anything-2-video-with-output_46_0.png index e18f213004313f..65df892bd6e8c2 100644 --- a/docs/notebooks/segment-anything-2-video-with-output_files/segment-anything-2-video-with-output_46_0.png +++ b/docs/notebooks/segment-anything-2-video-with-output_files/segment-anything-2-video-with-output_46_0.png @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:00bfe0191dd680f768ea740c00fc4e6d9054e72c250971fc8e12807159a26644 -size 190195 +oid sha256:6b3a974acb951d94d941f150b640a1dcce172f6974085774adbf06e22adeb386 +size 190202 diff --git a/docs/notebooks/siglip-zero-shot-image-classification-with-output.rst b/docs/notebooks/siglip-zero-shot-image-classification-with-output.rst index a38b7c56a2ec8a..a1738642568a2b 100644 --- a/docs/notebooks/siglip-zero-shot-image-classification-with-output.rst +++ b/docs/notebooks/siglip-zero-shot-image-classification-with-output.rst @@ -120,8 +120,8 @@ tokenizer and preparing the images. .. parsed-literal:: - 2024-11-22 04:41:05.723109: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`. - 2024-11-22 04:41:05.748466: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations. + 2024-12-10 05:15:56.596890: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`. + 2024-12-10 05:15:56.621776: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations. To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags. @@ -258,7 +258,7 @@ object ready to load on the device and start making predictions. .. parsed-literal:: [ WARNING ] Please fix your imports. Module %s has been moved to %s. The old module will be deleted in version %s. - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/transformers/modeling_utils.py:5006: FutureWarning: `_is_quantized_training_enabled` is going to be deprecated in transformers 4.39.0. 
Please use `model.hf_quantizer.is_trainable` instead + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/transformers/modeling_utils.py:5006: FutureWarning: `_is_quantized_training_enabled` is going to be deprecated in transformers 4.39.0. Please use `model.hf_quantizer.is_trainable` instead warnings.warn( `loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`. @@ -604,7 +604,7 @@ model are similar to the PyTorch model. .. parsed-literal:: - [{'dog': 0.99}, {'horse': 0.0}, {'cat': 0.0}, {'wolf': 0.0}, {'tiger': 0.0}] + [{'dog': 0.99}, {'horse': 0.0}, {'cat': 0.0}, {'wolf': 0.0}, {'frog': 0.0}] @@ -679,7 +679,7 @@ approximately estimate the speed up of the dynamic quantized models. .. parsed-literal:: - Performance speed up: 2.016 + Performance speed up: 1.907 Interactive inference diff --git a/docs/notebooks/siglip-zero-shot-image-classification-with-output_files/siglip-zero-shot-image-classification-with-output_24_1.png b/docs/notebooks/siglip-zero-shot-image-classification-with-output_files/siglip-zero-shot-image-classification-with-output_24_1.png index 611278a49d1583..6e5afc5acf92a6 100644 --- a/docs/notebooks/siglip-zero-shot-image-classification-with-output_files/siglip-zero-shot-image-classification-with-output_24_1.png +++ b/docs/notebooks/siglip-zero-shot-image-classification-with-output_files/siglip-zero-shot-image-classification-with-output_24_1.png @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2f15546e58fac808ed62a6fcc29f2b58b48a974070a9d8c0b5c199c533b747d0 -size 580998 +oid sha256:3ebc30e695ed16710b909a552137d214ca9defb109984e4da59e8b684ce59427 +size 581000 diff --git a/docs/notebooks/sketch-to-image-pix2pix-turbo-with-output.rst b/docs/notebooks/sketch-to-image-pix2pix-turbo-with-output.rst index e9fcfb3f8baa9f..a5b31e15d97ec2 100644 --- a/docs/notebooks/sketch-to-image-pix2pix-turbo-with-output.rst +++ b/docs/notebooks/sketch-to-image-pix2pix-turbo-with-output.rst @@ -61,8 +61,8 @@ and install required packages. ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts. modelscope-studio 0.5.2 requires gradio<6.0,>=4.0, but you have gradio 3.43.1 which is incompatible. - parler-tts 0.2.1 requires protobuf>=4.0.0, but you have protobuf 3.20.3 which is incompatible. - parler-tts 0.2.1 requires transformers<=4.46.1,>=4.46.1, but you have transformers 4.46.3 which is incompatible. + parler-tts 0.2.2 requires protobuf>=4.0.0, but you have protobuf 3.20.3 which is incompatible. + parler-tts 0.2.2 requires transformers<=4.46.1,>=4.46.1, but you have transformers 4.46.3 which is incompatible. Note: you may need to restart the kernel to use updated packages. @@ -121,7 +121,7 @@ and install required packages. .. parsed-literal:: - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/notebooks/sketch-to-image-pix2pix-turbo/img2img-turbo + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/notebooks/sketch-to-image-pix2pix-turbo/img2img-turbo Load PyTorch model @@ -381,10 +381,10 @@ diagram indicate trainable layers. Semi-transparent layers are frozen. .. parsed-literal:: - 2024-11-22 04:46:27.445712: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. 
You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`. - 2024-11-22 04:46:27.471919: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations. + 2024-12-10 05:21:48.209793: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`. + 2024-12-10 05:21:48.234621: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations. To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags. - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/diffusers/utils/outputs.py:63: FutureWarning: `torch.utils._pytree._register_pytree_node` is deprecated. Please use `torch.utils._pytree.register_pytree_node` instead. + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/diffusers/utils/outputs.py:63: FutureWarning: `torch.utils._pytree._register_pytree_node` is deprecated. Please use `torch.utils._pytree.register_pytree_node` instead. torch.utils._pytree._register_pytree_node( @@ -402,7 +402,7 @@ diagram indicate trainable layers. Semi-transparent layers are frozen. .. parsed-literal:: - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/huggingface_hub/file_download.py:1142: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`. + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/huggingface_hub/file_download.py:1142: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`. warnings.warn( @@ -413,8 +413,8 @@ diagram indicate trainable layers. Semi-transparent layers are frozen. .. parsed-literal:: - 100%|██████████| 525M/525M [18:17<00:00, 478kiB/s] - /tmp/ipykernel_3576883/2531017353.py:172: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. 
We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature. + 100%|██████████| 525M/525M [07:34<00:00, 1.15MiB/s] + /tmp/ipykernel_2241734/2531017353.py:172: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature. sd = torch.load(p_ckpt, map_location="cpu") @@ -473,30 +473,30 @@ on disk using ``ov.save_model`` in compressed to FP16 format. .. parsed-literal:: [ WARNING ] Please fix your imports. Module %s has been moved to %s. The old module will be deleted in version %s. - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/transformers/modeling_utils.py:5006: FutureWarning: `_is_quantized_training_enabled` is going to be deprecated in transformers 4.39.0. Please use `model.hf_quantizer.is_trainable` instead + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/transformers/modeling_utils.py:5006: FutureWarning: `_is_quantized_training_enabled` is going to be deprecated in transformers 4.39.0. Please use `model.hf_quantizer.is_trainable` instead warnings.warn( `loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`. - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/transformers/modeling_attn_mask_utils.py:88: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/transformers/modeling_attn_mask_utils.py:88: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! if input_shape[-1] > 1 or self.sliding_window is not None: - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/transformers/modeling_attn_mask_utils.py:164: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. 
We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/transformers/modeling_attn_mask_utils.py:164: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! if past_key_values_length > 0: - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/diffusers/models/downsampling.py:135: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/diffusers/models/downsampling.py:135: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! assert hidden_states.shape[1] == self.channels - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/diffusers/models/downsampling.py:144: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/diffusers/models/downsampling.py:144: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! assert hidden_states.shape[1] == self.channels - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/diffusers/models/unet_2d_condition.py:915: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/diffusers/models/unet_2d_condition.py:915: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! 
if dim % default_overall_up_factor != 0: - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/diffusers/models/upsampling.py:149: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/diffusers/models/upsampling.py:149: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! assert hidden_states.shape[1] == self.channels - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/diffusers/models/upsampling.py:165: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/diffusers/models/upsampling.py:165: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! if hidden_states.shape[0] >= 64: - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/diffusers/schedulers/scheduling_ddpm.py:433: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/diffusers/schedulers/scheduling_ddpm.py:433: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! if model_output.shape[1] == sample.shape[1] * 2 and self.variance_type in ["learned", "learned_range"]: - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/diffusers/schedulers/scheduling_ddpm.py:440: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! 
+ /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/diffusers/schedulers/scheduling_ddpm.py:440: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! alpha_prod_t_prev = self.alphas_cumprod[prev_t] if prev_t >= 0 else self.one - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/diffusers/schedulers/scheduling_ddpm.py:479: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/diffusers/schedulers/scheduling_ddpm.py:479: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! if t > 0: - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/diffusers/schedulers/scheduling_ddpm.py:330: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/diffusers/schedulers/scheduling_ddpm.py:330: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! alpha_prod_t_prev = self.alphas_cumprod[prev_t] if prev_t >= 0 else self.one @@ -676,17 +676,17 @@ Download results using download button .. parsed-literal:: - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/notebooks/sketch-to-image-pix2pix-turbo/gradio_helper.py:225: GradioDeprecationWarning: 'scale' value should be an integer. Using 0.4 will cause issues. + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/notebooks/sketch-to-image-pix2pix-turbo/gradio_helper.py:225: GradioDeprecationWarning: 'scale' value should be an integer. Using 0.4 will cause issues. with gr.Column(elem_id="column_process", min_width=50, scale=0.4): - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/gradio/utils.py:776: UserWarning: Expected 1 arguments for function . at 0x7f22fbf5a550>, received 0. 
+ /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/gradio/utils.py:776: UserWarning: Expected 1 arguments for function . at 0x7fafe0603c10>, received 0. warnings.warn( - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/gradio/utils.py:780: UserWarning: Expected at least 1 arguments for function . at 0x7f22fbf5a550>, received 0. + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/gradio/utils.py:780: UserWarning: Expected at least 1 arguments for function . at 0x7fafe0603c10>, received 0. warnings.warn( .. parsed-literal:: - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/notebooks/sketch-to-image-pix2pix-turbo + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/notebooks/sketch-to-image-pix2pix-turbo Running on local URL: http://127.0.0.1:7860 To create a public link, set `share=True` in `launch()`. diff --git a/docs/notebooks/sketch-to-image-pix2pix-turbo-with-output_files/sketch-to-image-pix2pix-turbo-with-output_19_0.jpg b/docs/notebooks/sketch-to-image-pix2pix-turbo-with-output_files/sketch-to-image-pix2pix-turbo-with-output_19_0.jpg index 9d8436d4e8894f..a054eb11c32455 100644 --- a/docs/notebooks/sketch-to-image-pix2pix-turbo-with-output_files/sketch-to-image-pix2pix-turbo-with-output_19_0.jpg +++ b/docs/notebooks/sketch-to-image-pix2pix-turbo-with-output_files/sketch-to-image-pix2pix-turbo-with-output_19_0.jpg @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6c28978a7a34769c09d64918686fb69f0239eb9f6499e590a86af16ca1a416d4 -size 23636 +oid sha256:7f92cbd6bb14242b47d354389a04e3413c94c46d233e71b73e305bfb73085a10 +size 23649 diff --git a/docs/notebooks/sketch-to-image-pix2pix-turbo-with-output_files/sketch-to-image-pix2pix-turbo-with-output_19_0.png b/docs/notebooks/sketch-to-image-pix2pix-turbo-with-output_files/sketch-to-image-pix2pix-turbo-with-output_19_0.png index cacdc0c183ea23..336a9ae38fa096 100644 --- a/docs/notebooks/sketch-to-image-pix2pix-turbo-with-output_files/sketch-to-image-pix2pix-turbo-with-output_19_0.png +++ b/docs/notebooks/sketch-to-image-pix2pix-turbo-with-output_files/sketch-to-image-pix2pix-turbo-with-output_19_0.png @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e5acfcf35473541de444c0a2edbfec36423f37335fecf6844a179c65530a6b54 -size 303319 +oid sha256:179009716266de8c220bfe9b7b3d64410061f8ae8bf74a08305655c020cde76f +size 303164 diff --git a/docs/notebooks/sparsity-optimization-with-output.rst b/docs/notebooks/sparsity-optimization-with-output.rst index 8d3779621fb2ec..038a8db6aec1b1 100644 --- a/docs/notebooks/sparsity-optimization-with-output.rst +++ b/docs/notebooks/sparsity-optimization-with-output.rst @@ -82,8 +82,8 @@ Imports .. parsed-literal:: - 2024-11-22 05:06:26.947305: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`. 
- 2024-11-22 05:06:26.972806: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations. + 2024-12-10 05:31:08.167081: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`. + 2024-12-10 05:31:08.192294: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations. To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags. @@ -202,7 +202,7 @@ as an example. It is recommended to tune based on your applications. [ WARNING ] Performance hint was not explicitly specified in command line. Device(CPU) performance hint will be set to PerformanceMode.THROUGHPUT. [Step 4/11] Reading model files [ INFO ] Loading model files - [ INFO ] Read model took 68.94 ms + [ INFO ] Read model took 72.79 ms [ INFO ] Original model I/O parameters: [ INFO ] Model inputs: [ INFO ] input_ids (node: input_ids) : i64 / [...] / [?,?] @@ -213,7 +213,7 @@ as an example. It is recommended to tune based on your applications. [Step 5/11] Resizing model to match image sizes and given batch [ INFO ] Model batch size: 1 [ INFO ] Reshaping model: 'input_ids': [1,64], 'attention_mask': [1,64], 'token_type_ids': [1,64] - [ INFO ] Reshape model took 28.06 ms + [ INFO ] Reshape model took 27.96 ms [Step 6/11] Configuring input of the model [ INFO ] Model inputs: [ INFO ] input_ids (node: input_ids) : i64 / [...] / [1,64] @@ -222,7 +222,7 @@ as an example. It is recommended to tune based on your applications. [ INFO ] Model outputs: [ INFO ] logits (node: logits) : f32 / [...] / [1,2] [Step 7/11] Loading the model to the device - [ INFO ] Compile model took 999.63 ms + [ INFO ] Compile model took 1082.12 ms [Step 8/11] Querying optimal runtime parameters [ INFO ] Model: [ INFO ] NETWORK_NAME: torch_jit @@ -254,17 +254,17 @@ as an example. It is recommended to tune based on your applications. [ INFO ] Fill input 'token_type_ids' with random values [Step 10/11] Measuring performance (Start inference asynchronously, 4 inference requests, limits: 60000 ms duration) [ INFO ] Benchmarking in inference only mode (inputs filling are not included in measurement loop). - [ INFO ] First inference took 27.20 ms + [ INFO ] First inference took 28.08 ms [Step 11/11] Dumping statistics report [ INFO ] Execution Devices:['CPU'] [ INFO ] Count: 9176 iterations - [ INFO ] Duration: 60047.45 ms + [ INFO ] Duration: 60033.51 ms [ INFO ] Latency: [ INFO ] Median: 25.83 ms - [ INFO ] Average: 25.91 ms - [ INFO ] Min: 24.30 ms - [ INFO ] Max: 37.67 ms - [ INFO ] Throughput: 152.81 FPS + [ INFO ] Average: 25.92 ms + [ INFO ] Min: 23.43 ms + [ INFO ] Max: 42.58 ms + [ INFO ] Throughput: 152.85 FPS Benchmark quantized sparse inference performance @@ -321,7 +321,7 @@ for which a layer will be enabled. [ WARNING ] Performance hint was not explicitly specified in command line. Device(CPU) performance hint will be set to PerformanceMode.THROUGHPUT. 
[Step 4/11] Reading model files [ INFO ] Loading model files - [ INFO ] Read model took 71.97 ms + [ INFO ] Read model took 75.90 ms [ INFO ] Original model I/O parameters: [ INFO ] Model inputs: [ INFO ] input_ids (node: input_ids) : i64 / [...] / [?,?] @@ -332,7 +332,7 @@ for which a layer will be enabled. [Step 5/11] Resizing model to match image sizes and given batch [ INFO ] Model batch size: 1 [ INFO ] Reshaping model: 'input_ids': [1,64], 'attention_mask': [1,64], 'token_type_ids': [1,64] - [ INFO ] Reshape model took 28.33 ms + [ INFO ] Reshape model took 28.30 ms [Step 6/11] Configuring input of the model [ INFO ] Model inputs: [ INFO ] input_ids (node: input_ids) : i64 / [...] / [1,64] @@ -341,7 +341,7 @@ for which a layer will be enabled. [ INFO ] Model outputs: [ INFO ] logits (node: logits) : f32 / [...] / [1,2] [Step 7/11] Loading the model to the device - [ INFO ] Compile model took 1001.30 ms + [ INFO ] Compile model took 1011.04 ms [Step 8/11] Querying optimal runtime parameters [ INFO ] Model: [ INFO ] NETWORK_NAME: torch_jit @@ -373,17 +373,17 @@ for which a layer will be enabled. [ INFO ] Fill input 'token_type_ids' with random values [Step 10/11] Measuring performance (Start inference asynchronously, 4 inference requests, limits: 60000 ms duration) [ INFO ] Benchmarking in inference only mode (inputs filling are not included in measurement loop). - [ INFO ] First inference took 28.02 ms + [ INFO ] First inference took 27.34 ms [Step 11/11] Dumping statistics report [ INFO ] Execution Devices:['CPU'] - [ INFO ] Count: 9216 iterations - [ INFO ] Duration: 60030.33 ms + [ INFO ] Count: 9152 iterations + [ INFO ] Duration: 60027.24 ms [ INFO ] Latency: - [ INFO ] Median: 25.92 ms - [ INFO ] Average: 25.94 ms - [ INFO ] Min: 23.04 ms - [ INFO ] Max: 31.17 ms - [ INFO ] Throughput: 153.52 FPS + [ INFO ] Median: 25.91 ms + [ INFO ] Average: 25.97 ms + [ INFO ] Min: 23.89 ms + [ INFO ] Max: 41.37 ms + [ INFO ] Throughput: 152.46 FPS When this might be helpful diff --git a/docs/notebooks/speculative-sampling-with-output.rst b/docs/notebooks/speculative-sampling-with-output.rst index 4d5656cb99645c..868fbe9beccf9e 100644 --- a/docs/notebooks/speculative-sampling-with-output.rst +++ b/docs/notebooks/speculative-sampling-with-output.rst @@ -214,7 +214,23 @@ generation is finished, we will write streamer function. pipe = ov_genai.LLMPipeline(target_model_path, device.value) config = ov_genai.GenerationConfig() - config.max_new_tokens = 100 + config.max_new_tokens = 330 + prompt = ''' + + def prime_fib(n: int): + """ + prime_fib returns n-th number that is a Fibonacci number and it's also prime. + >>> prime_fib(1) + 2 + >>> prime_fib(2) + 3 + >>> prime_fib(3) + 5 + >>> prime_fib(4) + 13 + >>> prime_fib(5) + 89 + """''' def streamer(subword): @@ -225,7 +241,7 @@ generation is finished, we will write streamer function. start_time = time.perf_counter() - pipe.generate(["Sun is yellow because"], config, streamer=streamer) + pipe.generate(prompt, config, streamer=streamer) end_time = time.perf_counter() @@ -239,7 +255,7 @@ generation is finished, we will write streamer function. print(f"Generation time: {end_time - start_time:.2f}s") del pipe - gc.collect(); + gc.collect() .. parsed-literal:: @@ -282,17 +298,19 @@ stops the current token generation iteration is not yet reached. 
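The hunk that follows switches the notebook's pipeline to speculative decoding. For orientation, the complete setup can be sketched from the calls visible in this diff alone; in the sketch below the model directories, the device string, and the streamer body are illustrative assumptions rather than values taken from the notebook:

.. code:: ipython3

    import openvino_genai as ov_genai

    # Placeholder paths: substitute the exported target and draft model folders.
    target_model_path = "target_model_ov"
    draft_model_path = "draft_model_ov"
    device = "CPU"

    def streamer(subword):
        # Print tokens as they arrive; returning False asks the pipeline to continue.
        print(subword, end="", flush=True)
        return False

    # Scheduler used by the continuous-batching backend: the KV cache is sized
    # explicitly through num_kv_blocks instead of the cache_size (GB) shortcut,
    # mirroring the values introduced in the hunk below.
    scheduler_config = ov_genai.SchedulerConfig()
    scheduler_config.cache_size = 0
    scheduler_config.num_kv_blocks = 2048 // 8
    scheduler_config.max_num_batched_tokens = 2048

    # Attach the small assistant (draft) model to the main pipeline.
    draft = ov_genai.draft_model(draft_model_path, device)
    pipe = ov_genai.LLMPipeline(
        target_model_path,
        device,
        draft_model=draft,
        scheduler_config=scheduler_config,
    )

    config = ov_genai.GenerationConfig()
    config.max_new_tokens = 330
    config.num_assistant_tokens = 5  # tokens proposed by the draft model per step

    pipe.generate("def prime_fib(n: int):", config, streamer=streamer)

The hunk itself reads as follows: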
scheduler_config = ov_genai.SchedulerConfig() # cache params - scheduler_config.cache_size = 2 + scheduler_config.cache_size = 0 + scheduler_config.num_kv_blocks = 2048 // 8 + scheduler_config.max_num_batched_tokens = 2048 draft_model = ov_genai.draft_model(draft_model_path, device.value) pipe = ov_genai.LLMPipeline(target_model_path, device.value, draft_model=draft_model, scheduler_config=scheduler_config) config = ov_genai.GenerationConfig() - config.max_new_tokens = 100 - config.num_assistant_tokens = 3 + config.max_new_tokens = 330 + config.num_assistant_tokens = 5 start_time = time.perf_counter() - result = pipe.generate(["Sun is yellow because"], config, streamer=streamer) + result = pipe.generate(prompt, config, streamer=streamer) end_time = time.perf_counter() diff --git a/docs/notebooks/speech-recognition-quantization-wav2vec2-with-output.rst b/docs/notebooks/speech-recognition-quantization-wav2vec2-with-output.rst index 0b9b8db99880b6..27fad907b62fd6 100644 --- a/docs/notebooks/speech-recognition-quantization-wav2vec2-with-output.rst +++ b/docs/notebooks/speech-recognition-quantization-wav2vec2-with-output.rst @@ -57,47 +57,47 @@ Guide =0.11.0 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (1.5.2) - Requirement already satisfied: torch>=2.1.0 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (2.4.1+cpu) - Requirement already satisfied: filelock in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from datasets) (3.16.1) - Requirement already satisfied: numpy>=1.17 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from datasets) (1.24.4) - Requirement already satisfied: pyarrow>=15.0.0 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from datasets) (17.0.0) - Requirement already satisfied: dill<0.3.9,>=0.3.0 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from datasets) (0.3.8) - Requirement already satisfied: pandas in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from datasets) (2.0.3) - Requirement already satisfied: requests>=2.32.2 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from datasets) (2.32.3) - Requirement already satisfied: tqdm>=4.66.3 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from datasets) (4.67.0) - Requirement already satisfied: xxhash in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from datasets) (3.5.0) - Requirement already satisfied: multiprocess in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from datasets) 
(0.70.16) - Requirement already satisfied: fsspec<=2024.6.1,>=2023.1.0 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from fsspec[http]<=2024.6.1,>=2023.1.0->datasets) (2024.6.1) - Requirement already satisfied: aiohttp in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from datasets) (3.10.11) - Requirement already satisfied: huggingface-hub>=0.22.0 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from datasets) (0.25.2) - Requirement already satisfied: packaging in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from datasets) (24.2) - Requirement already satisfied: pyyaml>=5.1 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from datasets) (6.0.2) - Requirement already satisfied: lightning-utilities>=0.8.0 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from torchmetrics>=0.11.0) (0.11.9) - Requirement already satisfied: typing-extensions in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from torchmetrics>=0.11.0) (4.12.2) - Requirement already satisfied: sympy in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from torch>=2.1.0) (1.13.3) - Requirement already satisfied: networkx in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from torch>=2.1.0) (3.1) - Requirement already satisfied: jinja2 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from torch>=2.1.0) (3.1.4) - Requirement already satisfied: aiohappyeyeballs>=2.3.0 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from aiohttp->datasets) (2.4.3) - Requirement already satisfied: aiosignal>=1.1.2 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from aiohttp->datasets) (1.3.1) - Requirement already satisfied: attrs>=17.3.0 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from aiohttp->datasets) (24.2.0) - Requirement already satisfied: frozenlist>=1.1.1 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from aiohttp->datasets) (1.5.0) - Requirement already satisfied: multidict<7.0,>=4.5 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from aiohttp->datasets) (6.1.0) - Requirement 
already satisfied: yarl<2.0,>=1.12.0 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from aiohttp->datasets) (1.15.2) - Requirement already satisfied: async-timeout<6.0,>=4.0 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from aiohttp->datasets) (5.0.1) - Requirement already satisfied: setuptools in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from lightning-utilities>=0.8.0->torchmetrics>=0.11.0) (75.3.0) - Requirement already satisfied: charset-normalizer<4,>=2 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from requests>=2.32.2->datasets) (3.4.0) - Requirement already satisfied: idna<4,>=2.5 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from requests>=2.32.2->datasets) (3.10) - Requirement already satisfied: urllib3<3,>=1.21.1 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from requests>=2.32.2->datasets) (2.2.3) - Requirement already satisfied: certifi>=2017.4.17 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from requests>=2.32.2->datasets) (2024.8.30) - Requirement already satisfied: MarkupSafe>=2.0 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from jinja2->torch>=2.1.0) (2.1.5) - Requirement already satisfied: python-dateutil>=2.8.2 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from pandas->datasets) (2.9.0.post0) - Requirement already satisfied: pytz>=2020.1 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from pandas->datasets) (2024.2) - Requirement already satisfied: tzdata>=2022.1 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from pandas->datasets) (2024.2) - Requirement already satisfied: mpmath<1.4,>=1.1.0 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from sympy->torch>=2.1.0) (1.3.0) - Requirement already satisfied: six>=1.5 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from python-dateutil>=2.8.2->pandas->datasets) (1.16.0) - Requirement already satisfied: propcache>=0.2.0 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from yarl<2.0,>=1.12.0->aiohttp->datasets) (0.2.0) + Requirement already satisfied: datasets in 
/opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (3.0.0) + Requirement already satisfied: torchmetrics>=0.11.0 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (1.5.2) + Requirement already satisfied: torch>=2.1.0 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (2.4.1+cpu) + Requirement already satisfied: filelock in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from datasets) (3.16.1) + Requirement already satisfied: numpy>=1.17 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from datasets) (1.24.4) + Requirement already satisfied: pyarrow>=15.0.0 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from datasets) (17.0.0) + Requirement already satisfied: dill<0.3.9,>=0.3.0 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from datasets) (0.3.8) + Requirement already satisfied: pandas in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from datasets) (2.0.3) + Requirement already satisfied: requests>=2.32.2 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from datasets) (2.32.3) + Requirement already satisfied: tqdm>=4.66.3 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from datasets) (4.67.1) + Requirement already satisfied: xxhash in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from datasets) (3.5.0) + Requirement already satisfied: multiprocess in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from datasets) (0.70.16) + Requirement already satisfied: fsspec<=2024.6.1,>=2023.1.0 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from fsspec[http]<=2024.6.1,>=2023.1.0->datasets) (2024.6.1) + Requirement already satisfied: aiohttp in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from datasets) (3.10.11) + Requirement already satisfied: huggingface-hub>=0.22.0 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from datasets) (0.25.2) + Requirement already satisfied: packaging in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from datasets) 
(24.2) + Requirement already satisfied: pyyaml>=5.1 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from datasets) (6.0.2) + Requirement already satisfied: lightning-utilities>=0.8.0 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from torchmetrics>=0.11.0) (0.11.9) + Requirement already satisfied: typing-extensions in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from torchmetrics>=0.11.0) (4.12.2) + Requirement already satisfied: sympy in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from torch>=2.1.0) (1.13.3) + Requirement already satisfied: networkx in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from torch>=2.1.0) (3.1) + Requirement already satisfied: jinja2 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from torch>=2.1.0) (3.1.4) + Requirement already satisfied: aiohappyeyeballs>=2.3.0 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from aiohttp->datasets) (2.4.4) + Requirement already satisfied: aiosignal>=1.1.2 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from aiohttp->datasets) (1.3.1) + Requirement already satisfied: attrs>=17.3.0 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from aiohttp->datasets) (24.2.0) + Requirement already satisfied: frozenlist>=1.1.1 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from aiohttp->datasets) (1.5.0) + Requirement already satisfied: multidict<7.0,>=4.5 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from aiohttp->datasets) (6.1.0) + Requirement already satisfied: yarl<2.0,>=1.12.0 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from aiohttp->datasets) (1.15.2) + Requirement already satisfied: async-timeout<6.0,>=4.0 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from aiohttp->datasets) (5.0.1) + Requirement already satisfied: setuptools in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from lightning-utilities>=0.8.0->torchmetrics>=0.11.0) (75.3.0) + Requirement already satisfied: charset-normalizer<4,>=2 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from 
requests>=2.32.2->datasets) (3.4.0) + Requirement already satisfied: idna<4,>=2.5 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from requests>=2.32.2->datasets) (3.10) + Requirement already satisfied: urllib3<3,>=1.21.1 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from requests>=2.32.2->datasets) (2.2.3) + Requirement already satisfied: certifi>=2017.4.17 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from requests>=2.32.2->datasets) (2024.8.30) + Requirement already satisfied: MarkupSafe>=2.0 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from jinja2->torch>=2.1.0) (2.1.5) + Requirement already satisfied: python-dateutil>=2.8.2 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from pandas->datasets) (2.9.0.post0) + Requirement already satisfied: pytz>=2020.1 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from pandas->datasets) (2024.2) + Requirement already satisfied: tzdata>=2022.1 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from pandas->datasets) (2024.2) + Requirement already satisfied: mpmath<1.4,>=1.1.0 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from sympy->torch>=2.1.0) (1.3.0) + Requirement already satisfied: six>=1.5 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from python-dateutil>=2.8.2->pandas->datasets) (1.17.0) + Requirement already satisfied: propcache>=0.2.0 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from yarl<2.0,>=1.12.0->aiohttp->datasets) (0.2.0) Note: you may need to restart the kernel to use updated packages. Note: you may need to restart the kernel to use updated packages. @@ -119,8 +119,8 @@ Imports .. parsed-literal:: - 2024-11-22 05:08:52.722966: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`. - 2024-11-22 05:08:52.748262: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations. + 2024-12-10 05:33:33.150578: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`. 
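The TensorFlow startup message above notes that the oneDNN custom operations can be disabled to obtain order-independent numerical results. A minimal sketch, assuming the variable is set before TensorFlow is first imported (it is read at import time):

.. code:: ipython3

    import os

    # Must be set before the first "import tensorflow" in the process.
    os.environ["TF_ENABLE_ONEDNN_OPTS"] = "0"

    import tensorflow as tf  # noqa: E402  (import intentionally after the env change)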
+ 2024-12-10 05:33:33.175323: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations. To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags. @@ -177,10 +177,10 @@ IR). .. parsed-literal:: [ WARNING ] Please fix your imports. Module %s has been moved to %s. The old module will be deleted in version %s. - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/transformers/modeling_utils.py:5006: FutureWarning: `_is_quantized_training_enabled` is going to be deprecated in transformers 4.39.0. Please use `model.hf_quantizer.is_trainable` instead + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/transformers/modeling_utils.py:5006: FutureWarning: `_is_quantized_training_enabled` is going to be deprecated in transformers 4.39.0. Please use `model.hf_quantizer.is_trainable` instead warnings.warn( `loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`. - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/transformers/models/wav2vec2/modeling_wav2vec2.py:872: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/transformers/models/wav2vec2/modeling_wav2vec2.py:872: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! if attn_output.size() != (bsz, self.num_heads, tgt_len, self.head_dim): @@ -507,7 +507,7 @@ quantized model. .. parsed-literal:: - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/torchmetrics/utilities/prints.py:62: FutureWarning: Importing `WordErrorRate` from `torchmetrics` was deprecated and will be removed in 2.0. Import `WordErrorRate` from `torchmetrics.text` instead. + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/torchmetrics/utilities/prints.py:62: FutureWarning: Importing `WordErrorRate` from `torchmetrics` was deprecated and will be removed in 2.0. Import `WordErrorRate` from `torchmetrics.text` instead. _future_warning( @@ -577,7 +577,7 @@ models. [ WARNING ] Performance hint was not explicitly specified in command line. Device(AUTO) performance hint will be set to PerformanceMode.THROUGHPUT. [Step 4/11] Reading model files [ INFO ] Loading model files - [ INFO ] Read model took 18.23 ms + [ INFO ] Read model took 17.68 ms [ INFO ] Original model I/O parameters: [ INFO ] Model inputs: [ INFO ] input_values , 45 (node: input_values) : f32 / [...] / [?,?] 
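The benchmark_app output around this point reads the wav2vec2 IR, reshapes its dynamic ``[?,?]`` input to the static ``[1,30480]`` shape and compiles it for the AUTO device (see the steps just below). The same preparation can be reproduced with the OpenVINO Python API; a sketch under the assumption that the IR is saved as ``quantized_wav2vec2.xml`` (the file name is a placeholder):

.. code:: ipython3

    import openvino as ov

    core = ov.Core()
    model = core.read_model("quantized_wav2vec2.xml")  # placeholder IR path

    # Equivalent of Step 5: fix the dynamic input to the benchmarked static shape.
    # The input can be addressed by any of its tensor names ("input_values" / "45").
    model.reshape({"input_values": [1, 30480]})

    # Equivalent of Step 7: compile for AUTO with a throughput-oriented hint.
    compiled = core.compile_model(model, "AUTO", {"PERFORMANCE_HINT": "THROUGHPUT"})
    print(compiled.input(0).shape)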
@@ -586,14 +586,14 @@ models. [Step 5/11] Resizing model to match image sizes and given batch [ INFO ] Model batch size: 1 [ INFO ] Reshaping model: '45': [1,30480] - [ INFO ] Reshape model took 4.39 ms + [ INFO ] Reshape model took 4.15 ms [Step 6/11] Configuring input of the model [ INFO ] Model inputs: [ INFO ] input_values , 45 (node: input_values) : f32 / [...] / [1,30480] [ INFO ] Model outputs: [ INFO ] logits (node: __module.lm_head/aten::linear/Add) : f32 / [...] / [1,95,32] [Step 7/11] Loading the model to the device - [ INFO ] Compile model took 493.97 ms + [ INFO ] Compile model took 492.85 ms [Step 8/11] Querying optimal runtime parameters [ INFO ] Model: [ INFO ] NETWORK_NAME: Model0 @@ -630,17 +630,17 @@ models. [ INFO ] Fill input '45' with random values [Step 10/11] Measuring performance (Start inference asynchronously, 6 inference requests, limits: 120000 ms duration) [ INFO ] Benchmarking in inference only mode (inputs filling are not included in measurement loop). - [ INFO ] First inference took 69.21 ms + [ INFO ] First inference took 70.68 ms [Step 11/11] Dumping statistics report [ INFO ] Execution Devices:['CPU'] - [ INFO ] Count: 5430 iterations - [ INFO ] Duration: 120128.75 ms + [ INFO ] Count: 5424 iterations + [ INFO ] Duration: 120129.30 ms [ INFO ] Latency: - [ INFO ] Median: 130.74 ms - [ INFO ] Average: 132.58 ms - [ INFO ] Min: 66.32 ms - [ INFO ] Max: 307.29 ms - [ INFO ] Throughput: 45.20 FPS + [ INFO ] Median: 130.69 ms + [ INFO ] Average: 132.71 ms + [ INFO ] Min: 66.95 ms + [ INFO ] Max: 336.57 ms + [ INFO ] Throughput: 45.15 FPS .. code:: ipython3 @@ -667,7 +667,7 @@ models. [ WARNING ] Performance hint was not explicitly specified in command line. Device(AUTO) performance hint will be set to PerformanceMode.THROUGHPUT. [Step 4/11] Reading model files [ INFO ] Loading model files - [ INFO ] Read model took 25.21 ms + [ INFO ] Read model took 24.12 ms [ INFO ] Original model I/O parameters: [ INFO ] Model inputs: [ INFO ] input_values , 45 (node: input_values) : f32 / [...] / [?,?] @@ -676,14 +676,14 @@ models. [Step 5/11] Resizing model to match image sizes and given batch [ INFO ] Model batch size: 1 [ INFO ] Reshaping model: '45': [1,30480] - [ INFO ] Reshape model took 6.04 ms + [ INFO ] Reshape model took 6.07 ms [Step 6/11] Configuring input of the model [ INFO ] Model inputs: [ INFO ] input_values , 45 (node: input_values) : f32 / [...] / [1,30480] [ INFO ] Model outputs: [ INFO ] logits (node: __module.lm_head/aten::linear/Add) : f32 / [...] / [1,95,32] [Step 7/11] Loading the model to the device - [ INFO ] Compile model took 1188.53 ms + [ INFO ] Compile model took 1216.49 ms [Step 8/11] Querying optimal runtime parameters [ INFO ] Model: [ INFO ] NETWORK_NAME: Model0 @@ -720,15 +720,15 @@ models. [ INFO ] Fill input '45' with random values [Step 10/11] Measuring performance (Start inference asynchronously, 6 inference requests, limits: 120000 ms duration) [ INFO ] Benchmarking in inference only mode (inputs filling are not included in measurement loop). 
- [ INFO ] First inference took 55.48 ms + [ INFO ] First inference took 54.72 ms [Step 11/11] Dumping statistics report [ INFO ] Execution Devices:['CPU'] - [ INFO ] Count: 8046 iterations - [ INFO ] Duration: 120134.14 ms + [ INFO ] Count: 8016 iterations + [ INFO ] Duration: 120075.42 ms [ INFO ] Latency: - [ INFO ] Median: 88.11 ms - [ INFO ] Average: 89.43 ms - [ INFO ] Min: 71.74 ms - [ INFO ] Max: 270.18 ms - [ INFO ] Throughput: 66.98 FPS + [ INFO ] Median: 88.25 ms + [ INFO ] Average: 89.73 ms + [ INFO ] Min: 39.47 ms + [ INFO ] Max: 249.83 ms + [ INFO ] Throughput: 66.76 FPS diff --git a/docs/notebooks/speechbrain-emotion-recognition-with-output.rst b/docs/notebooks/speechbrain-emotion-recognition-with-output.rst index 23857ad92d4fa2..0f2b2a55f67169 100644 --- a/docs/notebooks/speechbrain-emotion-recognition-with-output.rst +++ b/docs/notebooks/speechbrain-emotion-recognition-with-output.rst @@ -63,9 +63,9 @@ Installations detectron2 0.6 requires iopath<0.1.10,>=0.1.7, but you have iopath 0.1.10 which is incompatible. mobileclip 0.1.0 requires torchvision==0.14.1, but you have torchvision 0.19.1+cpu which is incompatible. modelscope-studio 0.5.2 requires gradio<6.0,>=4.0, but you have gradio 3.43.1 which is incompatible. - parler-tts 0.2.1 requires protobuf>=4.0.0, but you have protobuf 3.20.3 which is incompatible. - parler-tts 0.2.1 requires transformers<=4.46.1,>=4.46.1, but you have transformers 4.46.3 which is incompatible. - pydantic 2.10.0 requires typing-extensions>=4.12.2, but you have typing-extensions 4.9.0 which is incompatible. + parler-tts 0.2.2 requires protobuf>=4.0.0, but you have protobuf 3.20.3 which is incompatible. + parler-tts 0.2.2 requires transformers<=4.46.1,>=4.46.1, but you have transformers 4.46.3 which is incompatible. + pydantic 2.10.3 requires typing-extensions>=4.12.2, but you have typing-extensions 4.9.0 which is incompatible. tensorflow 2.12.0 requires keras<2.13,>=2.12.0, but you have keras 2.13.1 which is incompatible. tensorflow 2.12.0 requires numpy<1.24,>=1.22, but you have numpy 1.24.4 which is incompatible. tensorflow 2.12.0 requires tensorboard<2.13,>=2.12, but you have tensorboard 2.13.0 which is incompatible. @@ -95,7 +95,7 @@ Imports .. parsed-literal:: - INFO:speechbrain.utils.quirks:Applied quirks (see `speechbrain.utils.quirks`): [allow_tf32, disable_jit_profiling] + INFO:speechbrain.utils.quirks:Applied quirks (see `speechbrain.utils.quirks`): [disable_jit_profiling, allow_tf32] INFO:speechbrain.utils.quirks:Excluded quirks specified by the `SB_DISABLE_QUIRKS` environment (comma-separated list): [] @@ -135,8 +135,8 @@ SpeechBrain codebase. INFO:speechbrain.utils.fetching:Fetch hyperparams.yaml: Fetching from HuggingFace Hub 'speechbrain/emotion-recognition-wav2vec2-IEMOCAP' if not cached INFO:speechbrain.utils.fetching:Fetch custom_interface.py: Fetching from HuggingFace Hub 'speechbrain/emotion-recognition-wav2vec2-IEMOCAP' if not cached - 2024-11-22 05:15:27.494190: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`. - 2024-11-22 05:15:27.518517: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations. + 2024-12-10 05:40:05.072169: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. 
You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`. + 2024-12-10 05:40:05.097896: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations. To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags. @@ -148,7 +148,7 @@ SpeechBrain codebase. .. parsed-literal:: - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/transformers/configuration_utils.py:306: UserWarning: Passing `gradient_checkpointing` to a config initialization is deprecated and will be removed in v5 Transformers. Using `model.gradient_checkpointing_enable()` instead, or if you are using the `Trainer` API, pass `gradient_checkpointing=True` in your `TrainingArguments`. + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/transformers/configuration_utils.py:306: UserWarning: Passing `gradient_checkpointing` to a config initialization is deprecated and will be removed in v5 Transformers. Using `model.gradient_checkpointing_enable()` instead, or if you are using the `Trainer` API, pass `gradient_checkpointing=True` in your `TrainingArguments`. warnings.warn( @@ -175,7 +175,7 @@ SpeechBrain codebase. INFO:speechbrain.utils.fetching:Fetch model.ckpt: Fetching from HuggingFace Hub 'speechbrain/emotion-recognition-wav2vec2-IEMOCAP' if not cached INFO:speechbrain.utils.fetching:Fetch label_encoder.txt: Fetching from HuggingFace Hub 'speechbrain/emotion-recognition-wav2vec2-IEMOCAP' if not cached INFO:speechbrain.utils.parameter_transfer:Loading pretrained files for: wav2vec2, model, label_encoder - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/speechbrain/utils/checkpoints.py:200: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature. + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/speechbrain/utils/checkpoints.py:200: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. 
It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature. state_dict = torch.load(path, map_location=device) @@ -263,13 +263,19 @@ Step 2: Convert model to OpenVINO IR .. parsed-literal:: - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/transformers/modeling_utils.py:5006: FutureWarning: `_is_quantized_training_enabled` is going to be deprecated in transformers 4.39.0. Please use `model.hf_quantizer.is_trainable` instead + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/transformers/modeling_utils.py:5006: FutureWarning: `_is_quantized_training_enabled` is going to be deprecated in transformers 4.39.0. Please use `model.hf_quantizer.is_trainable` instead warnings.warn( `loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`. - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/transformers/models/wav2vec2/modeling_wav2vec2.py:872: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/transformers/models/wav2vec2/modeling_wav2vec2.py:872: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! if attn_output.size() != (bsz, self.num_heads, tgt_len, self.head_dim): + +.. parsed-literal:: + + model.safetensors: 0%| | 0.00/380M [00:00 1 or self.sliding_window is not None: - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/transformers/modeling_attn_mask_utils.py:164: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/transformers/modeling_attn_mask_utils.py:164: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. 
We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! if past_key_values_length > 0: @@ -306,7 +306,7 @@ here, we always use fixed shapes in conversion by using an .. parsed-literal:: - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/diffusers/models/unets/unet_stable_cascade.py:548: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/diffusers/models/unets/unet_stable_cascade.py:548: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! if skip is not None and (x.size(-1) != skip.size(-1) or x.size(-2) != skip.size(-2)): diff --git a/docs/notebooks/stable-cascade-image-generation-with-output_files/stable-cascade-image-generation-with-output_29_2.jpg b/docs/notebooks/stable-cascade-image-generation-with-output_files/stable-cascade-image-generation-with-output_29_2.jpg index c26f6d2e4e6256..a09f1e5356f98d 100644 --- a/docs/notebooks/stable-cascade-image-generation-with-output_files/stable-cascade-image-generation-with-output_29_2.jpg +++ b/docs/notebooks/stable-cascade-image-generation-with-output_files/stable-cascade-image-generation-with-output_29_2.jpg @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8f1d0c9a1548ea1728d293d5e9864b85f3f438666fb647d5d98ce4a08bd9d494 -size 81141 +oid sha256:a7c11f26f9dc1eb3286c357bb147d12c812786f1796a8b62a5012075afe6de12 +size 83987 diff --git a/docs/notebooks/stable-cascade-image-generation-with-output_files/stable-cascade-image-generation-with-output_29_2.png b/docs/notebooks/stable-cascade-image-generation-with-output_files/stable-cascade-image-generation-with-output_29_2.png index 8d36ff65c9eca3..eece770ac13fce 100644 --- a/docs/notebooks/stable-cascade-image-generation-with-output_files/stable-cascade-image-generation-with-output_29_2.png +++ b/docs/notebooks/stable-cascade-image-generation-with-output_files/stable-cascade-image-generation-with-output_29_2.png @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a807ec75efd8572779d2c5de64bec882d23a29b52449e0a2df13fb67b527beae -size 1575960 +oid sha256:a32e4037dd5a34d227f3ef5a892121797617a3becd465227678a6ef6d7f8a090 +size 1608106 diff --git a/docs/notebooks/stable-cascade-image-generation-with-output_files/stable-cascade-image-generation-with-output_8_2.jpg b/docs/notebooks/stable-cascade-image-generation-with-output_files/stable-cascade-image-generation-with-output_8_2.jpg new file mode 100644 index 00000000000000..57b41a7f8d9bbe --- /dev/null +++ b/docs/notebooks/stable-cascade-image-generation-with-output_files/stable-cascade-image-generation-with-output_8_2.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bb107211ea8c2d5b7f376c7896193df1b1b5c4b1ba4014e10734d5401848fada +size 92085 diff --git 
a/docs/notebooks/stable-cascade-image-generation-with-output_files/stable-cascade-image-generation-with-output_8_2.png b/docs/notebooks/stable-cascade-image-generation-with-output_files/stable-cascade-image-generation-with-output_8_2.png new file mode 100644 index 00000000000000..e718da40df51ae --- /dev/null +++ b/docs/notebooks/stable-cascade-image-generation-with-output_files/stable-cascade-image-generation-with-output_8_2.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:87b04c8ee319ce2d23bd4cf76666af100b2852bb0dd4ba558d978698f871f581 +size 1591012 diff --git a/docs/notebooks/stable-diffusion-ip-adapter-with-output.rst b/docs/notebooks/stable-diffusion-ip-adapter-with-output.rst index 7f23c866161568..d5cbb62354f4fc 100644 --- a/docs/notebooks/stable-diffusion-ip-adapter-with-output.rst +++ b/docs/notebooks/stable-diffusion-ip-adapter-with-output.rst @@ -193,8 +193,8 @@ Additionally, LCM requires using LCMScheduler for efficient generation. .. parsed-literal:: - 2024-11-22 05:28:32.243878: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`. - 2024-11-22 05:28:32.268737: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations. + 2024-12-10 05:53:08.894939: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`. + 2024-12-10 05:53:08.920444: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations. To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags. @@ -206,7 +206,7 @@ Additionally, LCM requires using LCMScheduler for efficient generation. .. parsed-literal:: - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/transformers/models/clip/feature_extraction_clip.py:28: FutureWarning: The class CLIPFeatureExtractor is deprecated and will be removed in version 5 of Transformers. Please use CLIPImageProcessor instead. + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/transformers/models/clip/feature_extraction_clip.py:28: FutureWarning: The class CLIPFeatureExtractor is deprecated and will be removed in version 5 of Transformers. Please use CLIPImageProcessor instead. warnings.warn( @@ -288,10 +288,10 @@ extractor as input and returns image embeddings. .. parsed-literal:: [ WARNING ] Please fix your imports. Module %s has been moved to %s. The old module will be deleted in version %s. - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/transformers/modeling_utils.py:5006: FutureWarning: `_is_quantized_training_enabled` is going to be deprecated in transformers 4.39.0. 
Please use `model.hf_quantizer.is_trainable` instead + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/transformers/modeling_utils.py:5006: FutureWarning: `_is_quantized_training_enabled` is going to be deprecated in transformers 4.39.0. Please use `model.hf_quantizer.is_trainable` instead warnings.warn( `loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`. - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/transformers/models/clip/modeling_clip.py:243: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/transformers/models/clip/modeling_clip.py:243: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! if not interpolate_pos_encoding and (height != self.image_size or width != self.image_size): @@ -353,17 +353,17 @@ Model predicts the ``sample`` state for the next step. .. parsed-literal:: - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/diffusers/models/unets/unet_2d_condition.py:1111: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/diffusers/models/unets/unet_2d_condition.py:1111: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! if dim % default_overall_up_factor != 0: - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/diffusers/models/embeddings.py:1801: FutureWarning: You have passed a tensor as `image_embeds`.This is deprecated and will be removed in a future release. Please make sure to update your script to pass `image_embeds` as a list of tensors to suppress this warning. + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/diffusers/models/embeddings.py:1801: FutureWarning: You have passed a tensor as `image_embeds`.This is deprecated and will be removed in a future release. Please make sure to update your script to pass `image_embeds` as a list of tensors to suppress this warning. 
deprecate("image_embeds not a list", "1.0.0", deprecation_message, standard_warn=False) - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/diffusers/models/downsampling.py:136: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/diffusers/models/downsampling.py:136: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! assert hidden_states.shape[1] == self.channels - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/diffusers/models/downsampling.py:145: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/diffusers/models/downsampling.py:145: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! assert hidden_states.shape[1] == self.channels - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/diffusers/models/upsampling.py:147: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/diffusers/models/upsampling.py:147: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! assert hidden_states.shape[1] == self.channels - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/diffusers/models/upsampling.py:162: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! 
+ /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/diffusers/models/upsampling.py:162: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! if hidden_states.shape[0] >= 64: @@ -441,16 +441,16 @@ image in pipeline, we can discuss it in inference examples. .. parsed-literal:: - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/torch/jit/_trace.py:1303: TracerWarning: Trace had nondeterministic nodes. Did you forget call .eval() on your model? Nodes: - %2506 : Float(1, 4, 64, 64, strides=[16384, 4096, 64, 1], requires_grad=0, device=cpu) = aten::randn(%2500, %2501, %2502, %2503, %2504, %2505) # /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/diffusers/utils/torch_utils.py:81:0 + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/torch/jit/_trace.py:1303: TracerWarning: Trace had nondeterministic nodes. Did you forget call .eval() on your model? Nodes: + %2506 : Float(1, 4, 64, 64, strides=[16384, 4096, 64, 1], requires_grad=0, device=cpu) = aten::randn(%2500, %2501, %2502, %2503, %2504, %2505) # /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/diffusers/utils/torch_utils.py:81:0 This may cause errors in trace checking. To disable trace checking, pass check_trace=False to torch.jit.trace() _check_trace( - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/torch/jit/_trace.py:1303: TracerWarning: Output nr 1. of the traced function does not match the corresponding output of the Python function. Detailed error: + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/torch/jit/_trace.py:1303: TracerWarning: Output nr 1. of the traced function does not match the corresponding output of the Python function. Detailed error: Tensor-likes are not close! - Mismatched elements: 10391 / 16384 (63.4%) - Greatest absolute difference: 0.000982522964477539 at index (0, 1, 0, 60) (up to 1e-05 allowed) - Greatest relative difference: 0.014704568038430557 at index (0, 3, 63, 59) (up to 1e-05 allowed) + Mismatched elements: 10463 / 16384 (63.9%) + Greatest absolute difference: 0.001137852668762207 at index (0, 2, 0, 6) (up to 1e-05 allowed) + Greatest relative difference: 0.006470232386295268 at index (0, 3, 63, 59) (up to 1e-05 allowed) _check_trace( @@ -496,9 +496,9 @@ hidden states. .. parsed-literal:: - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/transformers/modeling_attn_mask_utils.py:88: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. 
This means that the trace might not generalize to other inputs! + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/transformers/modeling_attn_mask_utils.py:88: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! if input_shape[-1] > 1 or self.sliding_window is not None: - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/transformers/modeling_attn_mask_utils.py:164: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/transformers/modeling_attn_mask_utils.py:164: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! if past_key_values_length > 0: diff --git a/docs/notebooks/stable-diffusion-ip-adapter-with-output_files/stable-diffusion-ip-adapter-with-output_22_1.png b/docs/notebooks/stable-diffusion-ip-adapter-with-output_files/stable-diffusion-ip-adapter-with-output_22_1.png index c5cde5597bba55..475b4dd8ea40b4 100644 --- a/docs/notebooks/stable-diffusion-ip-adapter-with-output_files/stable-diffusion-ip-adapter-with-output_22_1.png +++ b/docs/notebooks/stable-diffusion-ip-adapter-with-output_files/stable-diffusion-ip-adapter-with-output_22_1.png @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:541f65736f11c59692c577b1d85c0f6b9ab6ab51e8a9fdf6abf15063d06e2036 -size 965452 +oid sha256:f41e9dd669351422cfb30a6a5458431b699453f0934b43e199a0d4684dd4da85 +size 975310 diff --git a/docs/notebooks/stable-diffusion-ip-adapter-with-output_files/stable-diffusion-ip-adapter-with-output_25_0.png b/docs/notebooks/stable-diffusion-ip-adapter-with-output_files/stable-diffusion-ip-adapter-with-output_25_0.png index 61d61f6001a527..ba0e885cf44c5a 100644 --- a/docs/notebooks/stable-diffusion-ip-adapter-with-output_files/stable-diffusion-ip-adapter-with-output_25_0.png +++ b/docs/notebooks/stable-diffusion-ip-adapter-with-output_files/stable-diffusion-ip-adapter-with-output_25_0.png @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6162787bd52816b379097b9ec5284c2b65dc1178be5be7936240895f9de5285b -size 956477 +oid sha256:c5f0746a06f6d81be16e808107174009b68510b2e826885fe3f78021079b2a12 +size 945107 diff --git a/docs/notebooks/stable-diffusion-ip-adapter-with-output_files/stable-diffusion-ip-adapter-with-output_28_0.png b/docs/notebooks/stable-diffusion-ip-adapter-with-output_files/stable-diffusion-ip-adapter-with-output_28_0.png index 937356ce2c1a55..baae1d818321e1 100644 --- a/docs/notebooks/stable-diffusion-ip-adapter-with-output_files/stable-diffusion-ip-adapter-with-output_28_0.png +++ 
b/docs/notebooks/stable-diffusion-ip-adapter-with-output_files/stable-diffusion-ip-adapter-with-output_28_0.png @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d9fe298508ac791d7d0901af522504878a7fd98675bad80cfd4b60cafd2a49c0 -size 592390 +oid sha256:76b9fd3519e90a6fa4b39a5749633ffc0031a5141f3698920b724205b304e9f3 +size 595645 diff --git a/docs/notebooks/stable-diffusion-torchdynamo-backend-with-output.rst b/docs/notebooks/stable-diffusion-torchdynamo-backend-with-output.rst index e03a4ab614c769..a50a47392fb029 100644 --- a/docs/notebooks/stable-diffusion-torchdynamo-backend-with-output.rst +++ b/docs/notebooks/stable-diffusion-torchdynamo-backend-with-output.rst @@ -114,15 +114,18 @@ script. It speeds up PyTorch code by JIT-compiling it into optimized kernels. By default, Torch code runs in eager-mode, but with the use of torch.compile it goes through the following steps: -1. Graph acquisition - the model is rewritten as blocks of subgraphs that are either: +1. Graph acquisition + - the model is rewritten as blocks of subgraphs that are either: - compiled by TorchDynamo and “flattened”, - - falling back to the eager-mode, due to unsupported Python constructs (like control-flow + - falling back to the + eager-mode, due to unsupported Python constructs (like control-flow code). 2. Graph lowering - all PyTorch operations are decomposed into their constituent kernels specific to the chosen backend. -3. Graph compilation - the kernels call their corresponding low-level +3. Graph + compilation - the kernels call their corresponding low-level device-specific operations. Select device for inference and enable or disable saving the optimized diff --git a/docs/notebooks/stable-diffusion-v3-torch-fx-with-output.rst b/docs/notebooks/stable-diffusion-v3-torch-fx-with-output.rst new file mode 100644 index 00000000000000..2eee517599af7c --- /dev/null +++ b/docs/notebooks/stable-diffusion-v3-torch-fx-with-output.rst @@ -0,0 +1,562 @@ +Image generation with Torch.FX Stable Diffusion v3 and OpenVINO +=============================================================== + +Stable Diffusion V3 is the next generation of the latent diffusion Stable +Diffusion model family that outperforms state-of-the-art text-to-image +generation systems in typography and prompt adherence, based on human +preference evaluations. In comparison with previous versions, it is based +on the Multimodal Diffusion Transformer (MMDiT) text-to-image model, which +features greatly improved performance in image quality, typography, +complex prompt understanding, and resource-efficiency. + +.. figure:: https://github.com/openvinotoolkit/openvino_notebooks/assets/29454499/dd079427-89f2-4d28-a10e-c80792d750bf + :alt: mmdit.png + + mmdit.png + +More details about the model can be found in `model +card `__, +`research +paper `__ +and `Stability.AI blog +post `__. In this +tutorial, we will demonstrate how to optimize Stable Diffusion 3 in a Torch +FX representation using +`NNCF `__ for model +optimization. Additionally, we will accelerate the pipeline further by +running with torch.compile using the openvino backend. If you want to +run previous Stable Diffusion versions, please check our other +notebooks: + +- `Stable Diffusion `__ +- `Stable Diffusion v2 `__ +- `Stable Diffusion v3 `__ +- `Stable Diffusion XL `__ +- `LCM Stable + Diffusion `__ +- `Turbo SDXL `__ +- `Turbo SD `__ + +Installation Instructions +~~~~~~~~~~~~~~~~~~~~~~~~~ + +This is a self-contained example that relies solely on its own code.
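Before the setup steps, the following minimal sketch illustrates the torch.compile flow referenced above (graph acquisition, lowering, and compilation) with the ``openvino`` backend. It is an illustrative snippet added for clarity, not part of the original notebook: the toy module and input shape are arbitrary, and it assumes the ``openvino`` package is installed so that ``import openvino.torch`` registers the backend.

.. code:: ipython3

    import torch
    import openvino.torch  # noqa: F401 - registers the "openvino" backend for torch.compile


    class ToyModel(torch.nn.Module):
        def __init__(self):
            super().__init__()
            self.linear = torch.nn.Linear(16, 4)

        def forward(self, x):
            return torch.relu(self.linear(x))


    model = ToyModel().eval()

    # TorchDynamo captures the graph, lowers it to kernels, and the OpenVINO
    # backend compiles them; unsupported Python constructs fall back to eager mode.
    compiled_model = torch.compile(model, backend="openvino")

    with torch.no_grad():
        out = compiled_model(torch.randn(1, 16))
    print(out.shape)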
+ +We recommend running the notebook in a virtual environment. You only +need a Jupyter server to start. For details, please refer to +`Installation +Guide `__. + + +**Table of contents:** + + +- `Prerequisites <#prerequisites>`__ +- `Build PyTorch pipeline <#build-pytorch-pipeline>`__ + + - `Store the Configs <#store-the-configs>`__ + +- `Run FP Inference <#run-fp-inference>`__ +- `Convert models to Torch FX <#convert-models-to-torch-fx>`__ +- `Quantization <#quantization>`__ + + - `Collect Calibration Dataset <#collect-calibration-dataset>`__ + - `Compress and Quantize models <#compress-and-quantize-models>`__ + - `Create Optimized Pipeline <#create-optimized-pipeline>`__ + - `Check File Size <#check-file-size>`__ + - `Optimized pipeline inference <#optimized-pipeline-inference>`__ + - `Visualize Results <#visualize-results>`__ + +- `Interactive demo <#interactive-demo>`__ + +Prerequisites +------------- + + + +.. code:: ipython3 + + %pip install -q "gradio>=4.19" "torch>=2.5" "torchvision>=0.20" "numpy<2.0" "transformers" "datasets>=2.14.6" "opencv-python" "pillow" "peft>=0.7.0" "diffusers>=0.31.0" --extra-index-url https://download.pytorch.org/whl/cpu + %pip install -qU "openvino>=2024.3.0" + %pip install -q "nncf>=2.14.0" "typing_extensions>=4.11" + +.. code:: ipython3 + + from pathlib import Path + + import requests + + if not Path("sd3_torch_fx_helper.py").exists(): + r = requests.get(url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/notebooks/stable-diffusion-v3/sd3_torch_fx_helper.py") + open("sd3_torch_fx_helper.py", "w").write(r.text) + + if not Path("gradio_helper.py").exists(): + r = requests.get(url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/notebooks/stable-diffusion-v3/gradio_helper.py") + open("gradio_helper.py", "w").write(r.text) + + if not Path("notebook_utils.py").exists(): + r = requests.get(url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/notebook_utils.py") + open("notebook_utils.py", "w").write(r.text) + +Build PyTorch pipeline +---------------------- + + + + **Note**: run model with notebook, you will need to accept license + agreement. You must be a registered user in Hugging Face Hub. + Please visit `HuggingFace model + card `__, + carefully read terms of usage and click accept button. You will need + to use an access token for the code below to run. For more + information on access tokens, refer to `this section of the + documentation `__. + You can login on Hugging Face Hub in notebook environment, using + following code: + +.. code:: ipython3 + + # uncomment these lines to login to huggingfacehub to get access to pretrained model + + # from huggingface_hub import notebook_login, whoami + + # try: + # whoami() + # print('Authorization token already provided') + # except OSError: + # notebook_login() + +.. code:: ipython3 + + from sd3_torch_fx_helper import get_sd3_pipeline, init_pipeline + + pipe = get_sd3_pipeline() + pipe.to("cpu") + +Store the Configs +~~~~~~~~~~~~~~~~~ + + + +This will be used later when wrapping the Torch FX models to insert back +into the pipeline + +.. code:: ipython3 + + configs_dict = {} + configs_dict["text_encoder"] = pipe.text_encoder.config + configs_dict["text_encoder_2"] = pipe.text_encoder_2.config + configs_dict["transformer"] = pipe.transformer.config + configs_dict["vae"] = pipe.vae.config + + pipe_config = pipe.config + +Run FP Inference +---------------- + + + +.. 
code:: ipython3 + + import numpy as np + import torch + + generator = torch.Generator(device="cpu").manual_seed(42) + prompt = "A raccoon trapped inside a glass jar full of colorful candies, the background is steamy with vivid colors" + num_inference_steps = 28 + with torch.no_grad(): + image = pipe( + prompt=prompt, + negative_prompt="", + num_inference_steps=num_inference_steps, + generator=generator, + guidance_scale=5, + ).images[0] + image.resize( + ( + 512, + 512, + ) + ) + +.. code:: ipython3 + + from notebook_utils import device_widget + + device = device_widget() + + device + +Convert models to Torch FX +-------------------------- + + + +This step converts the pytorch models in the hf pipeline to Torch FX +representation using the ``capture_pre_autograd()`` function. + +The pipeline consists of four important parts: + +- Clip and T5 Text Encoders to create condition to generate an image + from a text prompt. +- Transformer for step-by-step denoising latent image representation. +- Autoencoder (VAE) for decoding latent space to image. + +.. code:: ipython3 + + import torch + from nncf.torch.dynamic_graph.patch_pytorch import disable_patching + + text_encoder_input = torch.ones((1, 77), dtype=torch.long) + text_encoder_kwargs = {} + text_encoder_kwargs["output_hidden_states"] = True + + vae_encoder_input = torch.ones((1, 3, 128, 128)) + vae_decoder_input = torch.ones((1, 16, 128, 128)) + + unet_kwargs = {} + unet_kwargs["hidden_states"] = torch.ones((2, 16, 128, 128)) + unet_kwargs["timestep"] = torch.from_numpy(np.array([1, 2], dtype=np.float32)) + unet_kwargs["encoder_hidden_states"] = torch.ones((2, 154, 4096)) + unet_kwargs["pooled_projections"] = torch.ones((2, 2048)) + + with torch.no_grad(): + with disable_patching(): + text_encoder = torch.export.export_for_training( + pipe.text_encoder.eval(), + args=(text_encoder_input,), + kwargs=(text_encoder_kwargs), + ).module() + text_encoder_2 = torch.export.export_for_training( + pipe.text_encoder_2.eval(), + args=(text_encoder_input,), + kwargs=(text_encoder_kwargs), + ).module() + pipe.vae.decoder = torch.export.export_for_training(pipe.vae.decoder.eval(), args=(vae_decoder_input,)).module() + pipe.vae.encoder = torch.export.export_for_training(pipe.vae.encoder.eval(), args=(vae_encoder_input,)).module() + vae = pipe.vae + transformer = torch.export.export_for_training(pipe.transformer.eval(), args=(), kwargs=(unet_kwargs)).module() + models_dict = {} + models_dict["transformer"] = transformer + models_dict["vae"] = vae + models_dict["text_encoder"] = text_encoder + models_dict["text_encoder_2"] = text_encoder_2 + del unet_kwargs + del vae_encoder_input + del vae_decoder_input + del text_encoder_input + del text_encoder_kwargs + del pipe + +Quantization +------------ + + + +`NNCF `__ enables +post-training quantization by adding quantization layers into model +graph and then using a subset of the training dataset to initialize the +parameters of these additional quantization layers. Quantized operations +are executed in ``INT8`` instead of ``FP32``/``FP16`` making model +inference faster. + +According to ``StableDiffusion3Pipeline`` structure, the ``transformer`` +model takes up significant portion of the overall pipeline execution +time. Now we will show you how to optimize the transformer part using +`NNCF `__ to reduce +computation cost and speed up the pipeline. Quantizing the rest of the +pipeline does not significantly improve inference performance but can +lead to a substantial degradation of accuracy. 
That’s why we use 8-bit +weight compression for the rest of the pipeline to reduce memory +footprint. + +Please select below whether you would like to run quantization to +improve model inference speed. + + **NOTE**: Quantization is time and memory consuming operation. + Running quantization code below may take some time. + +.. code:: ipython3 + + from notebook_utils import quantization_widget + + to_quantize = quantization_widget() + + to_quantize + +Let’s load ``skip magic`` extension to skip quantization if +``to_quantize`` is not selected + +.. code:: ipython3 + + # Fetch `skip_kernel_extension` module + import requests + + r = requests.get( + url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/skip_kernel_extension.py", + ) + open("skip_kernel_extension.py", "w").write(r.text) + + %load_ext skip_kernel_extension + +Collect Calibration Dataset +~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + + +.. code:: ipython3 + + %%skip not $to_quantize.value + + from typing import Any, Dict, List + + import datasets + from diffusers.models.transformers.transformer_sd3 import SD3Transformer2DModel + from tqdm.notebook import tqdm + + + def disable_progress_bar(pipeline, disable=True): + if not hasattr(pipeline, "_progress_bar_config"): + pipeline._progress_bar_config = {"disable": disable} + else: + pipeline._progress_bar_config["disable"] = disable + + + class UNetWrapper(SD3Transformer2DModel): + def __init__(self, transformer, config): + super().__init__(**config) + self.transformer = transformer + self.captured_args = [] + + def forward(self, *args, **kwargs): + del kwargs["joint_attention_kwargs"] + del kwargs["return_dict"] + self.captured_args.append((*args, *tuple(kwargs.values()))) + return self.transformer(*args, **kwargs) + + + def collect_calibration_data( + pipe, calibration_dataset_size: int, num_inference_steps: int + ) -> List[Dict]: + + original_unet = pipe.transformer + calibration_data = [] + disable_progress_bar(pipe) + + dataset = datasets.load_dataset( + "google-research-datasets/conceptual_captions", + split="train", + trust_remote_code=True, + ).shuffle(seed=42) + + transformer_config = dict(pipe.transformer.config) + del transformer_config["model"] + wrapped_unet = UNetWrapper(pipe.transformer.model, transformer_config) + pipe.transformer = wrapped_unet + # Run inference for data collection + pbar = tqdm(total=calibration_dataset_size) + for i, batch in enumerate(dataset): + prompt = batch["caption"] + if len(prompt) > pipe.tokenizer.model_max_length: + continue + # Run the pipeline + pipe(prompt, num_inference_steps=num_inference_steps) + calibration_data.extend(wrapped_unet.captured_args) + wrapped_unet.captured_args = [] + pbar.update(len(calibration_data) - pbar.n) + if pbar.n >= calibration_dataset_size: + break + + disable_progress_bar(pipe, disable=False) + pipe.transformer = original_unet + return calibration_data + + + if to_quantize: + pipe = init_pipeline(models_dict, configs_dict) + calibration_dataset_size = 300 + unet_calibration_data = collect_calibration_data( + pipe, calibration_dataset_size=calibration_dataset_size, num_inference_steps=28 + ) + del pipe + +Compress and Quantize models +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + + +.. 
code:: ipython3 + + %%skip not $to_quantize.value + + import nncf + from nncf.quantization.advanced_parameters import AdvancedSmoothQuantParameters + from nncf.quantization.range_estimator import RangeEstimatorParametersSet + + text_encoder = models_dict["text_encoder"] + text_encoder_2 = models_dict["text_encoder_2"] + vae_encoder = models_dict["vae"].encoder + vae_decoder = models_dict["vae"].decoder + original_transformer = models_dict["transformer"] + if to_quantize: + with disable_patching(): + with torch.no_grad(): + nncf.compress_weights(text_encoder) + nncf.compress_weights(text_encoder_2) + nncf.compress_weights(vae_encoder) + nncf.compress_weights(vae_decoder) + quantized_transformer = nncf.quantize( + model=original_transformer, + calibration_dataset=nncf.Dataset(unet_calibration_data), + subset_size=len(unet_calibration_data), + model_type=nncf.ModelType.TRANSFORMER, + ignored_scope=nncf.IgnoredScope(names=["conv2d"]), + advanced_parameters=nncf.AdvancedQuantizationParameters( + weights_range_estimator_params=RangeEstimatorParametersSet.MINMAX, + activations_range_estimator_params=RangeEstimatorParametersSet.MINMAX, + ), + ) + + optimized_models_dict = {} + optimized_models_dict["transformer"] = quantized_transformer + optimized_models_dict["vae"] = vae + optimized_models_dict["text_encoder"] = text_encoder + optimized_models_dict["text_encoder_2"] = text_encoder_2 + del models_dict + +.. code:: ipython3 + + %%skip not $to_quantize.value + import openvino.torch + + optimized_models_dict["text_encoder"] = torch.compile( + optimized_models_dict["text_encoder"], backend="openvino" + ) + optimized_models_dict["text_encoder_2"] = torch.compile( + optimized_models_dict["text_encoder_2"], backend="openvino" + ) + optimized_models_dict["vae"].encoder = torch.compile( + optimized_models_dict["vae"].encoder, backend="openvino" + ) + optimized_models_dict["vae"].decoder = torch.compile( + optimized_models_dict["vae"].decoder, backend="openvino" + ) + optimized_models_dict["transformer"] = torch.compile( + optimized_models_dict["transformer"], backend="openvino" + ) + +Create Optimized Pipeline +~~~~~~~~~~~~~~~~~~~~~~~~~ + + + +Initialize the optimized pipeline using the optimized models + +.. code:: ipython3 + + %%skip not $to_quantize.value + + opt_pipe = init_pipeline(optimized_models_dict, configs_dict) + +Check File Size +~~~~~~~~~~~~~~~ + + + +.. code:: ipython3 + + %%skip not $to_quantize.value + + + def get_model_size(models): + total_size = 0 + for model in models: + param_size = 0 + for param in model.parameters(): + param_size += param.nelement() * param.element_size() + buffer_size = 0 + for buffer in model.buffers(): + buffer_size += buffer.nelement() * buffer.element_size() + + model_size_mb = (param_size + buffer_size) / 1024**2 + + total_size += model_size_mb + return total_size + + + optimized_model_size = get_model_size([opt_pipe.transformer]) + original_model_size = get_model_size([original_transformer]) + + print(f"Original Transformer Size: {original_model_size} MB") + print(f"Optimized Transformer Size: {optimized_model_size} MB") + print(f"Compression Rate: {original_model_size / optimized_model_size:.3f}") + +Optimized pipeline inference +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + + +Run inference with single step to compile the model. + +.. 
code:: ipython3 + + %%skip not $to_quantize.value + + # Warmup the model for initial compile + with torch.no_grad(): + image = opt_pipe( + prompt=prompt, negative_prompt="", num_inference_steps=1, generator=generator + ).images[0] + +Visualize Results +~~~~~~~~~~~~~~~~~ + + + +.. code:: ipython3 + + %%skip not $to_quantize.value + + from sd3_torch_fx_helper import visualize_results + + generator = torch.Generator(device="cpu").manual_seed(42) + opt_image = opt_pipe( + prompt, + negative_prompt="", + num_inference_steps=28, + guidance_scale=5, + generator=generator, + ).images[0] + + visualize_results(image, opt_image) + +Interactive demo +---------------- + + + +Please select below whether you would like to use the quantized models +to launch the interactive demo. + +.. code:: ipython3 + + use_quantized_models = quantization_widget() + + use_quantized_models + +.. code:: ipython3 + + from gradio_helper import make_demo + + fx_pipe = init_pipeline(models_dict if not to_quantize.value else optimized_models_dict, configs_dict) + demo = make_demo(fx_pipe, False) + + # if you are launching remotely, specify server_name and server_port + # demo.launch(server_name='your server name', server_port='server port in int') + # if you have any issue to launch on your platform, you can pass share=True to launch method: + # demo.launch(share=True) + # it creates a publicly shareable link for the interface. Read more in the docs: https://gradio.app/docs/ + try: + demo.launch(debug=True) + except Exception: + demo.launch(debug=True, share=True) diff --git a/docs/notebooks/stable-diffusion-xl-with-output.rst b/docs/notebooks/stable-diffusion-xl-with-output.rst index 54a43191c229a4..7ec1c0c81eeb20 100644 --- a/docs/notebooks/stable-diffusion-xl-with-output.rst +++ b/docs/notebooks/stable-diffusion-xl-with-output.rst @@ -100,9 +100,9 @@ Install prerequisites .. code:: ipython3 - # %pip install -q --extra-index-url https://download.pytorch.org/whl/cpu "torch>=2.1" "torchvision" "diffusers>=0.24.0" "invisible-watermark>=0.2.0" "transformers>=4.33.0" "accelerate" "onnx!=1.16.2" "peft>=0.6.2" - # %pip install -q "git+https://github.com/huggingface/optimum-intel.git" - # %pip install -q "openvino>=2023.1.0" "gradio>=4.19" "nncf>=2.9.0" + %pip install -q --extra-index-url https://download.pytorch.org/whl/cpu "torch>=2.1" "torchvision" "diffusers>=0.24.0" "invisible-watermark>=0.2.0" "transformers>=4.33.0" "accelerate" "onnx!=1.16.2" "peft>=0.6.2" + %pip install -q "git+https://github.com/huggingface/optimum-intel.git" + %pip install -q "openvino>=2023.1.0" "gradio>=4.19" "nncf>=2.9.0" SDXL Base model --------------- diff --git a/docs/notebooks/style-transfer-with-output.rst b/docs/notebooks/style-transfer-with-output.rst index b123ca215cbbfc..c228604aee32f9 100644 --- a/docs/notebooks/style-transfer-with-output.rst +++ b/docs/notebooks/style-transfer-with-output.rst @@ -96,7 +96,7 @@ Install requirements .. parsed-literal:: - 24717 + 24624 @@ -186,14 +186,14 @@ OpenVINO Intermediate Representation (IR) with ``FP16`` precision. .. parsed-literal:: - model/mosaic-9.onnx: 0%| | 0.00/6.42M [00:00`__. It uses +`BiT-M-R50x1/1 `__ +model, which is trained on ImageNet-21k. Big Transfer is a recipe for +pre-training image classification models on large supervised datasets +and efficiently fine-tuning them on any given target task. The recipe +achieves excellent performance on a wide variety of tasks, even when +using very few labeled examples from the target dataset. 
This tutorial +uses OpenVINO backend for performing model quantization in NNCF. + + +**Table of contents:** + + +- `Prepare Dataset <#prepare-dataset>`__ +- `Plotting data samples <#plotting-data-samples>`__ +- `Model Fine-tuning <#model-fine-tuning>`__ +- `Perform model optimization (IR) + step <#perform-model-optimization-ir-step>`__ +- `Compute accuracy of the TF + model <#compute-accuracy-of-the-tf-model>`__ +- `Compute accuracy of the OpenVINO + model <#compute-accuracy-of-the-openvino-model>`__ +- `Quantize OpenVINO model using + NNCF <#quantize-openvino-model-using-nncf>`__ +- `Compute accuracy of the quantized + model <#compute-accuracy-of-the-quantized-model>`__ +- `Compare FP32 and INT8 accuracy <#compare-fp32-and-int8-accuracy>`__ +- `Compare inference results on one + picture <#compare-inference-results-on-one-picture>`__ + +Installation Instructions +~~~~~~~~~~~~~~~~~~~~~~~~~ + +This is a self-contained example that relies solely on its own code. + +We recommend running the notebook in a virtual environment. You only +need a Jupyter server to start. For details, please refer to +`Installation +Guide `__. + +.. code:: ipython3 + + import platform + + %pip install -q "tensorflow-macos>=2.5; sys_platform == 'darwin' and platform_machine == 'arm64' and python_version > '3.8'" # macOS M1 and M2 + %pip install -q "tensorflow>=2.5; sys_platform == 'darwin' and platform_machine != 'arm64' and python_version > '3.8'" # macOS x86 + %pip install -q "tensorflow>=2.5; sys_platform != 'darwin' and python_version > '3.8'" + + %pip install -q "openvino>=2024.0.0" "nncf>=2.7.0" "tensorflow-hub>=0.15.0" tf_keras + %pip install -q "scikit-learn>=1.3.2" + + if platform.system() != "Windows": + %pip install -q "matplotlib>=3.4" "tensorflow_datasets>=4.9.0" + else: + %pip install -q "matplotlib>=3.4" "tensorflow_datasets>=4.9.0,<4.9.3" + +.. code:: ipython3 + + import os + import numpy as np + from pathlib import Path + + from openvino.runtime import Core + import openvino as ov + import nncf + import logging + + from nncf.common.logging.logger import set_log_level + + set_log_level(logging.ERROR) + + from sklearn.metrics import accuracy_score + + os.environ["TF_USE_LEGACY_KERAS"] = "1" + os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2" + os.environ["TFHUB_CACHE_DIR"] = str(Path("./tfhub_modules").resolve()) + + import tensorflow as tf + import tensorflow_datasets as tfds + import tensorflow_hub as hub + + tfds.core.utils.gcs_utils._is_gcs_disabled = True + os.environ["NO_GCE_CHECK"] = "true" + + import requests + + r = requests.get( + url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/notebook_utils.py", + ) + open("notebook_utils.py", "w").write(r.text) + +.. code:: ipython3 + + core = Core() + tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR) + + + # For top 5 labels. + MAX_PREDS = 1 + TRAINING_BATCH_SIZE = 128 + BATCH_SIZE = 1 + IMG_SIZE = (256, 256) # Default Imagenet image size + NUM_CLASSES = 10 # For Imagenette dataset + FINE_TUNING_STEPS = 1 + LR = 1e-5 + + MEAN_RGB = (0.485 * 255, 0.456 * 255, 0.406 * 255) # From Imagenet dataset + STDDEV_RGB = (0.229 * 255, 0.224 * 255, 0.225 * 255) # From Imagenet dataset + +Prepare Dataset +~~~~~~~~~~~~~~~ + + + +.. code:: ipython3 + + datasets, datasets_info = tfds.load( + "imagenette/160px", + shuffle_files=True, + as_supervised=True, + with_info=True, + read_config=tfds.ReadConfig(shuffle_seed=0), + ) + train_ds, validation_ds = datasets["train"], datasets["validation"] + +.. 
code:: ipython3 + + def preprocessing(image, label): + image = tf.image.resize(image, IMG_SIZE) + image = tf.cast(image, tf.float32) / 255.0 + label = tf.one_hot(label, NUM_CLASSES) + return image, label + + + train_dataset = train_ds.map(preprocessing, num_parallel_calls=tf.data.experimental.AUTOTUNE).batch(TRAINING_BATCH_SIZE).prefetch(tf.data.experimental.AUTOTUNE) + validation_dataset = ( + validation_ds.map(preprocessing, num_parallel_calls=tf.data.experimental.AUTOTUNE).batch(TRAINING_BATCH_SIZE).prefetch(tf.data.experimental.AUTOTUNE) + ) + +.. code:: ipython3 + + # Class labels dictionary with imagenette sample names and classes + lbl_dict = dict( + n01440764="tench", + n02102040="English springer", + n02979186="cassette player", + n03000684="chain saw", + n03028079="church", + n03394916="French horn", + n03417042="garbage truck", + n03425413="gas pump", + n03445777="golf ball", + n03888257="parachute", + ) + + # Imagenette samples name index + class_idx_dict = [ + "n01440764", + "n02102040", + "n02979186", + "n03000684", + "n03028079", + "n03394916", + "n03417042", + "n03425413", + "n03445777", + "n03888257", + ] + + + def label_func(key): + return lbl_dict[key] + +Plotting data samples +~~~~~~~~~~~~~~~~~~~~~ + + + +.. code:: ipython3 + + import matplotlib.pyplot as plt + + # Get the class labels from the dataset info + class_labels = datasets_info.features["label"].names + + # Display labels along with the examples + num_examples_to_display = 4 + fig, axes = plt.subplots(nrows=1, ncols=num_examples_to_display, figsize=(10, 5)) + + for i, (image, label_index) in enumerate(train_ds.take(num_examples_to_display)): + label_name = class_labels[label_index.numpy()] + + axes[i].imshow(image.numpy()) + axes[i].set_title(f"{label_func(label_name)}") + axes[i].axis("off") + plt.tight_layout() + plt.show() + + +.. parsed-literal:: + + 2024-01-26 10:40:54.747316: W tensorflow/core/kernels/data/cache_dataset_ops.cc:854] The calling iterator did not fully read the dataset being cached. In order to avoid unexpected truncation of the dataset, the partially cached contents of the dataset will be discarded. This can happen if you have an input pipeline similar to `dataset.cache().take(k).repeat()`. You should use `dataset.take(k).cache().repeat()` instead. + + + +.. image:: tensorflow-bit-image-classification-nncf-quantization-with-output_files/tensorflow-bit-image-classification-nncf-quantization-with-output_9_1.png + + +.. code:: ipython3 + + # Get the class labels from the dataset info + class_labels = datasets_info.features["label"].names + + # Display labels along with the examples + num_examples_to_display = 4 + fig, axes = plt.subplots(nrows=1, ncols=num_examples_to_display, figsize=(10, 5)) + + for i, (image, label_index) in enumerate(validation_ds.take(num_examples_to_display)): + label_name = class_labels[label_index.numpy()] + + axes[i].imshow(image.numpy()) + axes[i].set_title(f"{label_func(label_name)}") + axes[i].axis("off") + plt.tight_layout() + plt.show() + + +.. parsed-literal:: + + 2024-01-26 10:40:57.011386: W tensorflow/core/kernels/data/cache_dataset_ops.cc:854] The calling iterator did not fully read the dataset being cached. In order to avoid unexpected truncation of the dataset, the partially cached contents of the dataset will be discarded. This can happen if you have an input pipeline similar to `dataset.cache().take(k).repeat()`. You should use `dataset.take(k).cache().repeat()` instead. + + + +.. 
image:: tensorflow-bit-image-classification-nncf-quantization-with-output_files/tensorflow-bit-image-classification-nncf-quantization-with-output_10_1.png + + +Model Fine-tuning +~~~~~~~~~~~~~~~~~ + + + +.. code:: ipython3 + + # Load the Big Transfer model + bit_model_url = "https://www.kaggle.com/models/google/bit/frameworks/TensorFlow2/variations/m-r50x1/versions/1" + bit_m = hub.KerasLayer(bit_model_url, trainable=True) + + tf_model_dir = Path("bit_tf_model") + + # Customize the model for the new task + model = tf.keras.Sequential([bit_m, tf.keras.layers.Dense(NUM_CLASSES, activation="softmax")]) + + # Compile the model + model.compile( + optimizer=tf.keras.optimizers.Adam(learning_rate=LR), + loss="categorical_crossentropy", + metrics=["accuracy"], + ) + + # Fine-tune the model + model.fit( + train_dataset.take(3000), + epochs=FINE_TUNING_STEPS, + validation_data=validation_dataset.take(1000), + ) + model.save(tf_model_dir, save_format="tf") + + +.. parsed-literal:: + + 101/101 [==============================] - 472s 4s/step - loss: 0.4904 - accuracy: 0.8806 - val_loss: 0.0810 - val_accuracy: 0.9840 + + +Perform model optimization (IR) step +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + + +.. code:: ipython3 + + ir_path = Path("bit_ov_model/bit_m_r50x1_1.xml") + if not ir_path.exists(): + print("Initiating model optimization..!!!") + ov_model = ov.convert_model("./bit_tf_model") + ov.save_model(ov_model, ir_path) + else: + print(f"IR model {ir_path} already exists.") + + +.. parsed-literal:: + + Initiating model optimization..!!! + + +Compute accuracy of the TF model +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + + +.. code:: ipython3 + + tf_model = tf.keras.models.load_model(tf_model_dir) + + tf_predictions = [] + gt_label = [] + + for _, label in validation_dataset: + for cls_label in label: + l_list = cls_label.numpy().tolist() + gt_label.append(l_list.index(1)) + + for img_batch, label_batch in validation_dataset: + tf_result_batch = tf_model.predict(img_batch, verbose=0) + for i in range(len(img_batch)): + tf_result = tf_result_batch[i] + tf_result = tf.reshape(tf_result, [-1]) + top5_label_idx = np.argsort(tf_result)[-MAX_PREDS::][::-1] + tf_predictions.append(top5_label_idx) + + # Convert the lists to NumPy arrays for accuracy calculation + tf_predictions = np.array(tf_predictions) + gt_label = np.array(gt_label) + + tf_acc_score = accuracy_score(tf_predictions, gt_label) + + +.. parsed-literal:: + + 2024-01-26 10:51:24.539777: W tensorflow/core/common_runtime/graph_constructor.cc:839] Node 're_lu_48/PartitionedCall' has 1 outputs but the _output_shapes attribute specifies shapes for 2 outputs. Output shapes may be inaccurate. + 2024-01-26 10:51:24.539856: W tensorflow/core/common_runtime/graph_constructor.cc:839] Node 'global_average_pooling2d/PartitionedCall' has 1 outputs but the _output_shapes attribute specifies shapes for 3 outputs. Output shapes may be inaccurate. + + +Compute accuracy of the OpenVINO model +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + + +Select device for inference: + +.. code:: ipython3 + + from notebook_utils import device_widget + + device = device_widget() + + device + +.. 
code:: ipython3 + + core = ov.Core() + + ov_fp32_model = core.read_model(ir_path) + ov_fp32_model.reshape([1, IMG_SIZE[0], IMG_SIZE[1], 3]) + + # Target device set to CPU (Other options Ex: AUTO/GPU/dGPU/) + compiled_model = ov.compile_model(ov_fp32_model, device.value) + output = compiled_model.outputs[0] + + ov_predictions = [] + for img_batch, _ in validation_dataset: + for image in img_batch: + image = tf.expand_dims(image, axis=0) + pred = compiled_model(image)[output] + ov_result = tf.reshape(pred, [-1]) + top_label_idx = np.argsort(ov_result)[-MAX_PREDS::][::-1] + ov_predictions.append(top_label_idx) + + fp32_acc_score = accuracy_score(ov_predictions, gt_label) + +Quantize OpenVINO model using NNCF +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + + +Model Quantization using NNCF + +1. Preprocessing and preparing validation samples for NNCF calibration +2. Perform NNCF Quantization on OpenVINO FP32 model +3. Serialize Quantized OpenVINO INT8 model + +.. code:: ipython3 + + def nncf_preprocessing(image, label): + image = tf.image.resize(image, IMG_SIZE) + image = image - MEAN_RGB + image = image / STDDEV_RGB + return image + + + int8_ir_path = Path("bit_ov_int8_model/bit_m_r50x1_1_ov_int8.xml") + val_ds = validation_ds.map(nncf_preprocessing, num_parallel_calls=tf.data.experimental.AUTOTUNE).batch(1).prefetch(tf.data.experimental.AUTOTUNE) + + calibration_dataset = nncf.Dataset(val_ds) + + ov_fp32_model = core.read_model(ir_path) + + ov_int8_model = nncf.quantize(ov_fp32_model, calibration_dataset, fast_bias_correction=False) + + ov.save_model(ov_int8_model, int8_ir_path) + + + +.. parsed-literal:: + + Output() + + + + + + + + + + + + + + + + + + +.. parsed-literal:: + + Output() + + + + + + + + + + + + + + + + + +Compute accuracy of the quantized model +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + + +.. code:: ipython3 + + nncf_quantized_model = core.read_model(int8_ir_path) + nncf_quantized_model.reshape([1, IMG_SIZE[0], IMG_SIZE[1], 3]) + + # Target device set to CPU by default + compiled_model = ov.compile_model(nncf_quantized_model, device.value) + output = compiled_model.outputs[0] + + ov_predictions = [] + inp_tensor = nncf_quantized_model.inputs[0] + out_tensor = nncf_quantized_model.outputs[0] + + for img_batch, _ in validation_dataset: + for image in img_batch: + image = tf.expand_dims(image, axis=0) + pred = compiled_model(image)[output] + ov_result = tf.reshape(pred, [-1]) + top_label_idx = np.argsort(ov_result)[-MAX_PREDS::][::-1] + ov_predictions.append(top_label_idx) + + int8_acc_score = accuracy_score(ov_predictions, gt_label) + +Compare FP32 and INT8 accuracy +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + + +.. code:: ipython3 + + print(f"Accuracy of the tensorflow model (fp32): {tf_acc_score * 100: .2f}%") + print(f"Accuracy of the OpenVINO optimized model (fp32): {fp32_acc_score * 100: .2f}%") + print(f"Accuracy of the OpenVINO quantized model (int8): {int8_acc_score * 100: .2f}%") + accuracy_drop = fp32_acc_score - int8_acc_score + print(f"Accuracy drop between OV FP32 and INT8 model: {accuracy_drop * 100:.1f}% ") + + +.. parsed-literal:: + + Accuracy of the tensorflow model (fp32): 98.40% + Accuracy of the OpenVINO optimized model (fp32): 98.40% + Accuracy of the OpenVINO quantized model (int8): 98.00% + Accuracy drop between OV FP32 and INT8 model: 0.4% + + +Compare inference results on one picture +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + + +.. 
code:: ipython3 + + # Accessing validation sample + sample_idx = 50 + vds = datasets["validation"] + + if len(vds) > sample_idx: + sample = vds.take(sample_idx + 1).skip(sample_idx).as_numpy_iterator().next() + else: + print("Dataset does not have enough samples...!!!") + + # Image data + sample_data = sample[0] + + # Label info + sample_label = sample[1] + + # Image data pre-processing + image = tf.image.resize(sample_data, IMG_SIZE) + image = tf.expand_dims(image, axis=0) + image = tf.cast(image, tf.float32) / 255.0 + + + # OpenVINO inference + def ov_inference(model: ov.Model, image) -> str: + compiled_model = ov.compile_model(model, device.value) + output = compiled_model.outputs[0] + pred = compiled_model(image)[output] + ov_result = tf.reshape(pred, [-1]) + pred_label = np.argsort(ov_result)[-MAX_PREDS::][::-1] + return pred_label + + + # OpenVINO FP32 model + ov_fp32_model = core.read_model(ir_path) + ov_fp32_model.reshape([1, IMG_SIZE[0], IMG_SIZE[1], 3]) + + # OpenVINO INT8 model + ov_int8_model = core.read_model(int8_ir_path) + ov_int8_model.reshape([1, IMG_SIZE[0], IMG_SIZE[1], 3]) + + # OpenVINO FP32 model inference + ov_fp32_pred_label = ov_inference(ov_fp32_model, image) + + print(f"Predicted label for the sample picture by float (fp32) model: {label_func(class_idx_dict[int(ov_fp32_pred_label)])}\n") + + # OpenVINO INT8 model inference + ov_int8_pred_label = ov_inference(ov_int8_model, image) + print(f"Predicted label for the sample picture by quantized (int8) model: {label_func(class_idx_dict[int(ov_int8_pred_label)])}\n") + + # Plotting the image sample with ground truth + plt.figure() + plt.imshow(sample_data) + plt.title(f"Ground truth: {label_func(class_idx_dict[sample_label])}") + plt.axis("off") + plt.show() + + +.. parsed-literal:: + + Predicted label for the sample picture by float (fp32) model: gas pump + + Predicted label for the sample picture by quantized (int8) model: gas pump + + + + +.. 
image:: tensorflow-bit-image-classification-nncf-quantization-with-output_files/tensorflow-bit-image-classification-nncf-quantization-with-output_27_1.png + diff --git a/docs/notebooks/tensorflow-bit-image-classification-nncf-quantization-with-output_files/tensorflow-bit-image-classification-nncf-quantization-with-output_10_1.png b/docs/notebooks/tensorflow-bit-image-classification-nncf-quantization-with-output_files/tensorflow-bit-image-classification-nncf-quantization-with-output_10_1.png new file mode 100644 index 00000000000000..71aa7443a92cd8 --- /dev/null +++ b/docs/notebooks/tensorflow-bit-image-classification-nncf-quantization-with-output_files/tensorflow-bit-image-classification-nncf-quantization-with-output_10_1.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c7b53b19fd375df2b53791482fa4f76ec9d376be865f1298f4ea5aa0acdb1f35 +size 224517 diff --git a/docs/notebooks/tensorflow-bit-image-classification-nncf-quantization-with-output_files/tensorflow-bit-image-classification-nncf-quantization-with-output_27_1.png b/docs/notebooks/tensorflow-bit-image-classification-nncf-quantization-with-output_files/tensorflow-bit-image-classification-nncf-quantization-with-output_27_1.png new file mode 100644 index 00000000000000..38f050c05e472a --- /dev/null +++ b/docs/notebooks/tensorflow-bit-image-classification-nncf-quantization-with-output_files/tensorflow-bit-image-classification-nncf-quantization-with-output_27_1.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:048e8ff7b7ac7fa5f9cb66251d618f1ae941f26255f62c725d6223abd63e6fb7 +size 335047 diff --git a/docs/notebooks/tensorflow-bit-image-classification-nncf-quantization-with-output_files/tensorflow-bit-image-classification-nncf-quantization-with-output_9_1.png b/docs/notebooks/tensorflow-bit-image-classification-nncf-quantization-with-output_files/tensorflow-bit-image-classification-nncf-quantization-with-output_9_1.png new file mode 100644 index 00000000000000..a8d02fcbd58c16 --- /dev/null +++ b/docs/notebooks/tensorflow-bit-image-classification-nncf-quantization-with-output_files/tensorflow-bit-image-classification-nncf-quantization-with-output_9_1.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3bf1b651f79891da47103dcc27259f890451c392325a712ff4c1b1cace7cb4be +size 296205 diff --git a/docs/notebooks/tensorflow-classification-to-openvino-with-output.rst b/docs/notebooks/tensorflow-classification-to-openvino-with-output.rst index 2e4f5ffe25369c..9ab3ae90d2fd3e 100644 --- a/docs/notebooks/tensorflow-classification-to-openvino-with-output.rst +++ b/docs/notebooks/tensorflow-classification-to-openvino-with-output.rst @@ -249,7 +249,7 @@ network. .. parsed-literal:: - data/coco.jpg: 0%| | 0.00/202k [00:00 + @@ -649,7 +649,7 @@ Zoo `__: .. parsed-literal:: - data/coco_91cl.txt: 0%| | 0.00/421 [00:00 + @@ -683,16 +678,10 @@ Zoo `__: -.. parsed-literal:: - - data/coco_91cl.txt: 0%| | 0.00/421 [00:00`__. -2. Run inference using the `Text to Image -pipeline `__ +2. Run inference using the `Text-to-Image Generation +pipeline `__ from OpenVINO GenAI. 
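At a glance, those two steps reduce to a few lines of code. The sketch below is only a condensed preview of the cells that follow, assuming the model has already been exported with Optimum-CLI to the ``dreamlike_anime_1_0_ov`` folder used later in this notebook:

.. code:: ipython3

    import openvino_genai as ov_genai
    from PIL import Image

    # Text-to-Image Generation pipeline from OpenVINO GenAI, created from the converted OpenVINO IR folder
    pipe = ov_genai.Text2ImagePipeline("dreamlike_anime_1_0_ov", "CPU")

    # generate() returns an ov.Tensor; data[0] is the first generated image as an HWC uint8 array
    image_tensor = pipe.generate("anime, masterpiece, high quality, a green snowman", width=512, height=512, num_inference_steps=20)
    Image.fromarray(image_tensor.data[0]).save("result.png")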
+ **Table of contents:** + - `Prerequisites <#prerequisites>`__ - `Convert model using Optimum-CLI tool <#convert-model-using-optimum-cli-tool>`__ @@ -57,19 +59,19 @@ Prerequisites import platform import requests - - + + %pip install -q "git+https://github.com/huggingface/optimum-intel.git" %pip install -q -U "openvino>=2024.5" "openvino-tokenizers>=2024.5" "openvino-genai>=2024.5" %pip install -q Pillow "diffusers>=0.30.3" "gradio>=4.19" "typing_extensions>=4.9" if platform.system() == "Darwin": %pip install -q "numpy<2.0.0" - + r = requests.get( url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/notebook_utils.py", ) open("notebook_utils.py", "w").write(r.text) - + r = requests.get( url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/cmd_helper.py", ) @@ -81,7 +83,7 @@ Convert model using Optimum-CLI tool `Optimum Intel `__ -is the interface between the +is the interface between the `Transformers `__ and `Diffusers `__ libraries and OpenVINO to accelerate end-to-end pipelines on Intel architectures. @@ -116,12 +118,12 @@ wrapper over cli-command. .. code:: ipython3 from pathlib import Path - + from cmd_helper import optimum_cli - - + + model_dir = Path("dreamlike_anime_1_0_ov") - + if not model_dir.exists(): optimum_cli("dreamlike-art/dreamlike-anime-1.0", model_dir) @@ -135,8 +137,8 @@ select device from dropdown list for running inference using OpenVINO .. code:: ipython3 from notebook_utils import device_widget - - + + device = device_widget("CPU", exclude=["NPU"]) device @@ -161,27 +163,27 @@ That’s it:) import openvino as ov from PIL import Image import torch - - + + class Generator(ov_genai.Generator): def __init__(self, seed): ov_genai.Generator.__init__(self) self.generator = torch.Generator(device="cpu").manual_seed(seed) - + def next(self): return torch.randn(1, generator=self.generator, dtype=torch.float32).item() - + def randn_tensor(self, shape: ov.Shape): torch_tensor = torch.randn(list(shape), generator=self.generator, dtype=torch.float32) return ov.Tensor(torch_tensor.numpy()) - - + + random_generator = Generator(42) # openvino_genai.CppStdGenerator can be used to have same images as C++ sample pipe = ov_genai.Text2ImagePipeline(model_dir, device.value) prompt = "anime, masterpiece, high quality, a green snowman with a happy smiling face in the snows" - + image_tensor = pipe.generate(prompt, width=512, height=512, num_inference_steps=20, num_images_per_prompt=1, generator=random_generator) - + image = Image.fromarray(image_tensor.data[0]) .. code:: ipython3 @@ -228,20 +230,20 @@ from command line: def prepare_adapter_config(adapters): adapter_config = ov_genai.AdapterConfig() - + # Multiple LoRA adapters applied simultaneously are supported, parse them all and corresponding alphas from cmd parameters: for i in range(int(len(adapters) / 2)): adapter = ov_genai.Adapter(adapters[2 * i]) alpha = float(adapters[2 * i + 1]) adapter_config.add(adapter, alpha) - + return adapter_config - - + + adapter_config = prepare_adapter_config(["soulcard.safetensors", 0.5]) - + pipe = ov_genai.Text2ImagePipeline(model_dir, device.value, adapters=adapter_config) - + image_tensor = pipe.generate(prompt, generator=Generator(42), width=512, height=512, num_inference_steps=20) image = Image.fromarray(image_tensor.data[0]) @@ -268,10 +270,10 @@ Interactive demo .. 
code:: ipython3 from gradio_helper import make_demo - - + + demo = make_demo(pipe, Generator, adapter_config) - + try: demo.launch(debug=True) except Exception: diff --git a/docs/notebooks/tflite-selfie-segmentation-with-output.rst b/docs/notebooks/tflite-selfie-segmentation-with-output.rst index 7f613016c47019..8691da62b77526 100644 --- a/docs/notebooks/tflite-selfie-segmentation-with-output.rst +++ b/docs/notebooks/tflite-selfie-segmentation-with-output.rst @@ -117,8 +117,7 @@ Download pretrained model and test image tflite_model_path = Path("selfie_multiclass_256x256.tflite") tflite_model_url = "https://storage.googleapis.com/mediapipe-models/image_segmenter/selfie_multiclass_256x256/float32/latest/selfie_multiclass_256x256.tflite" - if not tflite_model_path.exists(): - download_file(tflite_model_url, tflite_model_path) + download_file(tflite_model_url, tflite_model_path) @@ -127,6 +126,14 @@ Download pretrained model and test image selfie_multiclass_256x256.tflite: 0%| | 0.00/15.6M [00:00`__. .. code:: ipython3 - image = load_image("https://storage.openvinotoolkit.org/repositories/openvino_notebooks/data/data/image/coco_bricks.png") + image = load_image("coco_bricks.png", "https://storage.openvinotoolkit.org/repositories/openvino_notebooks/data/data/image/coco_bricks.png") # load_image reads the image in BGR format, [:,:,::-1] reshape transfroms it to RGB image = Image.fromarray(image[:, :, ::-1]) resized_image = image.resize((224, 224)) @@ -274,7 +274,7 @@ GPU. [ WARNING ] Performance hint was not explicitly specified in command line. Device(AUTO) performance hint will be set to PerformanceMode.THROUGHPUT. [Step 4/11] Reading model files [ INFO ] Loading model files - [ INFO ] Read model took 9.66 ms + [ INFO ] Read model took 9.35 ms [ INFO ] Original model I/O parameters: [ INFO ] Model inputs: [ INFO ] images (node: images) : f32 / [...] / [1,224,224,3] @@ -288,7 +288,7 @@ GPU. [ INFO ] Model outputs: [ INFO ] Softmax (node: 61) : f32 / [...] / [1,1000] [Step 7/11] Loading the model to the device - [ INFO ] Compile model took 147.18 ms + [ INFO ] Compile model took 166.78 ms [Step 8/11] Querying optimal runtime parameters [ INFO ] Model: [ INFO ] NETWORK_NAME: TensorFlow_Lite_Frontend_IR @@ -325,15 +325,15 @@ GPU. [ INFO ] Fill input 'images' with random values [Step 10/11] Measuring performance (Start inference asynchronously, 6 inference requests, limits: 15000 ms duration) [ INFO ] Benchmarking in inference only mode (inputs filling are not included in measurement loop). - [ INFO ] First inference took 7.26 ms + [ INFO ] First inference took 7.31 ms [Step 11/11] Dumping statistics report [ INFO ] Execution Devices:['CPU'] - [ INFO ] Count: 16578 iterations - [ INFO ] Duration: 15004.17 ms + [ INFO ] Count: 17460 iterations + [ INFO ] Duration: 15005.01 ms [ INFO ] Latency: - [ INFO ] Median: 5.30 ms - [ INFO ] Average: 5.29 ms - [ INFO ] Min: 2.92 ms - [ INFO ] Max: 17.62 ms - [ INFO ] Throughput: 1104.89 FPS + [ INFO ] Median: 4.99 ms + [ INFO ] Average: 5.02 ms + [ INFO ] Min: 2.99 ms + [ INFO ] Max: 17.05 ms + [ INFO ] Throughput: 1163.61 FPS diff --git a/docs/notebooks/tiny-sd-image-generation-with-output.rst b/docs/notebooks/tiny-sd-image-generation-with-output.rst index 2c4126b8aefc4c..090b8ff5f63378 100644 --- a/docs/notebooks/tiny-sd-image-generation-with-output.rst +++ b/docs/notebooks/tiny-sd-image-generation-with-output.rst @@ -96,9 +96,9 @@ First, load the pre-trained weights of all components of the model. 
import gc from diffusers import StableDiffusionPipeline - + model_id = "segmind/tiny-sd" - + pipe = StableDiffusionPipeline.from_pretrained(model_id).to("cpu") text_encoder = pipe.text_encoder text_encoder.eval() @@ -106,7 +106,7 @@ First, load the pre-trained weights of all components of the model. unet.eval() vae = pipe.vae vae.eval() - + del pipe gc.collect() @@ -164,10 +164,10 @@ hidden states. from pathlib import Path import torch import openvino as ov - + TEXT_ENCODER_OV_PATH = Path("text_encoder.xml") - - + + def convert_encoder(text_encoder: torch.nn.Module, ir_path: Path): """ Convert Text Encoder mode. @@ -181,7 +181,7 @@ hidden states. input_ids = torch.ones((1, 77), dtype=torch.long) # switch model to inference mode text_encoder.eval() - + # disable gradients calculation for reducing memory consumption with torch.no_grad(): # Export model to IR format @@ -195,13 +195,13 @@ hidden states. ov.save_model(ov_model, ir_path) del ov_model print(f"Text Encoder successfully converted to IR and saved to {ir_path}") - - + + if not TEXT_ENCODER_OV_PATH.exists(): convert_encoder(text_encoder, TEXT_ENCODER_OV_PATH) else: print(f"Text encoder will be loaded from {TEXT_ENCODER_OV_PATH}") - + del text_encoder gc.collect(); @@ -223,12 +223,12 @@ Model predicts the ``sample`` state for the next step. import numpy as np from openvino import PartialShape, Type - + UNET_OV_PATH = Path("unet.xml") - + dtype_mapping = {torch.float32: Type.f32, torch.float64: Type.f64} - - + + def convert_unet(unet: torch.nn.Module, ir_path: Path): """ Convert U-net model to IR format. @@ -250,15 +250,15 @@ Model predicts the ``sample`` state for the next step. shape = PartialShape(tuple(input_tensor.shape)) element_type = dtype_mapping[input_tensor.dtype] input_info.append((shape, element_type)) - + unet.eval() with torch.no_grad(): ov_model = ov.convert_model(unet, example_input=dummy_inputs, input=input_info) ov.save_model(ov_model, ir_path) del ov_model print(f"Unet successfully converted to IR and saved to {ir_path}") - - + + if not UNET_OV_PATH.exists(): convert_unet(unet, UNET_OV_PATH) gc.collect() @@ -292,8 +292,8 @@ of the pipeline, it will be better to convert them to separate models. .. code:: ipython3 VAE_ENCODER_OV_PATH = Path("vae_encodr.xml") - - + + def convert_vae_encoder(vae: torch.nn.Module, ir_path: Path): """ Convert VAE model for encoding to IR format. @@ -305,15 +305,15 @@ of the pipeline, it will be better to convert them to separate models. Returns: None """ - + class VAEEncoderWrapper(torch.nn.Module): def __init__(self, vae): super().__init__() self.vae = vae - + def forward(self, image): return self.vae.encode(x=image)["latent_dist"].sample() - + vae_encoder = VAEEncoderWrapper(vae) vae_encoder.eval() image = torch.zeros((1, 3, 512, 512)) @@ -322,16 +322,16 @@ of the pipeline, it will be better to convert them to separate models. ov.save_model(ov_model, ir_path) del ov_model print(f"VAE encoder successfully converted to IR and saved to {ir_path}") - - + + if not VAE_ENCODER_OV_PATH.exists(): convert_vae_encoder(vae, VAE_ENCODER_OV_PATH) else: print(f"VAE encoder will be loaded from {VAE_ENCODER_OV_PATH}") - + VAE_DECODER_OV_PATH = Path("vae_decoder.xml") - - + + def convert_vae_decoder(vae: torch.nn.Module, ir_path: Path): """ Convert VAE model for decoding to IR format. @@ -343,31 +343,31 @@ of the pipeline, it will be better to convert them to separate models. 
Returns: None """ - + class VAEDecoderWrapper(torch.nn.Module): def __init__(self, vae): super().__init__() self.vae = vae - + def forward(self, latents): return self.vae.decode(latents) - + vae_decoder = VAEDecoderWrapper(vae) latents = torch.zeros((1, 4, 64, 64)) - + vae_decoder.eval() with torch.no_grad(): ov_model = ov.convert_model(vae_decoder, example_input=latents, input=[((1, 4, 64, 64),)]) ov.save_model(ov_model, ir_path) del ov_model print(f"VAE decoder successfully converted to IR and saved to {ir_path}") - - + + if not VAE_DECODER_OV_PATH.exists(): convert_vae_decoder(vae, VAE_DECODER_OV_PATH) else: print(f"VAE decoder will be loaded from {VAE_DECODER_OV_PATH}") - + del vae gc.collect(); @@ -426,20 +426,20 @@ of the variational auto encoder. import inspect from typing import List, Optional, Union, Dict - + import PIL import cv2 - + from transformers import CLIPTokenizer from diffusers.pipelines.pipeline_utils import DiffusionPipeline from diffusers.schedulers import DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler - - + + def scale_fit_to_window(dst_width: int, dst_height: int, image_width: int, image_height: int): """ Preprocessing helper function for calculating image size for resize with peserving original aspect ratio and fitting image to specific window size - + Parameters: dst_width (int): destination window width dst_height (int): destination window height @@ -451,15 +451,15 @@ of the variational auto encoder. """ im_scale = min(dst_height / image_height, dst_width / image_width) return int(im_scale * image_width), int(im_scale * image_height) - - + + def preprocess(image: PIL.Image.Image): """ Image preprocessing function. Takes image in PIL.Image format, resizes it to keep aspect ration and fits to model input window 512x512, then converts it to np.ndarray and adds padding with zeros on right or bottom side of image (depends from aspect ratio), after that converts data to float32 data type and change range of values from [0, 255] to [-1, 1], finally, converts data layout from planar NHWC to NCHW. The function returns preprocessed input tensor and padding size, which can be used in postprocessing. - + Parameters: image (PIL.Image.Image): input image Returns: @@ -477,8 +477,8 @@ of the variational auto encoder. image = 2.0 * image - 1.0 image = image.transpose(0, 3, 1, 2) return image, {"padding": pad, "src_width": src_width, "src_height": src_height} - - + + class OVStableDiffusionPipeline(DiffusionPipeline): def __init__( self, @@ -518,7 +518,7 @@ of the variational auto encoder. self.height = 512 self.width = 512 self.tokenizer = tokenizer - + def __call__( self, prompt: Union[str, List[str]], @@ -567,7 +567,7 @@ of the variational auto encoder. """ if seed is not None: np.random.seed(seed) - + img_buffer = [] do_classifier_free_guidance = guidance_scale > 1.0 # get prompt text embeddings @@ -576,20 +576,20 @@ of the variational auto encoder. 
do_classifier_free_guidance=do_classifier_free_guidance, negative_prompt=negative_prompt, ) - + # set timesteps accepts_offset = "offset" in set(inspect.signature(self.scheduler.set_timesteps).parameters.keys()) extra_set_kwargs = {} if accepts_offset: extra_set_kwargs["offset"] = 1 - + self.scheduler.set_timesteps(num_inference_steps, **extra_set_kwargs) timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength) latent_timestep = timesteps[:1] - + # get the initial random noise unless the user supplied it latents, meta = self.prepare_latents(image, latent_timestep) - + # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 @@ -598,19 +598,19 @@ of the variational auto encoder. extra_step_kwargs = {} if accepts_eta: extra_step_kwargs["eta"] = eta - + for i, t in enumerate(self.progress_bar(timesteps)): # expand the latents if you are doing classifier free guidance latent_model_input = np.concatenate([latents] * 2) if do_classifier_free_guidance else latents latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) - + # predict the noise residual noise_pred = self.unet([latent_model_input, t, text_embeddings])[self._unet_output] # perform guidance if do_classifier_free_guidance: noise_pred_uncond, noise_pred_text = noise_pred[0], noise_pred[1] noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) - + # compute the previous noisy sample x_t -> x_t-1 latents = self.scheduler.step( torch.from_numpy(noise_pred), @@ -622,13 +622,13 @@ of the variational auto encoder. image = self.vae_decoder(latents * (1 / 0.18215))[self._vae_d_output] image = self.postprocess_image(image, meta, output_type) img_buffer.extend(image) - + # scale and decode the image latents with vae image = self.vae_decoder(latents * (1 / 0.18215))[self._vae_d_output] - + image = self.postprocess_image(image, meta, output_type) return {"sample": image, "iterations": img_buffer} - + def _encode_prompt( self, prompt: Union[str, List[str]], @@ -638,7 +638,7 @@ of the variational auto encoder. ): """ Encodes the prompt into text encoder hidden states. - + Parameters: prompt (str or list(str)): prompt to be encoded num_images_per_prompt (int): number of images that should be generated per prompt @@ -648,7 +648,7 @@ of the variational auto encoder. text_embeddings (np.ndarray): text encoder hidden states """ batch_size = len(prompt) if isinstance(prompt, list) else 1 - + # tokenize input prompts text_inputs = self.tokenizer( prompt, @@ -658,15 +658,15 @@ of the variational auto encoder. return_tensors="np", ) text_input_ids = text_inputs.input_ids - + text_embeddings = self.text_encoder(text_input_ids)[self._text_encoder_output] - + # duplicate text embeddings for each generation per prompt if num_images_per_prompt != 1: bs_embed, seq_len, _ = text_embeddings.shape text_embeddings = np.tile(text_embeddings, (1, num_images_per_prompt, 1)) text_embeddings = np.reshape(text_embeddings, (bs_embed * num_images_per_prompt, seq_len, -1)) - + # get unconditional embeddings for classifier free guidance if do_classifier_free_guidance: uncond_tokens: List[str] @@ -684,25 +684,25 @@ of the variational auto encoder. 
truncation=True, return_tensors="np", ) - + uncond_embeddings = self.text_encoder(uncond_input.input_ids)[self._text_encoder_output] - + # duplicate unconditional embeddings for each generation per prompt, using mps friendly method seq_len = uncond_embeddings.shape[1] uncond_embeddings = np.tile(uncond_embeddings, (1, num_images_per_prompt, 1)) uncond_embeddings = np.reshape(uncond_embeddings, (batch_size * num_images_per_prompt, seq_len, -1)) - + # For classifier free guidance, we need to do two forward passes. # Here we concatenate the unconditional and text embeddings into a single batch # to avoid doing two forward passes text_embeddings = np.concatenate([uncond_embeddings, text_embeddings]) - + return text_embeddings - + def prepare_latents(self, image: PIL.Image.Image = None, latent_timestep: torch.Tensor = None): """ Function for getting initial latents for starting generation - + Parameters: image (PIL.Image.Image, *optional*, None): Input image for generation, if not provided randon noise will be used as starting point @@ -723,12 +723,12 @@ of the variational auto encoder. latents = self.vae_encoder(input_image)[self._vae_e_output] * 0.18215 latents = self.scheduler.add_noise(torch.from_numpy(latents), torch.from_numpy(noise), latent_timestep).numpy() return latents, meta - + def postprocess_image(self, image: np.ndarray, meta: Dict, output_type: str = "pil"): """ Postprocessing for decoded image. Takes generated image decoded by VAE decoder, unpad it to initila image size (if required), normalize and convert to [0, 255] pixels range. Optionally, convertes it from np.ndarray to PIL.Image format - + Parameters: image (np.ndarray): Generated image @@ -760,12 +760,12 @@ of the variational auto encoder. orig_height, orig_width = meta["src_height"], meta["src_width"] image = [cv2.resize(img, (orig_width, orig_width)) for img in image] return image - + def get_timesteps(self, num_inference_steps: int, strength: float): """ Helper function for getting scheduler timesteps for generation In case of image-to-image generation, it updates number of steps according to strength - + Parameters: num_inference_steps (int): number of inference steps for generation @@ -775,10 +775,10 @@ of the variational auto encoder. """ # get the original timestep using init_timestep init_timestep = min(int(num_inference_steps * strength), num_inference_steps) - + t_start = max(num_inference_steps - init_timestep, 0) timesteps = self.scheduler.timesteps[t_start:] - + return timesteps, num_inference_steps - t_start Configure Inference Pipeline @@ -797,16 +797,16 @@ Select device from dropdown list for running inference using OpenVINO. .. code:: ipython3 import requests - + r = requests.get( url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/notebook_utils.py", ) open("notebook_utils.py", "w").write(r.text) - + from notebook_utils import device_widget - + device = device_widget() - + device @@ -821,7 +821,7 @@ Select device from dropdown list for running inference using OpenVINO. .. code:: ipython3 core = ov.Core() - + text_enc = core.compile_model(TEXT_ENCODER_OV_PATH, device.value) Calibrate UNet for GPU inference @@ -839,28 +839,28 @@ operations to be executed in full precision. 
import pickle import requests import os - + # Fetch `model_upcast_utils` which helps to restore accuracy when inferred on GPU r = requests.get("https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/model_upcast_utils.py") with open("model_upcast_utils.py", "w") as f: f.write(r.text) - + # Fetch an example input for UNet model needed for upcasting calibration process r = requests.get("https://storage.openvinotoolkit.org/repositories/openvino_notebooks/data/data/pkl/unet_calibration_example_input.pkl") with open("unet_calibration_example_input.pkl", "wb") as f: f.write(r.content) - + from model_upcast_utils import ( is_model_partially_upcasted, partially_upcast_nodes_to_fp32, ) - + unet_model = core.read_model(UNET_OV_PATH) if "GPU" in core.available_devices and not is_model_partially_upcasted(unet_model): with open("unet_calibration_example_input.pkl", "rb") as f: example_input = pickle.load(f) unet_model = partially_upcast_nodes_to_fp32(unet_model, example_input, upcast_ratio=0.7, operation_types=["Convolution"]) - + ov.save_model(unet_model, UNET_OV_PATH.with_suffix("._tmp.xml")) del unet_model os.remove(UNET_OV_PATH) @@ -875,7 +875,7 @@ operations to be executed in full precision. .. code:: ipython3 ov_config = {"INFERENCE_PRECISION_HINT": "f32"} if device.value != "CPU" else {} - + vae_decoder = core.compile_model(VAE_DECODER_OV_PATH, device.value, ov_config) vae_encoder = core.compile_model(VAE_ENCODER_OV_PATH, device.value, ov_config) @@ -886,10 +886,10 @@ Let us define them and put all components together from transformers import CLIPTokenizer from diffusers.schedulers import LMSDiscreteScheduler - + lms = LMSDiscreteScheduler(beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear") tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14") - + ov_pipe = OVStableDiffusionPipeline( tokenizer=tokenizer, text_encoder=text_enc, @@ -923,7 +923,7 @@ Now, let’s see model in action .. parsed-literal:: Pipeline settings - Input text: RAW studio photo of An intricate forest minitown landscape trapped in a bottle, atmospheric oliva lighting, on the table, intricate details, dark shot, soothing tones, muted colors + Input text: RAW studio photo of An intricate forest minitown landscape trapped in a bottle, atmospheric oliva lighting, on the table, intricate details, dark shot, soothing tones, muted colors Seed: 431 Number of steps: 20 @@ -961,7 +961,7 @@ Now is show time! .. parsed-literal:: Input text: - RAW studio photo of An intricate forest minitown landscape trapped in a bottle, atmospheric oliva lighting, on the table, intricate details, dark shot, soothing tones, muted colors + RAW studio photo of An intricate forest minitown landscape trapped in a bottle, atmospheric oliva lighting, on the table, intricate details, dark shot, soothing tones, muted colors @@ -1014,10 +1014,11 @@ found in this .. code:: ipython3 from diffusers.utils import load_image - + default_image_url = "https://user-images.githubusercontent.com/29454499/260418860-69cc443a-9ee6-493c-a393-3a97af080be7.jpg" + image_name = "default.jpg" # read uploaded image - image = load_image(default_image_url) + image = load_image(image_name, default_image_url) print("Pipeline settings") print(f"Input positive prompt: \n\t{text_prompt_i2i}") print(f"Input negative prompt: \n\t{negative_prompt_i2i}") @@ -1039,9 +1040,9 @@ found in this .. 
parsed-literal:: Pipeline settings - Input positive prompt: + Input positive prompt: professional photo portrait of woman, highly detailed, hyper realistic, cinematic effects, soft lighting - Input negative prompt: + Input negative prompt: blurry, poor quality, low res, worst quality, cropped, ugly, poorly drawn face, without eyes, mutation, unreal, animate, poorly drawn eyes Seed: 82698152 Number of steps: 40 @@ -1090,13 +1091,13 @@ Interactive Demo .. code:: ipython3 import gradio as gr - - + + def generate_from_text(text, negative_text, seed, num_steps, _=gr.Progress(track_tqdm=True)): result = ov_pipe(text, negative_prompt=negative_text, num_inference_steps=num_steps, seed=seed) return result["sample"][0] - - + + def generate_from_image(img, text, negative_text, seed, num_steps, strength, _=gr.Progress(track_tqdm=True)): result = ov_pipe( text, @@ -1113,11 +1114,11 @@ Interactive Demo if not Path("gradio_helper.py").exists(): r = requests.get(url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/notebooks/tiny-sd-image-generation/gradio_helper.py") open("gradio_helper.py", "w").write(r.text) - + from gradio_helper import make_demo - + demo = make_demo(text_to_text_fn=generate_from_text, image_to_image_fn=generate_from_image) - + try: demo.queue().launch(debug=False) except Exception: diff --git a/docs/notebooks/vehicle-detection-and-recognition-with-output.rst b/docs/notebooks/vehicle-detection-and-recognition-with-output.rst index 30204d6bd11ded..fab72fb0725d03 100644 --- a/docs/notebooks/vehicle-detection-and-recognition-with-output.rst +++ b/docs/notebooks/vehicle-detection-and-recognition-with-output.rst @@ -152,25 +152,25 @@ model is already downloaded, this step is skipped. .. parsed-literal:: - model/vehicle-detection-0200.xml: 0%| | 0.00/181k [00:00 - 100%|██████████| 4.68M/4.68M [00:00<00:00, 34.1MB/s] + 100%|██████████| 4.68M/4.68M [00:00<00:00, 17.2MB/s] .. parsed-literal:: @@ -215,13 +215,13 @@ next cell loads the model and the pre-trained weights. .. parsed-literal:: - Loading model weights from: 'model/u2net_lite/u2net_lite.pth' + /tmp/ipykernel_2254056/1036642300.py:7: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature. + net.load_state_dict(state_dict=torch.load(model_path, map_location="cpu")) .. parsed-literal:: - /tmp/ipykernel_3590581/1036642300.py:7: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). 
In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature. - net.load_state_dict(state_dict=torch.load(model_path, map_location="cpu")) + Loading model weights from: 'model/u2net_lite/u2net_lite.pth' @@ -247,7 +247,7 @@ OpenVINO IR format. Executing the following command may take a while. .. parsed-literal:: - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/notebooks/vision-background-removal/model/u2net.py:23: UserWarning: `nn.functional.upsample` is deprecated. Use `nn.functional.interpolate` instead. + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/notebooks/vision-background-removal/model/u2net.py:23: UserWarning: `nn.functional.upsample` is deprecated. Use `nn.functional.interpolate` instead. src = F.upsample(src,size=tar.shape[2:],mode='bilinear') @@ -273,12 +273,13 @@ repository `__ and multiplied by .. code:: ipython3 IMAGE_URI = "https://storage.openvinotoolkit.org/repositories/openvino_notebooks/data/data/image/coco_hollywood.jpg" + IMAGE_NAME = "coco_hollywood.jpg" input_mean = np.array([123.675, 116.28, 103.53]).reshape(1, 3, 1, 1) input_scale = np.array([58.395, 57.12, 57.375]).reshape(1, 3, 1, 1) image = cv2.cvtColor( - src=load_image(IMAGE_URI), + src=load_image(IMAGE_NAME, IMAGE_URI), code=cv2.COLOR_BGR2RGB, ) @@ -336,7 +337,7 @@ Load the OpenVINO IR model to OpenVINO Runtime and do inference. .. parsed-literal:: - Inference finished. Inference time: 0.107 seconds, FPS: 9.35. + Inference finished. Inference time: 0.109 seconds, FPS: 9.19. Visualize Results @@ -389,12 +390,13 @@ background pixels a value of 0. Replace the background image as follows: .. code:: ipython3 - BACKGROUND_FILE = "https://storage.openvinotoolkit.org/repositories/openvino_notebooks/data/data/image/wall.jpg" + BACKGROUND_IMAGE_URL = "https://storage.openvinotoolkit.org/repositories/openvino_notebooks/data/data/image/wall.jpg" + BACKGROUND_IMAGE_NAME = "wall.jpg" OUTPUT_DIR = "output" os.makedirs(name=OUTPUT_DIR, exist_ok=True) - background_image = cv2.cvtColor(src=load_image(BACKGROUND_FILE), code=cv2.COLOR_BGR2RGB) + background_image = cv2.cvtColor(src=load_image(BACKGROUND_IMAGE_NAME, BACKGROUND_IMAGE_URL), code=cv2.COLOR_BGR2RGB) background_image = cv2.resize(src=background_image, dsize=(image.shape[1], image.shape[0])) # Set all the foreground pixels from the result to 0 @@ -403,7 +405,7 @@ background pixels a value of 0. Replace the background image as follows: new_image = background_image + bg_removed_result # Save the generated image. 
- new_image_path = Path(f"{OUTPUT_DIR}/{Path(IMAGE_URI).stem}-{Path(BACKGROUND_FILE).stem}.jpg") + new_image_path = Path(f"{OUTPUT_DIR}/{Path(IMAGE_URI).stem}-{BACKGROUND_IMAGE_NAME}") cv2.imwrite(filename=str(new_image_path), img=cv2.cvtColor(new_image, cv2.COLOR_RGB2BGR)) # Display the original image and the image with the new background side by side diff --git a/docs/notebooks/vision-monodepth-with-output.rst b/docs/notebooks/vision-monodepth-with-output.rst index 736e0238989f8d..cf232c30fc9c73 100644 --- a/docs/notebooks/vision-monodepth-with-output.rst +++ b/docs/notebooks/vision-monodepth-with-output.rst @@ -146,11 +146,8 @@ format. ir_model_name_xml = "MiDaS_small.xml" ir_model_name_bin = "MiDaS_small.bin" - - if not (model_folder / ir_model_name_xml).exists(): - download_file(ir_model_url + ir_model_name_xml, filename=ir_model_name_xml, directory=model_folder) - if not (model_folder / ir_model_name_bin).exists(): - download_file(ir_model_url + ir_model_name_bin, filename=ir_model_name_bin, directory=model_folder) + download_file(ir_model_url + ir_model_name_xml, filename=ir_model_name_xml, directory=model_folder) + download_file(ir_model_url + ir_model_name_bin, filename=ir_model_name_bin, directory=model_folder) model_xml_path = model_folder / ir_model_name_xml @@ -158,13 +155,13 @@ format. .. parsed-literal:: - model/MiDaS_small.xml: 0%| | 0.00/268k [00:00 #0:0 (mpeg4 (native) -> h264 (libx264)) Stream #0:0 -> #0:1 (pcm_s16le (native) -> aac (native)) Press [q] to stop, [?] for help - [libx264 @ 0x556392e25840] -qscale is ignored, -crf is recommended. - [libx264 @ 0x556392e25840] using SAR=1/1 - [libx264 @ 0x556392e25840] using cpu capabilities: MMX2 SSE2Fast SSSE3 SSE4.2 AVX FMA3 BMI2 AVX2 AVX512 - [libx264 @ 0x556392e25840] profile High, level 3.1 - [libx264 @ 0x556392e25840] 264 - core 155 r2917 0a84d98 - H.264/MPEG-4 AVC codec - Copyleft 2003-2018 - http://www.videolan.org/x264.html - options: cabac=1 ref=3 deblock=1:0:0 analyse=0x3:0x113 me=hex subme=7 psy=1 psy_rd=1.00:0.00 mixed_ref=1 me_range=16 chroma_me=1 trellis=1 8x8dct=1 cqm=0 deadzone=21,11 fast_pskip=1 chroma_qp_offset=-2 threads=24 lookahead_threads=4 sliced_threads=0 nr=0 decimate=1 interlaced=0 bluray_compat=0 constrained_intra=0 bframes=3 b_pyramid=2 b_adapt=1 b_bias=0 direct=1 weightb=1 open_gop=0 weightp=2 keyint=250 keyint_min=25 scenecut=40 intra_refresh=0 rc_lookahead=40 rc=crf mbtree=1 crf=23.0 qcomp=0.60 qpmin=0 qpmax=69 qpstep=4 ip_ratio=1.40 aq=1:1.00 + [libx264 @ 0x55ec6513e840] -qscale is ignored, -crf is recommended. 
+ [libx264 @ 0x55ec6513e840] using SAR=1/1 + [libx264 @ 0x55ec6513e840] using cpu capabilities: MMX2 SSE2Fast SSSE3 SSE4.2 AVX FMA3 BMI2 AVX2 AVX512 + [libx264 @ 0x55ec6513e840] profile High, level 3.1 + [libx264 @ 0x55ec6513e840] 264 - core 155 r2917 0a84d98 - H.264/MPEG-4 AVC codec - Copyleft 2003-2018 - http://www.videolan.org/x264.html - options: cabac=1 ref=3 deblock=1:0:0 analyse=0x3:0x113 me=hex subme=7 psy=1 psy_rd=1.00:0.00 mixed_ref=1 me_range=16 chroma_me=1 trellis=1 8x8dct=1 cqm=0 deadzone=21,11 fast_pskip=1 chroma_qp_offset=-2 threads=24 lookahead_threads=4 sliced_threads=0 nr=0 decimate=1 interlaced=0 bluray_compat=0 constrained_intra=0 bframes=3 b_pyramid=2 b_adapt=1 b_bias=0 direct=1 weightb=1 open_gop=0 weightp=2 keyint=250 keyint_min=25 scenecut=40 intra_refresh=0 rc_lookahead=40 rc=crf mbtree=1 crf=23.0 qcomp=0.60 qpmin=0 qpmax=69 qpstep=4 ip_ratio=1.40 aq=1:1.00 Output #0, mp4, to 'results/result_voice.mp4': Metadata: encoder : Lavf58.29.100 @@ -349,27 +349,27 @@ python API and converted OpenVINO models. Stream #0:1: Audio: aac (LC) (mp4a / 0x6134706D), 44100 Hz, mono, fltp, 69 kb/s Metadata: encoder : Lavc58.54.100 aac - frame= 123 fps=0.0 q=-1.0 Lsize= 621kB time=00:00:05.06 bitrate=1005.8kbits/s speed=9.73x + frame= 123 fps=0.0 q=-1.0 Lsize= 621kB time=00:00:05.06 bitrate=1005.8kbits/s speed=10.6x video:573kB audio:43kB subtitle:0kB other streams:0kB global headers:0kB muxing overhead: 0.827166% - [libx264 @ 0x556392e25840] frame I:1 Avg QP:22.24 size: 31028 - [libx264 @ 0x556392e25840] frame P:75 Avg QP:22.01 size: 6954 - [libx264 @ 0x556392e25840] frame B:47 Avg QP:25.58 size: 718 - [libx264 @ 0x556392e25840] consecutive B-frames: 38.2% 27.6% 14.6% 19.5% - [libx264 @ 0x556392e25840] mb I I16..4: 14.0% 83.9% 2.1% - [libx264 @ 0x556392e25840] mb P I16..4: 1.3% 3.3% 0.1% P16..4: 37.8% 8.2% 6.4% 0.0% 0.0% skip:43.0% - [libx264 @ 0x556392e25840] mb B I16..4: 0.2% 0.7% 0.0% B16..8: 27.9% 0.4% 0.1% direct: 0.2% skip:70.6% L0:43.9% L1:54.2% BI: 1.9% - [libx264 @ 0x556392e25840] 8x8 transform intra:73.3% inter:77.1% - [libx264 @ 0x556392e25840] coded y,uvDC,uvAC intra: 56.9% 72.4% 8.1% inter: 11.4% 13.0% 0.2% - [libx264 @ 0x556392e25840] i16 v,h,dc,p: 20% 23% 9% 48% - [libx264 @ 0x556392e25840] i8 v,h,dc,ddl,ddr,vr,hd,vl,hu: 25% 23% 36% 3% 3% 2% 2% 3% 3% - [libx264 @ 0x556392e25840] i4 v,h,dc,ddl,ddr,vr,hd,vl,hu: 39% 14% 14% 4% 6% 7% 4% 9% 3% - [libx264 @ 0x556392e25840] i8c dc,h,v,p: 42% 25% 29% 4% - [libx264 @ 0x556392e25840] Weighted P-Frames: Y:0.0% UV:0.0% - [libx264 @ 0x556392e25840] ref P L0: 74.2% 10.4% 11.1% 4.3% - [libx264 @ 0x556392e25840] ref B L0: 86.1% 11.2% 2.8% - [libx264 @ 0x556392e25840] ref B L1: 98.3% 1.7% - [libx264 @ 0x556392e25840] kb/s:953.36 - [aac @ 0x556392e27140] Qavg: 121.673 + [libx264 @ 0x55ec6513e840] frame I:1 Avg QP:22.24 size: 31028 + [libx264 @ 0x55ec6513e840] frame P:75 Avg QP:22.01 size: 6954 + [libx264 @ 0x55ec6513e840] frame B:47 Avg QP:25.58 size: 718 + [libx264 @ 0x55ec6513e840] consecutive B-frames: 38.2% 27.6% 14.6% 19.5% + [libx264 @ 0x55ec6513e840] mb I I16..4: 14.0% 83.9% 2.1% + [libx264 @ 0x55ec6513e840] mb P I16..4: 1.3% 3.3% 0.1% P16..4: 37.8% 8.2% 6.4% 0.0% 0.0% skip:43.0% + [libx264 @ 0x55ec6513e840] mb B I16..4: 0.2% 0.7% 0.0% B16..8: 27.9% 0.4% 0.1% direct: 0.2% skip:70.6% L0:43.9% L1:54.2% BI: 1.9% + [libx264 @ 0x55ec6513e840] 8x8 transform intra:73.3% inter:77.1% + [libx264 @ 0x55ec6513e840] coded y,uvDC,uvAC intra: 56.9% 72.4% 8.1% inter: 11.4% 13.0% 0.2% + [libx264 @ 0x55ec6513e840] i16 v,h,dc,p: 20% 23% 9% 48% + 
[libx264 @ 0x55ec6513e840] i8 v,h,dc,ddl,ddr,vr,hd,vl,hu: 25% 23% 36% 3% 3% 2% 2% 3% 3% + [libx264 @ 0x55ec6513e840] i4 v,h,dc,ddl,ddr,vr,hd,vl,hu: 39% 14% 14% 4% 6% 7% 4% 9% 3% + [libx264 @ 0x55ec6513e840] i8c dc,h,v,p: 42% 25% 29% 4% + [libx264 @ 0x55ec6513e840] Weighted P-Frames: Y:0.0% UV:0.0% + [libx264 @ 0x55ec6513e840] ref P L0: 74.2% 10.4% 11.1% 4.3% + [libx264 @ 0x55ec6513e840] ref B L0: 86.1% 11.2% 2.8% + [libx264 @ 0x55ec6513e840] ref B L1: 98.3% 1.7% + [libx264 @ 0x55ec6513e840] kb/s:953.36 + [aac @ 0x55ec65140140] Qavg: 121.673 diff --git a/docs/notebooks/whisper-subtitles-generation-with-output.rst b/docs/notebooks/whisper-subtitles-generation-with-output.rst index 5a3c677fc27daa..a2764b4622bf67 100644 --- a/docs/notebooks/whisper-subtitles-generation-with-output.rst +++ b/docs/notebooks/whisper-subtitles-generation-with-output.rst @@ -79,15 +79,27 @@ Install dependencies. .. code:: ipython3 import platform + import importlib.metadata + import importlib.util %pip install -q "nncf>=2.14.0" %pip install -q -U "openvino>=2024.5.0" "openvino-tokenizers>=2024.5.0" "openvino-genai>=2024.5.0" %pip install -q "python-ffmpeg<=1.0.16" "ffmpeg" "moviepy" "transformers>=4.45" "git+https://github.com/huggingface/optimum-intel.git" "torch>=2.1" --extra-index-url https://download.pytorch.org/whl/cpu - %pip install -q -U "yt_dlp>=2024.8.6" soundfile librosa jiwer + %pip install -q -U "yt_dlp>=2024.8.6" soundfile librosa jiwer packaging %pip install -q "gradio>=4.19" "typing_extensions>=4.9" if platform.system() == "Darwin": %pip install -q "numpy<2.0" + + + from packaging import version + + if ( + importlib.util.find_spec("tensorflow") is not None + and version.parse(importlib.metadata.version("tensorflow")) < version.parse("2.18.0") + and version.parse(importlib.metadata.version("numpy")) >= version.parse("2.0.0") + ): + %pip uninstall -q -y tensorflow .. code:: ipython3 @@ -312,7 +324,10 @@ Select the task for the model: .. code:: ipython3 - from moviepy.editor import VideoFileClip + try: + from moviepy import VideoFileClip + except ImportError: + from moviepy.editor import VideoFileClip from transformers.pipelines.audio_utils import ffmpeg_read diff --git a/docs/notebooks/yolov10-optimization-with-output.rst b/docs/notebooks/yolov10-optimization-with-output.rst index dd7106d7a6bb56..99674518b41948 100644 --- a/docs/notebooks/yolov10-optimization-with-output.rst +++ b/docs/notebooks/yolov10-optimization-with-output.rst @@ -107,17 +107,6 @@ Prerequisites %pip install -q "git+https://github.com/THU-MIG/yolov10.git" --extra-index-url https://download.pytorch.org/whl/cpu %pip install -q "torch>=2.1" "torchvision>=0.16" tqdm opencv-python "gradio>=4.19" --extra-index-url https://download.pytorch.org/whl/cpu - -.. parsed-literal:: - - WARNING: Skipping openvino as it is not installed. - WARNING: Skipping openvino-dev as it is not installed. - Note: you may need to restart the kernel to use updated packages. - Note: you may need to restart the kernel to use updated packages. - Note: you may need to restart the kernel to use updated packages. - Note: you may need to restart the kernel to use updated packages. - - .. 
code:: ipython3 from pathlib import Path diff --git a/docs/notebooks/yolov11-instance-segmentation-with-output.rst b/docs/notebooks/yolov11-instance-segmentation-with-output.rst index 6c71d614e519db..0c346bf08cf3a6 100644 --- a/docs/notebooks/yolov11-instance-segmentation-with-output.rst +++ b/docs/notebooks/yolov11-instance-segmentation-with-output.rst @@ -128,18 +128,25 @@ Import required utility functions. The lower cell will download the # Download a test sample IMAGE_PATH = Path("./data/coco_bike.jpg") - if not IMAGE_PATH.exists(): - download_file( - url="https://storage.openvinotoolkit.org/repositories/openvino_notebooks/data/data/image/coco_bike.jpg", - filename=IMAGE_PATH.name, - directory=IMAGE_PATH.parent, - ) + download_file( + url="https://storage.openvinotoolkit.org/repositories/openvino_notebooks/data/data/image/coco_bike.jpg", + filename=IMAGE_PATH.name, + directory=IMAGE_PATH.parent, + ) + + + +.. parsed-literal:: + + coco_bike.jpg: 0%| | 0.00/182k [00:00=2023.3.0" "nncf>=2.8.1" "opencv-python" "matplotlib>=3.4" "seaborn" "pandas" "scikit-learn" "torch" "torchvision" "tqdm" --extra-index-url https://download.pytorch.org/whl/cpu @@ -97,7 +98,7 @@ Prerequisites .. parsed-literal:: - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/notebooks/yolov9-optimization/yolov9 + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/notebooks/yolov9-optimization/yolov9 Get PyTorch model @@ -128,14 +129,14 @@ applicable for other models from YOLO V9 family. .. parsed-literal:: - model/gelan-c.pt: 0%| | 0.00/49.1M [00:00 List[Node]: + current_directory = os.path.dirname(os.path.abspath(self.state.document.current_source)) + csv_file = os.path.normpath(os.path.join(current_directory, self.options['file'])) + if os.path.isfile(csv_file) is False: + self.warning("Cannot find the specified CSV file. " + "Please provide a correct path.") + csv_node = [] + with open(csv_file, 'r') as j: + csv_data = list(csv.reader(j)) + class_table_tag = ' class="' + "".join(c for c in str(self.options['class']) + '"') if 'class' in self.options is not None else "" + id_table_tag = ' id="' + "".join(c for c in str(self.options['name']) + '"') if 'name' in self.options is not None else "" + hidden_table_tag = ' data-columns-hidden="' + "".join(c for c in str(self.options['hidden']) + '"') if 'hidden' in self.options is not None else "" + csv_table_html = '' + head_rows = 0 + head_rows += self.options.get('header-rows', 0) + row_count = 0 + for row in csv_data[:head_rows]: + row_count += 1 + parity = "row-even" if row_count % 2 == 0 else "row-odd" + csv_table_html += '' + for value in row: + csv_table_html += '

<th><p>%s</p></th>
' % value + csv_table_html += '\n' + csv_table_html += '' + for row in csv_data[head_rows:]: + row_count += 1 + parity = "row-even" if row_count % 2 == 0 else "row-odd" + csv_table_html += '' + for value in row: + csv_table_html += '

<td><p>%s</p></td>
' % value + csv_table_html += '\n' + csv_table_html += "" + csv_table_html += '' + csv_node.append(nodes.raw(csv_table_html, csv_table_html, format="html")) + + return csv_node \ No newline at end of file diff --git a/docs/optimization_guide/nncf/code/pruning_tf.py b/docs/optimization_guide/nncf/code/pruning_tf.py index 4d2f5018961365..76b76174dc7429 100644 --- a/docs/optimization_guide/nncf/code/pruning_tf.py +++ b/docs/optimization_guide/nncf/code/pruning_tf.py @@ -40,22 +40,22 @@ #! [distributed] #! [tune_model] -... # fine-tuning preparations, e.g. dataset, loss, optimizer setup, etc. +... # fine-tuning preparations, e.g. dataset, loss, optimization setup, etc. # create compression callbacks to control pruning parameters and dump compression statistics -# all the setting are being taked from compression_ctrl, i.e. from NNCF config +# all the setting are being taked from compression_ctrl, i.e. from NNCF config compression_callbacks = create_compression_callbacks(compression_ctrl, log_dir="./compression_log") # tune quantized model for 50 epochs as the baseline -model.fit(train_dataset, epochs=50, callbacks=compression_callbacks) +model.fit(train_dataset, epochs=50, callbacks=compression_callbacks) #! [tune_model] #! [export] compression_ctrl.export_model("compressed_model.pb") #export to Frozen Graph -#! [export] +#! [export] #! [save_checkpoint] -from nncf.tensorflow.utils.state import TFCompressionState +from nncf.tensorflow.utils.state import TFCompressionState from nncf.tensorflow.callbacks.checkpoint_callback import CheckpointManagerCallback checkpoint = tf.train.Checkpoint(model=model, diff --git a/docs/optimization_guide/nncf/code/pruning_torch.py b/docs/optimization_guide/nncf/code/pruning_torch.py index 6bc1cae4319406..6b637881b5cfc9 100644 --- a/docs/optimization_guide/nncf/code/pruning_torch.py +++ b/docs/optimization_guide/nncf/code/pruning_torch.py @@ -30,7 +30,7 @@ #! [nncf_congig] #! [wrap_model] -model = TorchModel() # instance of torch.nn.Module +model = TorchModel() # instance of torch.nn.Module compression_ctrl, model = create_compressed_model(model, nncf_config) #! [wrap_model] @@ -39,7 +39,7 @@ #! [distributed] #! [tune_model] -... # fine-tuning preparations, e.g. dataset, loss, optimizer setup, etc. +... # fine-tuning preparations, e.g. dataset, loss, optimization setup, etc. # tune quantized model for 50 epochs as the baseline for epoch in range(0, 50): @@ -52,7 +52,7 @@ #! [export] compression_ctrl.export_model("compressed_model.onnx") -#! [export] +#! [export] #! [save_checkpoint] checkpoint = { @@ -65,8 +65,8 @@ #! [load_checkpoint] resuming_checkpoint = torch.load(path_to_checkpoint) -compression_state = resuming_checkpoint['compression_state'] +compression_state = resuming_checkpoint['compression_state'] compression_ctrl, model = create_compressed_model(model, nncf_config, compression_state=compression_state) -state_dict = resuming_checkpoint['state_dict'] +state_dict = resuming_checkpoint['state_dict'] model.load_state_dict(state_dict) #! [load_checkpoint] diff --git a/docs/optimization_guide/nncf/code/qat_tf.py b/docs/optimization_guide/nncf/code/qat_tf.py index e210b963d5a8f6..d8a20958cfbcc2 100644 --- a/docs/optimization_guide/nncf/code/qat_tf.py +++ b/docs/optimization_guide/nncf/code/qat_tf.py @@ -20,8 +20,8 @@ #! [nncf_congig] #! 
[wrap_model] -model = KerasModel() # instance of the tensorflow.keras.Model -compression_ctrl, model = create_compressed_model(model, nncf_config) +model = KerasModel() # instance of the tensorflow.keras.Model +compression_ctrl, model = create_compressed_model(model, nncf_config) #! [wrap_model] #! [distributed] @@ -29,7 +29,7 @@ #! [distributed] #! [tune_model] -... # fine-tuning preparations, e.g. dataset, loss, optimizer setup, etc. +... # fine-tuning preparations, e.g. dataset, loss, optimization setup, etc. # create compression callbacks to control optimization parameters and dump compression statistics compression_callbacks = create_compression_callbacks(compression_ctrl, log_dir="./compression_log") @@ -39,10 +39,10 @@ #! [export] compression_ctrl.export_model("compressed_model.pb") #export to Frozen Graph -#! [export] +#! [export] #! [save_checkpoint] -from nncf.tensorflow.utils.state import TFCompressionState +from nncf.tensorflow.utils.state import TFCompressionState from nncf.tensorflow.callbacks.checkpoint_callback import CheckpointManagerCallback checkpoint = tf.train.Checkpoint(model=model, diff --git a/docs/optimization_guide/nncf/code/qat_torch.py b/docs/optimization_guide/nncf/code/qat_torch.py index f80a7e8f9aea9f..71594635cb84fd 100644 --- a/docs/optimization_guide/nncf/code/qat_torch.py +++ b/docs/optimization_guide/nncf/code/qat_torch.py @@ -7,7 +7,7 @@ #! [quantize] #! [tune_model] -... # fine-tuning preparations, e.g. dataset, loss, optimizer setup, etc. +... # fine-tuning preparations, e.g. dataset, loss, optimization setup, etc. # tune quantized model for 5 epochs as the baseline for epoch in range(0, 5): diff --git a/docs/sphinx_setup/_static/benchmarks_files/OV-2024.5-Performance-Data.xlsx b/docs/sphinx_setup/_static/benchmarks_files/OV-2024.5-Performance-Data.xlsx deleted file mode 100644 index e5a6a4b039b029..00000000000000 Binary files a/docs/sphinx_setup/_static/benchmarks_files/OV-2024.5-Performance-Data.xlsx and /dev/null differ diff --git a/docs/sphinx_setup/_static/benchmarks_files/OV-2024.5-system-info-detailed.xlsx b/docs/sphinx_setup/_static/benchmarks_files/OV-2024.5-system-info-detailed.xlsx deleted file mode 100644 index 60150e1a457eaf..00000000000000 Binary files a/docs/sphinx_setup/_static/benchmarks_files/OV-2024.5-system-info-detailed.xlsx and /dev/null differ diff --git a/docs/sphinx_setup/_static/benchmarks_files/OV-2024.6-Performance-Data.xlsx b/docs/sphinx_setup/_static/benchmarks_files/OV-2024.6-Performance-Data.xlsx new file mode 100644 index 00000000000000..18d7f85164c73e Binary files /dev/null and b/docs/sphinx_setup/_static/benchmarks_files/OV-2024.6-Performance-Data.xlsx differ diff --git a/docs/sphinx_setup/_static/benchmarks_files/OV-2024.5-platform_list.pdf b/docs/sphinx_setup/_static/benchmarks_files/OV-2024.6-platform_list.pdf similarity index 51% rename from docs/sphinx_setup/_static/benchmarks_files/OV-2024.5-platform_list.pdf rename to docs/sphinx_setup/_static/benchmarks_files/OV-2024.6-platform_list.pdf index 9cb20570020cc8..0278be39133953 100644 Binary files a/docs/sphinx_setup/_static/benchmarks_files/OV-2024.5-platform_list.pdf and b/docs/sphinx_setup/_static/benchmarks_files/OV-2024.6-platform_list.pdf differ diff --git a/docs/sphinx_setup/_static/benchmarks_files/OV-2024.6-system-info-detailed.xlsx b/docs/sphinx_setup/_static/benchmarks_files/OV-2024.6-system-info-detailed.xlsx new file mode 100644 index 00000000000000..e4f41c2f778662 Binary files /dev/null and 
b/docs/sphinx_setup/_static/benchmarks_files/OV-2024.6-system-info-detailed.xlsx differ diff --git a/docs/sphinx_setup/_static/benchmarks_files/data/graph-data-ov.json b/docs/sphinx_setup/_static/benchmarks_files/data/graph-data-ov.json index 44b5b5707042df..c5cfca9df3f095 100644 --- a/docs/sphinx_setup/_static/benchmarks_files/data/graph-data-ov.json +++ b/docs/sphinx_setup/_static/benchmarks_files/data/graph-data-ov.json @@ -1,18 +1,18 @@ [ { - "Platform": "Intel® Arc™ A-Series Graphics dGPU", + "Platform": "Intel® Core™ Ultra 7 processor 155H CPU+iGPU", "Model": "bert-base-cased", "featured_SKU": true, "whats_new_model": false, - "PlatformType": "Accelerator Platforms", + "PlatformType": "Intel® Core™, CPU+iGPU", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 312.06, - "fp16": 345.49, - "fp32": "", + "int8": 245.16, + "fp16": "", + "fp32": 159.59, "bf16": "" } ], @@ -23,7 +23,7 @@ "Precisions": [ { "int4": "", - "int8": 4.83, + "int8": "", "fp16": "", "fp32": "", "bf16": "" @@ -35,19 +35,19 @@ } }, { - "Platform": "Intel® Arc™ A-Series Graphics dGPU", - "Model": "efficientdet-d0", + "Platform": "Intel® Core™ Ultra 7 processor 155H CPU+iGPU", + "Model": "mask_rcnn_resnet50_atrous_coco", "featured_SKU": true, "whats_new_model": false, - "PlatformType": "Accelerator Platforms", + "PlatformType": "Intel® Core™, CPU+iGPU", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 328.55, - "fp16": 285.3, - "fp32": "", + "int8": 2.49, + "fp16": "", + "fp32": 1.18, "bf16": "" } ], @@ -70,19 +70,19 @@ } }, { - "Platform": "Intel® Arc™ A-Series Graphics dGPU", - "Model": "gemma-2-9b", + "Platform": "Intel® Core™ Ultra 7 processor 155H CPU+iGPU", + "Model": "mobilenet-v2", "featured_SKU": true, - "whats_new_model": true, - "PlatformType": "Accelerator Platforms", + "whats_new_model": false, + "PlatformType": "Intel® Core™, CPU+iGPU", "Parameters": { "throughput": { "Precisions": [ { - "int4": 20.07, - "int8": 17.42, + "int4": "", + "int8": 4444.97, "fp16": "", - "fp32": "", + "fp32": 2395.07, "bf16": "" } ], @@ -92,8 +92,8 @@ "latency": { "Precisions": [ { - "int4": 49.81, - "int8": 57.4, + "int4": "", + "int8": "", "fp16": "", "fp32": "", "bf16": "" @@ -105,19 +105,19 @@ } }, { - "Platform": "Intel® Arc™ A-Series Graphics dGPU", - "Model": "glm-4-9b-chat", + "Platform": "Intel® Core™ Ultra 7 processor 155H CPU+iGPU", + "Model": "resnet-50", "featured_SKU": true, - "whats_new_model": "false", - "PlatformType": "Accelerator Platforms", + "whats_new_model": false, + "PlatformType": "Intel® Core™, CPU+iGPU", "Parameters": { "throughput": { "Precisions": [ { - "int4": 36.48, - "int8": 27.59, + "int4": "", + "int8": 1071.42, "fp16": "", - "fp32": "", + "fp32": 472.1, "bf16": "" } ], @@ -127,8 +127,8 @@ "latency": { "Precisions": [ { - "int4": 27.41, - "int8": 36.24, + "int4": "", + "int8": "", "fp16": "", "fp32": "", "bf16": "" @@ -140,19 +140,19 @@ } }, { - "Platform": "Intel® Arc™ A-Series Graphics dGPU", - "Model": "llama-2-7b-chat", + "Platform": "Intel® Core™ Ultra 7 processor 155H CPU+iGPU", + "Model": "ssd-resnet34-1200", "featured_SKU": true, "whats_new_model": false, - "PlatformType": "Accelerator Platforms", + "PlatformType": "Intel® Core™, CPU+iGPU", "Parameters": { "throughput": { "Precisions": [ { - "int4": 42.82, - "int8": 33.97, - "fp16": 22.23, - "fp32": "", + "int4": "", + "int8": "", + "fp16": "", + "fp32": 9.59, "bf16": "" } ], @@ -162,9 +162,9 @@ "latency": { "Precisions": [ { - "int4": 23.35, - "int8": 29.43, - "fp16": 44.97, + "int4": "", + 
"int8": "", + "fp16": "", "fp32": "", "bf16": "" } @@ -175,19 +175,19 @@ } }, { - "Platform": "Intel® Arc™ A-Series Graphics dGPU", - "Model": "llama-3-8b", + "Platform": "Intel® Core™ Ultra 7 processor 155H CPU+iGPU", + "Model": "ssd_mobilenet_v1_coco", "featured_SKU": true, - "whats_new_model": "false", - "PlatformType": "Accelerator Platforms", + "whats_new_model": false, + "PlatformType": "Intel® Core™, CPU+iGPU", "Parameters": { "throughput": { "Precisions": [ { - "int4": 39.6, - "int8": 30.59, + "int4": "", + "int8": 1104.79, "fp16": "", - "fp32": "", + "fp32": 622.5, "bf16": "" } ], @@ -197,8 +197,8 @@ "latency": { "Precisions": [ { - "int4": 25.25, - "int8": 32.69, + "int4": "", + "int8": "", "fp16": "", "fp32": "", "bf16": "" @@ -210,19 +210,19 @@ } }, { - "Platform": "Intel® Arc™ A-Series Graphics dGPU", - "Model": "llama-3.2-3b-instruct", + "Platform": "Intel® Core™ Ultra 7 processor 155H CPU+iGPU", + "Model": "yolo11", "featured_SKU": true, "whats_new_model": true, - "PlatformType": "Accelerator Platforms", + "PlatformType": "Intel® Core™, CPU+iGPU", "Parameters": { "throughput": { "Precisions": [ { - "int4": 55.37, - "int8": 51.62, - "fp16": 35.82, - "fp32": "", + "int4": "", + "int8": "", + "fp16": "", + "fp32": 275.86, "bf16": "" } ], @@ -232,9 +232,9 @@ "latency": { "Precisions": [ { - "int4": 18.06, - "int8": 19.37, - "fp16": 27.91, + "int4": "", + "int8": "", + "fp16": "", "fp32": "", "bf16": "" } @@ -245,19 +245,19 @@ } }, { - "Platform": "Intel® Arc™ A-Series Graphics dGPU", - "Model": "mask_rcnn_resnet50_atrous_coco", + "Platform": "Intel® Core™ Ultra 7 processor 155H CPU+iGPU", + "Model": "yolo_v8n", "featured_SKU": true, "whats_new_model": false, - "PlatformType": "Accelerator Platforms", + "PlatformType": "Intel® Core™, CPU+iGPU", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 34.84, - "fp16": 19.43, - "fp32": "", + "int8": 376.6, + "fp16": "", + "fp32": 235.44, "bf16": "" } ], @@ -268,7 +268,7 @@ "Precisions": [ { "int4": "", - "int8": 48.51, + "int8": "", "fp16": "", "fp32": "", "bf16": "" @@ -280,19 +280,19 @@ } }, { - "Platform": "Intel® Arc™ A-Series Graphics dGPU", - "Model": "mistral-7b-v0.1", + "Platform": "Intel® Core™ Ultra 7 processor 155H CPU-only", + "Model": "bert-base-cased", "featured_SKU": true, "whats_new_model": false, - "PlatformType": "Accelerator Platforms", + "PlatformType": "Intel® Core™, CPU-only", "Parameters": { "throughput": { "Precisions": [ { - "int4": 43.4, - "int8": 32.32, - "fp16": 20.91, - "fp32": "", + "int4": "", + "int8": 75.65, + "fp16": "", + "fp32": 30.06, "bf16": "" } ], @@ -302,9 +302,9 @@ "latency": { "Precisions": [ { - "int4": 23.04, - "int8": 30.94, - "fp16": 47.82, + "int4": "", + "int8": 24.61, + "fp16": "", "fp32": "", "bf16": "" } @@ -315,19 +315,19 @@ } }, { - "Platform": "Intel® Arc™ A-Series Graphics dGPU", - "Model": "mobilenet-v2", + "Platform": "Intel® Core™ Ultra 7 processor 155H CPU-only", + "Model": "efficientdet-d0", "featured_SKU": true, "whats_new_model": false, - "PlatformType": "Accelerator Platforms", + "PlatformType": "Intel® Core™, CPU-only", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 2348.6, - "fp16": 2074.34, - "fp32": "", + "int8": 96.99, + "fp16": "", + "fp32": 66.54, "bf16": "" } ], @@ -338,7 +338,7 @@ "Precisions": [ { "int4": "", - "int8": "", + "int8": 20.2, "fp16": "", "fp32": "", "bf16": "" @@ -350,19 +350,19 @@ } }, { - "Platform": "Intel® Arc™ A-Series Graphics dGPU", - "Model": "phi-3-mini-4k-instruct", + "Platform": "Intel® Core™ 
Ultra 7 processor 155H CPU-only", + "Model": "mask_rcnn_resnet50_atrous_coco", "featured_SKU": true, - "whats_new_model": true, - "PlatformType": "Accelerator Platforms", + "whats_new_model": false, + "PlatformType": "Intel® Core™, CPU-only", "Parameters": { "throughput": { "Precisions": [ { - "int4": 59.06, - "int8": 47.96, - "fp16": 29.29, - "fp32": "", + "int4": "", + "int8": 1.2, + "fp16": "", + "fp32": 0.3, "bf16": "" } ], @@ -372,9 +372,9 @@ "latency": { "Precisions": [ { - "int4": 16.93, - "int8": 20.85, - "fp16": 34.14, + "int4": "", + "int8": 973.29, + "fp16": "", "fp32": "", "bf16": "" } @@ -385,19 +385,19 @@ } }, { - "Platform": "Intel® Arc™ A-Series Graphics dGPU", - "Model": "qwen2-7b", + "Platform": "Intel® Core™ Ultra 7 processor 155H CPU-only", + "Model": "mobilenet-v2", "featured_SKU": true, - "whats_new_model": true, - "PlatformType": "Accelerator Platforms", + "whats_new_model": false, + "PlatformType": "Intel® Core™, CPU-only", "Parameters": { "throughput": { "Precisions": [ { - "int4": 40.48, - "int8": 32.79, - "fp16": 20.67, - "fp32": "", + "int4": "", + "int8": 1955.91, + "fp16": "", + "fp32": 810.96, "bf16": "" } ], @@ -407,9 +407,9 @@ "latency": { "Precisions": [ { - "int4": 24.7, - "int8": 30.49, - "fp16": 48.37, + "int4": "", + "int8": 1.36, + "fp16": "", "fp32": "", "bf16": "" } @@ -420,19 +420,19 @@ } }, { - "Platform": "Intel® Arc™ A-Series Graphics dGPU", + "Platform": "Intel® Core™ Ultra 7 processor 155H CPU-only", "Model": "resnet-50", "featured_SKU": true, "whats_new_model": false, - "PlatformType": "Accelerator Platforms", + "PlatformType": "Intel® Core™, CPU-only", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 1401.85, - "fp16": 1046.9, - "fp32": "", + "int8": 389.01, + "fp16": "", + "fp32": 94.24, "bf16": "" } ], @@ -443,7 +443,7 @@ "Precisions": [ { "int4": "", - "int8": 1.42, + "int8": 6.25, "fp16": "", "fp32": "", "bf16": "" @@ -455,19 +455,19 @@ } }, { - "Platform": "Intel® Arc™ A-Series Graphics dGPU", + "Platform": "Intel® Core™ Ultra 7 processor 155H CPU-only", "Model": "ssd-resnet34-1200", "featured_SKU": true, "whats_new_model": false, - "PlatformType": "Accelerator Platforms", + "PlatformType": "Intel® Core™, CPU-only", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 112.21, - "fp16": 73.01, - "fp32": "", + "int8": 6.3, + "fp16": "", + "fp32": 1.63, "bf16": "" } ], @@ -478,7 +478,7 @@ "Precisions": [ { "int4": "", - "int8": 14.86, + "int8": 199.13, "fp16": "", "fp32": "", "bf16": "" @@ -490,19 +490,19 @@ } }, { - "Platform": "Intel® Arc™ A-Series Graphics dGPU", + "Platform": "Intel® Core™ Ultra 7 processor 155H CPU-only", "Model": "ssd_mobilenet_v1_coco", "featured_SKU": true, "whats_new_model": false, - "PlatformType": "Accelerator Platforms", + "PlatformType": "Intel® Core™, CPU-only", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 1308.1, - "fp16": 1201.69, - "fp32": "", + "int8": 678.1, + "fp16": "", + "fp32": 241.46, "bf16": "" } ], @@ -513,7 +513,7 @@ "Precisions": [ { "int4": "", - "int8": "", + "int8": 2.67, "fp16": "", "fp32": "", "bf16": "" @@ -525,11 +525,11 @@ } }, { - "Platform": "Intel® Arc™ A-Series Graphics dGPU", - "Model": "stable-diffusion-v1-5", + "Platform": "Intel® Core™ Ultra 7 processor 155H CPU-only", + "Model": "yolo11", "featured_SKU": true, - "whats_new_model": false, - "PlatformType": "Accelerator Platforms", + "whats_new_model": true, + "PlatformType": "Intel® Core™, CPU-only", "Parameters": { "throughput": { "Precisions": [ @@ -537,7 
+537,7 @@ "int4": "", "int8": "", "fp16": "", - "fp32": "", + "fp32": 77.9, "bf16": "" } ], @@ -560,19 +560,19 @@ } }, { - "Platform": "Intel® Arc™ A-Series Graphics dGPU", + "Platform": "Intel® Core™ Ultra 7 processor 155H CPU-only", "Model": "yolo_v8n", "featured_SKU": true, "whats_new_model": false, - "PlatformType": "Accelerator Platforms", + "PlatformType": "Intel® Core™, CPU-only", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 517.1, - "fp16": 550.33, - "fp32": "", + "int8": 166.06, + "fp16": "", + "fp32": 64.29, "bf16": "" } ], @@ -583,7 +583,7 @@ "Precisions": [ { "int4": "", - "int8": 3.21, + "int8": 12.57, "fp16": "", "fp32": "", "bf16": "" @@ -595,19 +595,19 @@ } }, { - "Platform": "Intel® Atom® X6425E CPU+iGPU", - "Model": "efficientdet-d0", - "featured_SKU": false, + "Platform": "Intel® Core™ Ultra 7 processor 155H NPU-only", + "Model": "bert-base-cased", + "featured_SKU": true, "whats_new_model": false, - "PlatformType": "Intel® Atom™, CPU+iGPU", + "PlatformType": "Intel® Core™, NPU-only", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 23.3, - "fp16": "", - "fp32": 23.72, + "int8": 90.01, + "fp16": 73.98, + "fp32": "", "bf16": "" } ], @@ -618,7 +618,7 @@ "Precisions": [ { "int4": "", - "int8": "", + "int8": 11.94, "fp16": "", "fp32": "", "bf16": "" @@ -630,19 +630,19 @@ } }, { - "Platform": "Intel® Atom® X6425E CPU+iGPU", - "Model": "mobilenet-v2", - "featured_SKU": false, + "Platform": "Intel® Core™ Ultra 7 processor 155H NPU-only", + "Model": "efficientdet-d0", + "featured_SKU": true, "whats_new_model": false, - "PlatformType": "Intel® Atom™, CPU+iGPU", + "PlatformType": "Intel® Core™, NPU-only", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 228.97, - "fp16": "", - "fp32": 219.37, + "int8": 40.21, + "fp16": 36.53, + "fp32": "", "bf16": "" } ], @@ -653,7 +653,7 @@ "Precisions": [ { "int4": "", - "int8": "", + "int8": 25.76, "fp16": "", "fp32": "", "bf16": "" @@ -665,19 +665,19 @@ } }, { - "Platform": "Intel® Atom® X6425E CPU+iGPU", - "Model": "resnet-50", - "featured_SKU": false, + "Platform": "Intel® Core™ Ultra 7 processor 155H NPU-only", + "Model": "mobilenet-v2", + "featured_SKU": true, "whats_new_model": false, - "PlatformType": "Intel® Atom™, CPU+iGPU", + "PlatformType": "Intel® Core™, NPU-only", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 59.38, - "fp16": "", - "fp32": 54.24, + "int8": 1964.23, + "fp16": 1350.52, + "fp32": "", "bf16": "" } ], @@ -688,7 +688,7 @@ "Precisions": [ { "int4": "", - "int8": "", + "int8": 0.77, "fp16": "", "fp32": "", "bf16": "" @@ -700,19 +700,19 @@ } }, { - "Platform": "Intel® Atom® X6425E CPU+iGPU", - "Model": "ssd-resnet34-1200", - "featured_SKU": false, + "Platform": "Intel® Core™ Ultra 7 processor 155H NPU-only", + "Model": "resnet-50", + "featured_SKU": true, "whats_new_model": false, - "PlatformType": "Intel® Atom™, CPU+iGPU", + "PlatformType": "Intel® Core™, NPU-only", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 1.26, - "fp16": "", - "fp32": 1.08, + "int8": 774.88, + "fp16": 383.87, + "fp32": "", "bf16": "" } ], @@ -723,7 +723,7 @@ "Precisions": [ { "int4": "", - "int8": "", + "int8": 1.56, "fp16": "", "fp32": "", "bf16": "" @@ -735,19 +735,19 @@ } }, { - "Platform": "Intel® Atom® X6425E CPU+iGPU", + "Platform": "Intel® Core™ Ultra 7 processor 155H NPU-only", "Model": "ssd_mobilenet_v1_coco", - "featured_SKU": false, + "featured_SKU": true, "whats_new_model": false, - "PlatformType": "Intel® Atom™, 
CPU+iGPU", + "PlatformType": "Intel® Core™, NPU-only", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 111.92, + "int8": 699.97, "fp16": "", - "fp32": 98.44, + "fp32": "", "bf16": "" } ], @@ -758,7 +758,7 @@ "Precisions": [ { "int4": "", - "int8": "", + "int8": 1.66, "fp16": "", "fp32": "", "bf16": "" @@ -770,11 +770,11 @@ } }, { - "Platform": "Intel® Atom® X6425E CPU+iGPU", + "Platform": "Intel® Core™ Ultra 7 processor 155H NPU-only", "Model": "yolo11", - "featured_SKU": false, - "whats_new_model": "false", - "PlatformType": "Intel® Atom™, CPU+iGPU", + "featured_SKU": true, + "whats_new_model": true, + "PlatformType": "Intel® Core™, NPU-only", "Parameters": { "throughput": { "Precisions": [ @@ -782,7 +782,7 @@ "int4": "", "int8": "", "fp16": "", - "fp32": 34.99, + "fp32": 138.08, "bf16": "" } ], @@ -805,19 +805,19 @@ } }, { - "Platform": "Intel® Atom® X6425E CPU+iGPU", + "Platform": "Intel® Core™ Ultra 7 processor 155H NPU-only", "Model": "yolo_v8n", - "featured_SKU": false, + "featured_SKU": true, "whats_new_model": false, - "PlatformType": "Intel® Atom™, CPU+iGPU", + "PlatformType": "Intel® Core™, NPU-only", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 36.35, - "fp16": "", - "fp32": 33.97, + "int8": 138.71, + "fp16": 142.27, + "fp32": "", "bf16": "" } ], @@ -828,7 +828,7 @@ "Precisions": [ { "int4": "", - "int8": "", + "int8": 7.93, "fp16": "", "fp32": "", "bf16": "" @@ -839,32 +839,32 @@ } } }, - { - "Platform": "Intel® Atom® X6425E CPU-only", - "Model": "efficientdet-d0", - "featured_SKU": false, + { + "Platform": "Intel® Core™ Ultra 7 processor 155H NPU-only", + "Model": "llama-2-7b-chat", + "featured_SKU": true, "whats_new_model": false, - "PlatformType": "Intel® Atom™, CPU-only", + "PlatformType": "Intel® Core™, NPU-only", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 7.26, - "fp16": "", - "fp32": 5.01, + "int8": "", + "fp16": 3.12, + "fp32": "", "bf16": "" } ], - "Unit": "FPS", + "Unit": "Tokens per sec", "UnitDesc": "higher is better" }, "latency": { "Precisions": [ { "int4": "", - "int8": 139.68, - "fp16": "", + "int8": "", + "fp16": 320.24, "fp32": "", "bf16": "" } @@ -875,31 +875,31 @@ } }, { - "Platform": "Intel® Atom® X6425E CPU-only", - "Model": "mobilenet-v2", - "featured_SKU": false, - "whats_new_model": false, - "PlatformType": "Intel® Atom™, CPU-only", + "Platform": "Intel® Core™ Ultra 7 processor 155H NPU-only", + "Model": "phi-3-mini-4k-instruct", + "featured_SKU": true, + "whats_new_model": true, + "PlatformType": "Intel® Core™, NPU-only", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 134.16, - "fp16": "", - "fp32": 80.45, + "int8": 0.5, + "fp16": 4.8, + "fp32": "", "bf16": "" } ], - "Unit": "FPS", + "Unit": "Tokens per sec", "UnitDesc": "higher is better" }, "latency": { "Precisions": [ { "int4": "", - "int8": 7.8, - "fp16": "", + "int8": 1963.5, + "fp16": 208.07, "fp32": "", "bf16": "" } @@ -909,20 +909,20 @@ } } }, - { - "Platform": "Intel® Atom® X6425E CPU-only", - "Model": "resnet-50", - "featured_SKU": false, + { + "Platform": "Intel® Core™ Ultra 9 processor 288V CPU+iGPU", + "Model": "bert-base-cased", + "featured_SKU": true, "whats_new_model": false, - "PlatformType": "Intel® Atom™, CPU-only", + "PlatformType": "Intel® Core™, CPU+iGPU", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 19.87, + "int8": 219.48, "fp16": "", - "fp32": 8.15, + "fp32": 159.21, "bf16": "" } ], @@ -933,7 +933,7 @@ "Precisions": [ { "int4": "", - "int8": 51.33, 
+ "int8": "", "fp16": "", "fp32": "", "bf16": "" @@ -945,19 +945,19 @@ } }, { - "Platform": "Intel® Atom® X6425E CPU-only", - "Model": "ssd-resnet34-1200", - "featured_SKU": false, + "Platform": "Intel® Core™ Ultra 9 processor 288V CPU+iGPU", + "Model": "efficientdet-d0", + "featured_SKU": true, "whats_new_model": false, - "PlatformType": "Intel® Atom™, CPU-only", + "PlatformType": "Intel® Core™, CPU+iGPU", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 0.33, + "int8": 188.23, "fp16": "", - "fp32": 0.13, + "fp32": 149.87, "bf16": "" } ], @@ -968,7 +968,7 @@ "Precisions": [ { "int4": "", - "int8": 2995.1, + "int8": "", "fp16": "", "fp32": "", "bf16": "" @@ -980,19 +980,19 @@ } }, { - "Platform": "Intel® Atom® X6425E CPU-only", - "Model": "ssd_mobilenet_v1_coco", - "featured_SKU": false, + "Platform": "Intel® Core™ Ultra 9 processor 288V CPU+iGPU", + "Model": "mask_rcnn_resnet50_atrous_coco", + "featured_SKU": true, "whats_new_model": false, - "PlatformType": "Intel® Atom™, CPU-only", + "PlatformType": "Intel® Core™, CPU+iGPU", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 45.84, + "int8": 6.52, "fp16": "", - "fp32": 21.63, + "fp32": 3.46, "bf16": "" } ], @@ -1003,7 +1003,7 @@ "Precisions": [ { "int4": "", - "int8": 22.72, + "int8": "", "fp16": "", "fp32": "", "bf16": "" @@ -1015,19 +1015,19 @@ } }, { - "Platform": "Intel® Atom® X6425E CPU-only", - "Model": "yolo11", - "featured_SKU": false, - "whats_new_model": "false", - "PlatformType": "Intel® Atom™, CPU-only", + "Platform": "Intel® Core™ Ultra 9 processor 288V CPU+iGPU", + "Model": "mobilenet-v2", + "featured_SKU": true, + "whats_new_model": false, + "PlatformType": "Intel® Core™, CPU+iGPU", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": "", + "int8": 1821.48, "fp16": "", - "fp32": 5.3, + "fp32": 1146.43, "bf16": "" } ], @@ -1050,19 +1050,19 @@ } }, { - "Platform": "Intel® Atom® X6425E CPU-only", - "Model": "yolo_v8n", - "featured_SKU": false, + "Platform": "Intel® Core™ Ultra 9 processor 288V CPU+iGPU", + "Model": "resnet-50", + "featured_SKU": true, "whats_new_model": false, - "PlatformType": "Intel® Atom™, CPU-only", + "PlatformType": "Intel® Core™, CPU+iGPU", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 10.31, + "int8": 771.39, "fp16": "", - "fp32": 5.12, + "fp32": 349.5, "bf16": "" } ], @@ -1073,7 +1073,7 @@ "Precisions": [ { "int4": "", - "int8": 99.61, + "int8": "", "fp16": "", "fp32": "", "bf16": "" @@ -1085,19 +1085,19 @@ } }, { - "Platform": "Intel® Atom® X6425E iGPU-only", - "Model": "efficientdet-d0", - "featured_SKU": false, + "Platform": "Intel® Core™ Ultra 9 processor 288V CPU+iGPU", + "Model": "ssd-resnet34-1200", + "featured_SKU": true, "whats_new_model": false, - "PlatformType": "Intel® Atom™, iGPU-only", + "PlatformType": "Intel® Core™, CPU+iGPU", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 22.02, - "fp16": 25.05, - "fp32": "", + "int8": 34.7, + "fp16": "", + "fp32": 17.75, "bf16": "" } ], @@ -1108,7 +1108,7 @@ "Precisions": [ { "int4": "", - "int8": 60.1, + "int8": "", "fp16": "", "fp32": "", "bf16": "" @@ -1120,19 +1120,19 @@ } }, { - "Platform": "Intel® Atom® X6425E iGPU-only", - "Model": "mobilenet-v2", - "featured_SKU": false, + "Platform": "Intel® Core™ Ultra 9 processor 288V CPU+iGPU", + "Model": "ssd_mobilenet_v1_coco", + "featured_SKU": true, "whats_new_model": false, - "PlatformType": "Intel® Atom™, iGPU-only", + "PlatformType": "Intel® Core™, CPU+iGPU", "Parameters": { 
"throughput": { "Precisions": [ { "int4": "", - "int8": 187.37, - "fp16": 222.58, - "fp32": "", + "int8": 988.89, + "fp16": "", + "fp32": 601.58, "bf16": "" } ], @@ -1143,7 +1143,7 @@ "Precisions": [ { "int4": "", - "int8": 7.71, + "int8": "", "fp16": "", "fp32": "", "bf16": "" @@ -1155,19 +1155,19 @@ } }, { - "Platform": "Intel® Atom® X6425E iGPU-only", - "Model": "resnet-50", - "featured_SKU": false, - "whats_new_model": false, - "PlatformType": "Intel® Atom™, iGPU-only", + "Platform": "Intel® Core™ Ultra 9 processor 288V CPU+iGPU", + "Model": "yolo11", + "featured_SKU": true, + "whats_new_model": true, + "PlatformType": "Intel® Core™, CPU+iGPU", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 48.1, - "fp16": 51.68, - "fp32": "", + "int8": "", + "fp16": "", + "fp32": 293.1, "bf16": "" } ], @@ -1178,7 +1178,7 @@ "Precisions": [ { "int4": "", - "int8": 22.89, + "int8": "", "fp16": "", "fp32": "", "bf16": "" @@ -1190,19 +1190,19 @@ } }, { - "Platform": "Intel® Atom® X6425E iGPU-only", - "Model": "ssd-resnet34-1200", - "featured_SKU": false, + "Platform": "Intel® Core™ Ultra 9 processor 288V CPU+iGPU", + "Model": "yolo_v8n", + "featured_SKU": true, "whats_new_model": false, - "PlatformType": "Intel® Atom™, iGPU-only", + "PlatformType": "Intel® Core™, CPU+iGPU", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 1.16, - "fp16": 1.16, - "fp32": "", + "int8": 360.74, + "fp16": "", + "fp32": 260.26, "bf16": "" } ], @@ -1213,7 +1213,7 @@ "Precisions": [ { "int4": "", - "int8": 870.65, + "int8": "", "fp16": "", "fp32": "", "bf16": "" @@ -1225,19 +1225,19 @@ } }, { - "Platform": "Intel® Atom® X6425E iGPU-only", - "Model": "ssd_mobilenet_v1_coco", - "featured_SKU": false, + "Platform": "Intel® Core™ Ultra 9 processor 288V CPU-only", + "Model": "bert-base-cased", + "featured_SKU": true, "whats_new_model": false, - "PlatformType": "Intel® Atom™, iGPU-only", + "PlatformType": "Intel® Core™, CPU-only", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 93.36, - "fp16": 95.62, - "fp32": "", + "int8": 73.65, + "fp16": "", + "fp32": 26.04, "bf16": "" } ], @@ -1248,7 +1248,7 @@ "Precisions": [ { "int4": "", - "int8": 13.54, + "int8": 19.73, "fp16": "", "fp32": "", "bf16": "" @@ -1260,19 +1260,19 @@ } }, { - "Platform": "Intel® Atom® X6425E iGPU-only", - "Model": "yolo_v8n", - "featured_SKU": false, + "Platform": "Intel® Core™ Ultra 9 processor 288V CPU-only", + "Model": "efficientdet-d0", + "featured_SKU": true, "whats_new_model": false, - "PlatformType": "Intel® Atom™, iGPU-only", + "PlatformType": "Intel® Core™, CPU-only", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 31.79, - "fp16": 33.13, - "fp32": "", + "int8": 92.68, + "fp16": "", + "fp32": 60.94, "bf16": "" } ], @@ -1283,7 +1283,7 @@ "Precisions": [ { "int4": "", - "int8": 35.83, + "int8": 13.55, "fp16": "", "fp32": "", "bf16": "" @@ -1295,19 +1295,19 @@ } }, { - "Platform": "Intel® Atom® x7425E CPU+iGPU", - "Model": "efficientdet-d0", - "featured_SKU": false, + "Platform": "Intel® Core™ Ultra 9 processor 288V CPU-only", + "Model": "mask_rcnn_resnet50_atrous_coco", + "featured_SKU": true, "whats_new_model": false, - "PlatformType": "Intel® Atom™, CPU+iGPU", + "PlatformType": "Intel® Core™, CPU-only", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 39.3, + "int8": 1.11, "fp16": "", - "fp32": 28.97, + "fp32": 0.25, "bf16": "" } ], @@ -1318,7 +1318,7 @@ "Precisions": [ { "int4": "", - "int8": "", + "int8": 1258.55, "fp16": "", "fp32": "", 
"bf16": "" @@ -1330,19 +1330,19 @@ } }, { - "Platform": "Intel® Atom® x7425E CPU+iGPU", + "Platform": "Intel® Core™ Ultra 9 processor 288V CPU-only", "Model": "mobilenet-v2", - "featured_SKU": false, + "featured_SKU": true, "whats_new_model": false, - "PlatformType": "Intel® Atom™, CPU+iGPU", + "PlatformType": "Intel® Core™, CPU-only", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 480.45, + "int8": 1664.09, "fp16": "", - "fp32": 302.75, + "fp32": 775.16, "bf16": "" } ], @@ -1353,7 +1353,7 @@ "Precisions": [ { "int4": "", - "int8": "", + "int8": 1.02, "fp16": "", "fp32": "", "bf16": "" @@ -1365,19 +1365,19 @@ } }, { - "Platform": "Intel® Atom® x7425E CPU+iGPU", + "Platform": "Intel® Core™ Ultra 9 processor 288V CPU-only", "Model": "resnet-50", - "featured_SKU": false, + "featured_SKU": true, "whats_new_model": false, - "PlatformType": "Intel® Atom™, CPU+iGPU", + "PlatformType": "Intel® Core™, CPU-only", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 129.7, + "int8": 330.37, "fp16": "", - "fp32": 54.69, + "fp32": 84.22, "bf16": "" } ], @@ -1388,7 +1388,7 @@ "Precisions": [ { "int4": "", - "int8": "", + "int8": 4.62, "fp16": "", "fp32": "", "bf16": "" @@ -1400,19 +1400,19 @@ } }, { - "Platform": "Intel® Atom® x7425E CPU+iGPU", + "Platform": "Intel® Core™ Ultra 9 processor 288V CPU-only", "Model": "ssd-resnet34-1200", - "featured_SKU": false, + "featured_SKU": true, "whats_new_model": false, - "PlatformType": "Intel® Atom™, CPU+iGPU", + "PlatformType": "Intel® Core™, CPU-only", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 2.49, + "int8": 6.04, "fp16": "", - "fp32": 0.86, + "fp32": 1.48, "bf16": "" } ], @@ -1423,7 +1423,7 @@ "Precisions": [ { "int4": "", - "int8": "", + "int8": 235.82, "fp16": "", "fp32": "", "bf16": "" @@ -1435,19 +1435,19 @@ } }, { - "Platform": "Intel® Atom® x7425E CPU+iGPU", + "Platform": "Intel® Core™ Ultra 9 processor 288V CPU-only", "Model": "ssd_mobilenet_v1_coco", - "featured_SKU": false, + "featured_SKU": true, "whats_new_model": false, - "PlatformType": "Intel® Atom™, CPU+iGPU", + "PlatformType": "Intel® Core™, CPU-only", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 233.16, + "int8": 664.52, "fp16": "", - "fp32": 114.81, + "fp32": 225.17, "bf16": "" } ], @@ -1458,7 +1458,7 @@ "Precisions": [ { "int4": "", - "int8": "", + "int8": 2.26, "fp16": "", "fp32": "", "bf16": "" @@ -1470,11 +1470,11 @@ } }, { - "Platform": "Intel® Atom® x7425E CPU+iGPU", + "Platform": "Intel® Core™ Ultra 9 processor 288V CPU-only", "Model": "yolo11", - "featured_SKU": false, - "whats_new_model": "false", - "PlatformType": "Intel® Atom™, CPU+iGPU", + "featured_SKU": true, + "whats_new_model": true, + "PlatformType": "Intel® Core™, CPU-only", "Parameters": { "throughput": { "Precisions": [ @@ -1482,7 +1482,7 @@ "int4": "", "int8": "", "fp16": "", - "fp32": 41.37, + "fp32": 71.89, "bf16": "" } ], @@ -1505,19 +1505,19 @@ } }, { - "Platform": "Intel® Atom® x7425E CPU+iGPU", + "Platform": "Intel® Core™ Ultra 9 processor 288V CPU-only", "Model": "yolo_v8n", - "featured_SKU": false, + "featured_SKU": true, "whats_new_model": false, - "PlatformType": "Intel® Atom™, CPU+iGPU", + "PlatformType": "Intel® Core™, CPU-only", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 67.73, + "int8": 139.66, "fp16": "", - "fp32": 36.05, + "fp32": 58.69, "bf16": "" } ], @@ -1528,7 +1528,7 @@ "Precisions": [ { "int4": "", - "int8": "", + "int8": 9.49, "fp16": "", "fp32": "", "bf16": "" @@ 
-1540,19 +1540,19 @@ } }, { - "Platform": "Intel® Atom® x7425E CPU-only", - "Model": "efficientdet-d0", - "featured_SKU": false, + "Platform": "Intel® Core™ Ultra 9 processor 288V NPU-only", + "Model": "bert-base-cased", + "featured_SKU": true, "whats_new_model": false, - "PlatformType": "Intel® Atom™, CPU-only", + "PlatformType": "Intel® Core™, NPU-only", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 14.29, - "fp16": "", - "fp32": 11.18, + "int8": 260.56, + "fp16": 194.71, + "fp32": "", "bf16": "" } ], @@ -1563,7 +1563,7 @@ "Precisions": [ { "int4": "", - "int8": 71.84, + "int8": 4.83, "fp16": "", "fp32": "", "bf16": "" @@ -1575,19 +1575,19 @@ } }, { - "Platform": "Intel® Atom® x7425E CPU-only", - "Model": "mobilenet-v2", - "featured_SKU": false, + "Platform": "Intel® Core™ Ultra 9 processor 288V NPU-only", + "Model": "efficientdet-d0", + "featured_SKU": true, "whats_new_model": false, - "PlatformType": "Intel® Atom™, CPU-only", + "PlatformType": "Intel® Core™, NPU-only", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 273.98, - "fp16": "", - "fp32": 169.54, + "int8": 12.97, + "fp16": 13.56, + "fp32": "", "bf16": "" } ], @@ -1598,7 +1598,7 @@ "Precisions": [ { "int4": "", - "int8": 4.05, + "int8": 127.7, "fp16": "", "fp32": "", "bf16": "" @@ -1610,31 +1610,31 @@ } }, { - "Platform": "Intel® Atom® x7425E CPU-only", - "Model": "resnet-50", - "featured_SKU": false, + "Platform": "Intel® Core™ Ultra 9 processor 288V NPU-only", + "Model": "llama-2-7b-chat", + "featured_SKU": true, "whats_new_model": false, - "PlatformType": "Intel® Atom™, CPU-only", + "PlatformType": "Intel® Core™, NPU-only", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 45.27, - "fp16": "", - "fp32": 18.84, + "int8": "", + "fp16": 5.09, + "fp32": "", "bf16": "" } ], - "Unit": "FPS", + "Unit": "Tokens per sec", "UnitDesc": "higher is better" }, "latency": { "Precisions": [ { "int4": "", - "int8": 23.76, - "fp16": "", + "int8": "", + "fp16": 196.34, "fp32": "", "bf16": "" } @@ -1645,19 +1645,19 @@ } }, { - "Platform": "Intel® Atom® x7425E CPU-only", - "Model": "ssd-resnet34-1200", - "featured_SKU": false, + "Platform": "Intel® Core™ Ultra 9 processor 288V NPU-only", + "Model": "mobilenet-v2", + "featured_SKU": true, "whats_new_model": false, - "PlatformType": "Intel® Atom™, CPU-only", + "PlatformType": "Intel® Core™, NPU-only", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 0.76, - "fp16": "", - "fp32": 0.31, + "int8": 3708.8, + "fp16": 3055.93, + "fp32": "", "bf16": "" } ], @@ -1668,7 +1668,7 @@ "Precisions": [ { "int4": "", - "int8": 1317.43, + "int8": 0.39, "fp16": "", "fp32": "", "bf16": "" @@ -1680,31 +1680,31 @@ } }, { - "Platform": "Intel® Atom® x7425E CPU-only", - "Model": "ssd_mobilenet_v1_coco", - "featured_SKU": false, - "whats_new_model": false, - "PlatformType": "Intel® Atom™, CPU-only", + "Platform": "Intel® Core™ Ultra 9 processor 288V NPU-only", + "Model": "phi-3-mini-4k-instruct", + "featured_SKU": true, + "whats_new_model": true, + "PlatformType": "Intel® Core™, NPU-only", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 98.2, - "fp16": "", - "fp32": 45.36, + "int8": 0.5, + "fp16": 8.5, + "fp32": "", "bf16": "" } ], - "Unit": "FPS", + "Unit": "Tokens per sec", "UnitDesc": "higher is better" }, "latency": { "Precisions": [ { "int4": "", - "int8": 10.52, - "fp16": "", + "int8": 1995.15, + "fp16": 117.55, "fp32": "", "bf16": "" } @@ -1715,19 +1715,19 @@ } }, { - "Platform": "Intel® Atom® 
x7425E CPU-only", - "Model": "yolo11", - "featured_SKU": false, - "whats_new_model": "false", - "PlatformType": "Intel® Atom™, CPU-only", + "Platform": "Intel® Core™ Ultra 9 processor 288V NPU-only", + "Model": "resnet-50", + "featured_SKU": true, + "whats_new_model": false, + "PlatformType": "Intel® Core™, NPU-only", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": "", - "fp16": "", - "fp32": 13.77, + "int8": 2072.98, + "fp16": 998.42, + "fp32": "", "bf16": "" } ], @@ -1738,7 +1738,7 @@ "Precisions": [ { "int4": "", - "int8": "", + "int8": 0.74, "fp16": "", "fp32": "", "bf16": "" @@ -1750,19 +1750,19 @@ } }, { - "Platform": "Intel® Atom® x7425E CPU-only", - "Model": "yolo_v8n", - "featured_SKU": false, + "Platform": "Intel® Core™ Ultra 9 processor 288V NPU-only", + "Model": "ssd_mobilenet_v1_coco", + "featured_SKU": true, "whats_new_model": false, - "PlatformType": "Intel® Atom™, CPU-only", + "PlatformType": "Intel® Core™, NPU-only", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 21.58, - "fp16": "", - "fp32": 11.78, + "int8": 231.17, + "fp16": 198.89, + "fp32": "", "bf16": "" } ], @@ -1773,7 +1773,7 @@ "Precisions": [ { "int4": "", - "int8": 47.39, + "int8": 7.55, "fp16": "", "fp32": "", "bf16": "" @@ -1785,18 +1785,18 @@ } }, { - "Platform": "Intel® Atom® x7425E iGPU-only", - "Model": "efficientdet-d0", - "featured_SKU": false, + "Platform": "Intel® Core™ Ultra 9 processor 288V NPU-only", + "Model": "yolo_v8n", + "featured_SKU": true, "whats_new_model": false, - "PlatformType": "Intel® Atom™, iGPU-only", + "PlatformType": "Intel® Core™, NPU-only", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 40.0, - "fp16": 34.31, + "int8": 398.63, + "fp16": 498.67, "fp32": "", "bf16": "" } @@ -1808,7 +1808,7 @@ "Precisions": [ { "int4": "", - "int8": 34.13, + "int8": 3.14, "fp16": "", "fp32": "", "bf16": "" @@ -1820,19 +1820,19 @@ } }, { - "Platform": "Intel® Atom® x7425E iGPU-only", - "Model": "mobilenet-v2", + "Platform": "Intel® Core™ i5-1235U Processor CPU+iGPU", + "Model": "bert-base-cased", "featured_SKU": false, "whats_new_model": false, - "PlatformType": "Intel® Atom™, iGPU-only", + "PlatformType": "Intel® Core™, CPU+iGPU", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 414.66, - "fp16": 324.8, - "fp32": "", + "int8": 35.35, + "fp16": "", + "fp32": 15.52, "bf16": "" } ], @@ -1843,7 +1843,7 @@ "Precisions": [ { "int4": "", - "int8": 3.49, + "int8": "", "fp16": "", "fp32": "", "bf16": "" @@ -1855,19 +1855,19 @@ } }, { - "Platform": "Intel® Atom® x7425E iGPU-only", - "Model": "resnet-50", + "Platform": "Intel® Core™ i5-1235U Processor CPU+iGPU", + "Model": "efficientdet-d0", "featured_SKU": false, "whats_new_model": false, - "PlatformType": "Intel® Atom™, iGPU-only", + "PlatformType": "Intel® Core™, CPU+iGPU", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 106.34, - "fp16": 64.69, - "fp32": "", + "int8": 47.59, + "fp16": "", + "fp32": 29.85, "bf16": "" } ], @@ -1878,7 +1878,7 @@ "Precisions": [ { "int4": "", - "int8": 10.56, + "int8": "", "fp16": "", "fp32": "", "bf16": "" @@ -1890,19 +1890,19 @@ } }, { - "Platform": "Intel® Atom® x7425E iGPU-only", - "Model": "ssd-resnet34-1200", + "Platform": "Intel® Core™ i5-1235U Processor CPU+iGPU", + "Model": "mask_rcnn_resnet50_atrous_coco", "featured_SKU": false, "whats_new_model": false, - "PlatformType": "Intel® Atom™, iGPU-only", + "PlatformType": "Intel® Core™, CPU+iGPU", "Parameters": { "throughput": { "Precisions": [ { "int4": 
"", - "int8": 2.16, - "fp16": 1.32, - "fp32": "", + "int8": 0.5, + "fp16": "", + "fp32": 0.18, "bf16": "" } ], @@ -1913,7 +1913,7 @@ "Precisions": [ { "int4": "", - "int8": 472.59, + "int8": "", "fp16": "", "fp32": "", "bf16": "" @@ -1925,19 +1925,19 @@ } }, { - "Platform": "Intel® Atom® x7425E iGPU-only", - "Model": "ssd_mobilenet_v1_coco", - "featured_SKU": false, + "Platform": "Intel® Core™ i5-1235U Processor CPU+iGPU", + "Model": "mobilenet-v2", + "featured_SKU": false, "whats_new_model": false, - "PlatformType": "Intel® Atom™, iGPU-only", + "PlatformType": "Intel® Core™, CPU+iGPU", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 211.07, - "fp16": 137.13, - "fp32": "", + "int8": 745.03, + "fp16": "", + "fp32": 322.13, "bf16": "" } ], @@ -1948,7 +1948,7 @@ "Precisions": [ { "int4": "", - "int8": 6.2, + "int8": "", "fp16": "", "fp32": "", "bf16": "" @@ -1960,19 +1960,19 @@ } }, { - "Platform": "Intel® Atom® x7425E iGPU-only", - "Model": "yolo_v8n", + "Platform": "Intel® Core™ i5-1235U Processor CPU+iGPU", + "Model": "resnet-50", "featured_SKU": false, "whats_new_model": false, - "PlatformType": "Intel® Atom™, iGPU-only", + "PlatformType": "Intel® Core™, CPU+iGPU", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 60.92, - "fp16": 44.64, - "fp32": "", + "int8": 165.62, + "fp16": "", + "fp32": 52.3, "bf16": "" } ], @@ -1983,7 +1983,7 @@ "Precisions": [ { "int4": "", - "int8": 18.51, + "int8": "", "fp16": "", "fp32": "", "bf16": "" @@ -1995,19 +1995,19 @@ } }, { - "Platform": "Intel® Celeron® 6305E CPU+iGPU", - "Model": "bert-base-cased", + "Platform": "Intel® Core™ i5-1235U Processor CPU+iGPU", + "Model": "ssd-resnet34-1200", "featured_SKU": false, "whats_new_model": false, - "PlatformType": "Intel® Atom™, CPU+iGPU", + "PlatformType": "Intel® Core™, CPU+iGPU", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 45.34, + "int8": 3.31, "fp16": "", - "fp32": 33.5, + "fp32": 1.08, "bf16": "" } ], @@ -2030,19 +2030,19 @@ } }, { - "Platform": "Intel® Celeron® 6305E CPU+iGPU", - "Model": "efficientdet-d0", + "Platform": "Intel® Core™ i5-1235U Processor CPU+iGPU", + "Model": "ssd_mobilenet_v1_coco", "featured_SKU": false, "whats_new_model": false, - "PlatformType": "Intel® Atom™, CPU+iGPU", + "PlatformType": "Intel® Core™, CPU+iGPU", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 57.78, + "int8": 329.75, "fp16": "", - "fp32": 48.75, + "fp32": 113.37, "bf16": "" } ], @@ -2065,19 +2065,19 @@ } }, { - "Platform": "Intel® Celeron® 6305E CPU+iGPU", - "Model": "mask_rcnn_resnet50_atrous_coco", + "Platform": "Intel® Core™ i5-1235U Processor CPU+iGPU", + "Model": "yolo_v8n", "featured_SKU": false, "whats_new_model": false, - "PlatformType": "Intel® Atom™, CPU+iGPU", + "PlatformType": "Intel® Core™, CPU+iGPU", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 0.56, + "int8": 77.32, "fp16": "", - "fp32": 0.51, + "fp32": 35.51, "bf16": "" } ], @@ -2100,19 +2100,19 @@ } }, { - "Platform": "Intel® Celeron® 6305E CPU+iGPU", - "Model": "mobilenet-v2", + "Platform": "Intel® Core™ i5-1235U Processor CPU-only", + "Model": "bert-base-cased", "featured_SKU": false, "whats_new_model": false, - "PlatformType": "Intel® Atom™, CPU+iGPU", + "PlatformType": "Intel® Core™, CPU-only", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 525.47, + "int8": 31.71, "fp16": "", - "fp32": 392.65, + "fp32": 12.08, "bf16": "" } ], @@ -2123,7 +2123,7 @@ "Precisions": [ { "int4": "", - "int8": "", + 
"int8": 46.54, "fp16": "", "fp32": "", "bf16": "" @@ -2135,19 +2135,19 @@ } }, { - "Platform": "Intel® Celeron® 6305E CPU+iGPU", - "Model": "resnet-50", + "Platform": "Intel® Core™ i5-1235U Processor CPU-only", + "Model": "efficientdet-d0", "featured_SKU": false, "whats_new_model": false, - "PlatformType": "Intel® Atom™, CPU+iGPU", + "PlatformType": "Intel® Core™, CPU-only", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 197.41, + "int8": 43.74, "fp16": "", - "fp32": 115.71, + "fp32": 22.34, "bf16": "" } ], @@ -2158,7 +2158,7 @@ "Precisions": [ { "int4": "", - "int8": "", + "int8": 33.35, "fp16": "", "fp32": "", "bf16": "" @@ -2170,19 +2170,19 @@ } }, { - "Platform": "Intel® Celeron® 6305E CPU+iGPU", - "Model": "ssd-resnet34-1200", + "Platform": "Intel® Core™ i5-1235U Processor CPU-only", + "Model": "mask_rcnn_resnet50_atrous_coco", "featured_SKU": false, "whats_new_model": false, - "PlatformType": "Intel® Atom™, CPU+iGPU", + "PlatformType": "Intel® Core™, CPU-only", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 5.38, + "int8": 0.46, "fp16": "", - "fp32": 2.71, + "fp32": "", "bf16": "" } ], @@ -2193,7 +2193,7 @@ "Precisions": [ { "int4": "", - "int8": "", + "int8": 2376.04, "fp16": "", "fp32": "", "bf16": "" @@ -2205,19 +2205,19 @@ } }, { - "Platform": "Intel® Celeron® 6305E CPU+iGPU", - "Model": "ssd_mobilenet_v1_coco", + "Platform": "Intel® Core™ i5-1235U Processor CPU-only", + "Model": "mobilenet-v2", "featured_SKU": false, "whats_new_model": false, - "PlatformType": "Intel® Atom™, CPU+iGPU", + "PlatformType": "Intel® Core™, CPU-only", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 316.13, + "int8": 758.81, "fp16": "", - "fp32": 194.29, + "fp32": 283.89, "bf16": "" } ], @@ -2228,7 +2228,7 @@ "Precisions": [ { "int4": "", - "int8": "", + "int8": 1.99, "fp16": "", "fp32": "", "bf16": "" @@ -2240,19 +2240,19 @@ } }, { - "Platform": "Intel® Celeron® 6305E CPU+iGPU", - "Model": "yolo11", + "Platform": "Intel® Core™ i5-1235U Processor CPU-only", + "Model": "resnet-50", "featured_SKU": false, - "whats_new_model": "false", - "PlatformType": "Intel® Atom™, CPU+iGPU", + "whats_new_model": false, + "PlatformType": "Intel® Core™, CPU-only", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": "", + "int8": 151.44, "fp16": "", - "fp32": 80.2, + "fp32": 39.19, "bf16": "" } ], @@ -2263,7 +2263,7 @@ "Precisions": [ { "int4": "", - "int8": "", + "int8": 11.4, "fp16": "", "fp32": "", "bf16": "" @@ -2275,19 +2275,19 @@ } }, { - "Platform": "Intel® Celeron® 6305E CPU+iGPU", - "Model": "yolo_v8n", + "Platform": "Intel® Core™ i5-1235U Processor CPU-only", + "Model": "ssd-resnet34-1200", "featured_SKU": false, "whats_new_model": false, - "PlatformType": "Intel® Atom™, CPU+iGPU", + "PlatformType": "Intel® Core™, CPU-only", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 114.67, + "int8": 2.64, "fp16": "", - "fp32": 78.26, + "fp32": 0.75, "bf16": "" } ], @@ -2298,7 +2298,7 @@ "Precisions": [ { "int4": "", - "int8": "", + "int8": 506.82, "fp16": "", "fp32": "", "bf16": "" @@ -2310,19 +2310,19 @@ } }, { - "Platform": "Intel® Celeron® 6305E CPU-only", - "Model": "bert-base-cased", + "Platform": "Intel® Core™ i5-1235U Processor CPU-only", + "Model": "ssd_mobilenet_v1_coco", "featured_SKU": false, "whats_new_model": false, - "PlatformType": "Intel® Atom™, CPU-only", + "PlatformType": "Intel® Core™, CPU-only", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 11.77, + "int8": 
312.97, "fp16": "", - "fp32": 4.32, + "fp32": 94.2, "bf16": "" } ], @@ -2333,7 +2333,7 @@ "Precisions": [ { "int4": "", - "int8": 87.73, + "int8": 4.81, "fp16": "", "fp32": "", "bf16": "" @@ -2345,19 +2345,19 @@ } }, { - "Platform": "Intel® Celeron® 6305E CPU-only", - "Model": "efficientdet-d0", + "Platform": "Intel® Core™ i5-1235U Processor CPU-only", + "Model": "yolo11", "featured_SKU": false, - "whats_new_model": false, - "PlatformType": "Intel® Atom™, CPU-only", + "whats_new_model": true, + "PlatformType": "Intel® Core™, CPU-only", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 18.94, + "int8": "", "fp16": "", - "fp32": 11.49, + "fp32": 31.35, "bf16": "" } ], @@ -2368,7 +2368,7 @@ "Precisions": [ { "int4": "", - "int8": 55.76, + "int8": "", "fp16": "", "fp32": "", "bf16": "" @@ -2380,19 +2380,19 @@ } }, { - "Platform": "Intel® Celeron® 6305E CPU-only", - "Model": "mask_rcnn_resnet50_atrous_coco", + "Platform": "Intel® Core™ i5-1235U Processor CPU-only", + "Model": "yolo_v8n", "featured_SKU": false, "whats_new_model": false, - "PlatformType": "Intel® Atom™, CPU-only", + "PlatformType": "Intel® Core™, CPU-only", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 0.17, + "int8": 67.82, "fp16": "", - "fp32": 0.04, + "fp32": 27.07, "bf16": "" } ], @@ -2403,7 +2403,7 @@ "Precisions": [ { "int4": "", - "int8": 5772.15, + "int8": 20.66, "fp16": "", "fp32": "", "bf16": "" @@ -2415,19 +2415,19 @@ } }, { - "Platform": "Intel® Celeron® 6305E CPU-only", - "Model": "mobilenet-v2", + "Platform": "Intel® Core™ i5-1335U Processor CPU+iGPU", + "Model": "bert-base-cased", "featured_SKU": false, "whats_new_model": false, - "PlatformType": "Intel® Atom™, CPU-only", + "PlatformType": "Intel® Core™, CPU+iGPU", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 301.05, + "int8": 49.73, "fp16": "", - "fp32": 132.91, + "fp32": 26.8, "bf16": "" } ], @@ -2438,7 +2438,7 @@ "Precisions": [ { "int4": "", - "int8": 3.6, + "int8": "", "fp16": "", "fp32": "", "bf16": "" @@ -2450,19 +2450,19 @@ } }, { - "Platform": "Intel® Celeron® 6305E CPU-only", - "Model": "resnet-50", + "Platform": "Intel® Core™ i5-1335U Processor CPU+iGPU", + "Model": "efficientdet-d0", "featured_SKU": false, "whats_new_model": false, - "PlatformType": "Intel® Atom™, CPU-only", + "PlatformType": "Intel® Core™, CPU+iGPU", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 51.66, + "int8": 73.58, "fp16": "", - "fp32": 14.45, + "fp32": 48.37, "bf16": "" } ], @@ -2473,7 +2473,7 @@ "Precisions": [ { "int4": "", - "int8": 19.8, + "int8": "", "fp16": "", "fp32": "", "bf16": "" @@ -2485,19 +2485,19 @@ } }, { - "Platform": "Intel® Celeron® 6305E CPU-only", - "Model": "ssd-resnet34-1200", + "Platform": "Intel® Core™ i5-1335U Processor CPU+iGPU", + "Model": "mask_rcnn_resnet50_atrous_coco", "featured_SKU": false, "whats_new_model": false, - "PlatformType": "Intel® Atom™, CPU-only", + "PlatformType": "Intel® Core™, CPU+iGPU", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 0.89, + "int8": 0.69, "fp16": "", - "fp32": 0.23, + "fp32": 0.3, "bf16": "" } ], @@ -2508,7 +2508,7 @@ "Precisions": [ { "int4": "", - "int8": 1118.71, + "int8": "", "fp16": "", "fp32": "", "bf16": "" @@ -2520,19 +2520,19 @@ } }, { - "Platform": "Intel® Celeron® 6305E CPU-only", - "Model": "ssd_mobilenet_v1_coco", + "Platform": "Intel® Core™ i5-1335U Processor CPU+iGPU", + "Model": "mobilenet-v2", "featured_SKU": false, "whats_new_model": false, - "PlatformType": "Intel® Atom™, 
CPU-only", + "PlatformType": "Intel® Core™, CPU+iGPU", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 115.03, + "int8": 1047.6, "fp16": "", - "fp32": 36.99, + "fp32": 530.76, "bf16": "" } ], @@ -2543,7 +2543,7 @@ "Precisions": [ { "int4": "", - "int8": 9.06, + "int8": "", "fp16": "", "fp32": "", "bf16": "" @@ -2555,19 +2555,19 @@ } }, { - "Platform": "Intel® Celeron® 6305E CPU-only", - "Model": "yolo11", + "Platform": "Intel® Core™ i5-1335U Processor CPU+iGPU", + "Model": "resnet-50", "featured_SKU": false, - "whats_new_model": "false", - "PlatformType": "Intel® Atom™, CPU-only", + "whats_new_model": false, + "PlatformType": "Intel® Core™, CPU+iGPU", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": "", + "int8": 234.13, "fp16": "", - "fp32": 11.94, + "fp32": 87.8, "bf16": "" } ], @@ -2590,19 +2590,19 @@ } }, { - "Platform": "Intel® Celeron® 6305E CPU-only", - "Model": "yolo_v8n", + "Platform": "Intel® Core™ i5-1335U Processor CPU+iGPU", + "Model": "ssd-resnet34-1200", "featured_SKU": false, "whats_new_model": false, - "PlatformType": "Intel® Atom™, CPU-only", + "PlatformType": "Intel® Core™, CPU+iGPU", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 25.97, + "int8": 4.73, "fp16": "", - "fp32": 9.66, + "fp32": 1.74, "bf16": "" } ], @@ -2613,7 +2613,7 @@ "Precisions": [ { "int4": "", - "int8": 40.21, + "int8": "", "fp16": "", "fp32": "", "bf16": "" @@ -2625,19 +2625,19 @@ } }, { - "Platform": "Intel® Celeron® 6305E iGPU-only", - "Model": "bert-base-cased", + "Platform": "Intel® Core™ i5-1335U Processor CPU+iGPU", + "Model": "ssd_mobilenet_v1_coco", "featured_SKU": false, "whats_new_model": false, - "PlatformType": "Intel® Atom™, iGPU-only", + "PlatformType": "Intel® Core™, CPU+iGPU", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 43.69, - "fp16": 33.8, - "fp32": "", + "int8": 466.04, + "fp16": "", + "fp32": 188.24, "bf16": "" } ], @@ -2648,7 +2648,7 @@ "Precisions": [ { "int4": "", - "int8": 26.56, + "int8": "", "fp16": "", "fp32": "", "bf16": "" @@ -2660,19 +2660,19 @@ } }, { - "Platform": "Intel® Celeron® 6305E iGPU-only", - "Model": "efficientdet-d0", + "Platform": "Intel® Core™ i5-1335U Processor CPU+iGPU", + "Model": "yolo11", "featured_SKU": false, - "whats_new_model": false, - "PlatformType": "Intel® Atom™, iGPU-only", + "whats_new_model": true, + "PlatformType": "Intel® Core™, CPU+iGPU", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 73.58, - "fp16": 58.53, - "fp32": "", + "int8": "", + "fp16": "", + "fp32": 65.2, "bf16": "" } ], @@ -2683,7 +2683,7 @@ "Precisions": [ { "int4": "", - "int8": 25.45, + "int8": "", "fp16": "", "fp32": "", "bf16": "" @@ -2695,19 +2695,19 @@ } }, { - "Platform": "Intel® Celeron® 6305E iGPU-only", - "Model": "mask_rcnn_resnet50_atrous_coco", + "Platform": "Intel® Core™ i5-1335U Processor CPU+iGPU", + "Model": "yolo_v8n", "featured_SKU": false, "whats_new_model": false, - "PlatformType": "Intel® Atom™, iGPU-only", + "PlatformType": "Intel® Core™, CPU+iGPU", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 0.48, - "fp16": 0.52, - "fp32": "", + "int8": 124.43, + "fp16": "", + "fp32": 58.02, "bf16": "" } ], @@ -2718,7 +2718,7 @@ "Precisions": [ { "int4": "", - "int8": 2110.65, + "int8": "", "fp16": "", "fp32": "", "bf16": "" @@ -2730,19 +2730,19 @@ } }, { - "Platform": "Intel® Celeron® 6305E iGPU-only", - "Model": "mobilenet-v2", + "Platform": "Intel® Core™ i5-1335U Processor CPU-only", + "Model": "bert-base-cased", 
"featured_SKU": false, "whats_new_model": false, - "PlatformType": "Intel® Atom™, iGPU-only", + "PlatformType": "Intel® Core™, CPU-only", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 671.35, - "fp16": 504.8, - "fp32": "", + "int8": 39.9, + "fp16": "", + "fp32": 15.7, "bf16": "" } ], @@ -2753,7 +2753,7 @@ "Precisions": [ { "int4": "", - "int8": 2.72, + "int8": 40.17, "fp16": "", "fp32": "", "bf16": "" @@ -2765,19 +2765,19 @@ } }, { - "Platform": "Intel® Celeron® 6305E iGPU-only", - "Model": "resnet-50", + "Platform": "Intel® Core™ i5-1335U Processor CPU-only", + "Model": "efficientdet-d0", "featured_SKU": false, "whats_new_model": false, - "PlatformType": "Intel® Atom™, iGPU-only", + "PlatformType": "Intel® Core™, CPU-only", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 203.17, - "fp16": 118.59, - "fp32": "", + "int8": 56.09, + "fp16": "", + "fp32": 35.64, "bf16": "" } ], @@ -2788,7 +2788,7 @@ "Precisions": [ { "int4": "", - "int8": 6.3, + "int8": 28.71, "fp16": "", "fp32": "", "bf16": "" @@ -2800,19 +2800,19 @@ } }, { - "Platform": "Intel® Celeron® 6305E iGPU-only", - "Model": "ssd-resnet34-1200", + "Platform": "Intel® Core™ i5-1335U Processor CPU-only", + "Model": "mask_rcnn_resnet50_atrous_coco", "featured_SKU": false, "whats_new_model": false, - "PlatformType": "Intel® Atom™, iGPU-only", + "PlatformType": "Intel® Core™, CPU-only", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 5.09, - "fp16": 2.78, - "fp32": "", + "int8": 0.57, + "fp16": "", + "fp32": 0.15, "bf16": "" } ], @@ -2823,7 +2823,7 @@ "Precisions": [ { "int4": "", - "int8": 210.41, + "int8": 2072.23, "fp16": "", "fp32": "", "bf16": "" @@ -2835,19 +2835,19 @@ } }, { - "Platform": "Intel® Celeron® 6305E iGPU-only", - "Model": "ssd_mobilenet_v1_coco", + "Platform": "Intel® Core™ i5-1335U Processor CPU-only", + "Model": "mobilenet-v2", "featured_SKU": false, "whats_new_model": false, - "PlatformType": "Intel® Atom™, iGPU-only", + "PlatformType": "Intel® Core™, CPU-only", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 396.07, - "fp16": 221.18, - "fp32": "", + "int8": 949.06, + "fp16": "", + "fp32": 456.56, "bf16": "" } ], @@ -2858,7 +2858,7 @@ "Precisions": [ { "int4": "", - "int8": 4.3, + "int8": 1.75, "fp16": "", "fp32": "", "bf16": "" @@ -2870,19 +2870,19 @@ } }, { - "Platform": "Intel® Celeron® 6305E iGPU-only", - "Model": "yolo_v8n", + "Platform": "Intel® Core™ i5-1335U Processor CPU-only", + "Model": "resnet-50", "featured_SKU": false, "whats_new_model": false, - "PlatformType": "Intel® Atom™, iGPU-only", + "PlatformType": "Intel® Core™, CPU-only", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 121.77, - "fp16": 81.6, - "fp32": "", + "int8": 184.18, + "fp16": "", + "fp32": 52.94, "bf16": "" } ], @@ -2893,7 +2893,7 @@ "Precisions": [ { "int4": "", - "int8": 10.34, + "int8": 9.62, "fp16": "", "fp32": "", "bf16": "" @@ -2905,19 +2905,19 @@ } }, { - "Platform": "Intel® Core™ Ultra 7 processor 155H CPU+iGPU", - "Model": "bert-base-cased", - "featured_SKU": true, + "Platform": "Intel® Core™ i5-1335U Processor CPU-only", + "Model": "ssd-resnet34-1200", + "featured_SKU": false, "whats_new_model": false, - "PlatformType": "Intel® Core™, CPU+iGPU", + "PlatformType": "Intel® Core™, CPU-only", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 243.99, + "int8": 3.14, "fp16": "", - "fp32": 157.96, + "fp32": 0.91, "bf16": "" } ], @@ -2928,7 +2928,7 @@ "Precisions": [ { "int4": "", - "int8": 
"", + "int8": 466.62, "fp16": "", "fp32": "", "bf16": "" @@ -2940,19 +2940,19 @@ } }, { - "Platform": "Intel® Core™ Ultra 7 processor 155H CPU+iGPU", - "Model": "efficientdet-d0", - "featured_SKU": true, + "Platform": "Intel® Core™ i5-1335U Processor CPU-only", + "Model": "ssd_mobilenet_v1_coco", + "featured_SKU": false, "whats_new_model": false, - "PlatformType": "Intel® Core™, CPU+iGPU", + "PlatformType": "Intel® Core™, CPU-only", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 189.52, + "int8": 381.86, "fp16": "", - "fp32": 154.61, + "fp32": 133.9, "bf16": "" } ], @@ -2963,7 +2963,7 @@ "Precisions": [ { "int4": "", - "int8": "", + "int8": 4.17, "fp16": "", "fp32": "", "bf16": "" @@ -2975,19 +2975,19 @@ } }, { - "Platform": "Intel® Core™ Ultra 7 processor 155H CPU+iGPU", - "Model": "mask_rcnn_resnet50_atrous_coco", - "featured_SKU": true, - "whats_new_model": false, - "PlatformType": "Intel® Core™, CPU+iGPU", + "Platform": "Intel® Core™ i5-1335U Processor CPU-only", + "Model": "yolo11", + "featured_SKU": false, + "whats_new_model": true, + "PlatformType": "Intel® Core™, CPU-only", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 2.45, + "int8": "", "fp16": "", - "fp32": 1.19, + "fp32": 43.41, "bf16": "" } ], @@ -3010,19 +3010,19 @@ } }, { - "Platform": "Intel® Core™ Ultra 7 processor 155H CPU+iGPU", - "Model": "mobilenet-v2", - "featured_SKU": true, + "Platform": "Intel® Core™ i5-1335U Processor CPU-only", + "Model": "yolo_v8n", + "featured_SKU": false, "whats_new_model": false, - "PlatformType": "Intel® Core™, CPU+iGPU", + "PlatformType": "Intel® Core™, CPU-only", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 4485.9, + "int8": 91.26, "fp16": "", - "fp32": 2415.8, + "fp32": 36.14, "bf16": "" } ], @@ -3033,7 +3033,7 @@ "Precisions": [ { "int4": "", - "int8": "", + "int8": 18.18, "fp16": "", "fp32": "", "bf16": "" @@ -3045,19 +3045,19 @@ } }, { - "Platform": "Intel® Core™ Ultra 7 processor 155H CPU+iGPU", - "Model": "resnet-50", - "featured_SKU": true, + "Platform": "Intel® Core™ i5-13600K CPU-only", + "Model": "bert-base-cased", + "featured_SKU": false, "whats_new_model": false, - "PlatformType": "Intel® Core™, CPU+iGPU", + "PlatformType": "Intel® Core™, CPU-only", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 1097.16, + "int8": 120.47, "fp16": "", - "fp32": 475.61, + "fp32": 47.12, "bf16": "" } ], @@ -3068,7 +3068,7 @@ "Precisions": [ { "int4": "", - "int8": "", + "int8": 13.36, "fp16": "", "fp32": "", "bf16": "" @@ -3080,19 +3080,19 @@ } }, { - "Platform": "Intel® Core™ Ultra 7 processor 155H CPU+iGPU", - "Model": "ssd-resnet34-1200", - "featured_SKU": true, + "Platform": "Intel® Core™ i5-13600K CPU-only", + "Model": "efficientdet-d0", + "featured_SKU": false, "whats_new_model": false, - "PlatformType": "Intel® Core™, CPU+iGPU", + "PlatformType": "Intel® Core™, CPU-only", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 18.81, + "int8": 149.82, "fp16": "", - "fp32": 9.71, + "fp32": 92.81, "bf16": "" } ], @@ -3103,7 +3103,7 @@ "Precisions": [ { "int4": "", - "int8": "", + "int8": 9.24, "fp16": "", "fp32": "", "bf16": "" @@ -3115,19 +3115,19 @@ } }, { - "Platform": "Intel® Core™ Ultra 7 processor 155H CPU+iGPU", - "Model": "ssd_mobilenet_v1_coco", - "featured_SKU": true, + "Platform": "Intel® Core™ i5-13600K CPU-only", + "Model": "mask_rcnn_resnet50_atrous_coco", + "featured_SKU": false, "whats_new_model": false, - "PlatformType": "Intel® Core™, CPU+iGPU", + 
"PlatformType": "Intel® Core™, CPU-only", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 1120.99, + "int8": 1.61, "fp16": "", - "fp32": 624.14, + "fp32": 0.49, "bf16": "" } ], @@ -3138,7 +3138,7 @@ "Precisions": [ { "int4": "", - "int8": "", + "int8": 736.14, "fp16": "", "fp32": "", "bf16": "" @@ -3150,19 +3150,19 @@ } }, { - "Platform": "Intel® Core™ Ultra 7 processor 155H CPU+iGPU", - "Model": "yolo_v8n", - "featured_SKU": true, + "Platform": "Intel® Core™ i5-13600K CPU-only", + "Model": "mobilenet-v2", + "featured_SKU": false, "whats_new_model": false, - "PlatformType": "Intel® Core™, CPU+iGPU", + "PlatformType": "Intel® Core™, CPU-only", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 374.74, + "int8": 2964.94, "fp16": "", - "fp32": 236.96, + "fp32": 1318.69, "bf16": "" } ], @@ -3173,7 +3173,7 @@ "Precisions": [ { "int4": "", - "int8": "", + "int8": 0.69, "fp16": "", "fp32": "", "bf16": "" @@ -3185,9 +3185,9 @@ } }, { - "Platform": "Intel® Core™ Ultra 7 processor 155H CPU-only", - "Model": "bert-base-cased", - "featured_SKU": true, + "Platform": "Intel® Core™ i5-13600K CPU-only", + "Model": "resnet-50", + "featured_SKU": false, "whats_new_model": false, "PlatformType": "Intel® Core™, CPU-only", "Parameters": { @@ -3195,9 +3195,9 @@ "Precisions": [ { "int4": "", - "int8": 76.15, + "int8": 536.17, "fp16": "", - "fp32": 30.19, + "fp32": 148.8, "bf16": "" } ], @@ -3208,7 +3208,7 @@ "Precisions": [ { "int4": "", - "int8": 25.21, + "int8": 2.82, "fp16": "", "fp32": "", "bf16": "" @@ -3220,9 +3220,9 @@ } }, { - "Platform": "Intel® Core™ Ultra 7 processor 155H CPU-only", - "Model": "efficientdet-d0", - "featured_SKU": true, + "Platform": "Intel® Core™ i5-13600K CPU-only", + "Model": "ssd-resnet34-1200", + "featured_SKU": false, "whats_new_model": false, "PlatformType": "Intel® Core™, CPU-only", "Parameters": { @@ -3230,9 +3230,9 @@ "Precisions": [ { "int4": "", - "int8": 97.68, + "int8": 8.77, "fp16": "", - "fp32": 66.63, + "fp32": 2.46, "bf16": "" } ], @@ -3243,7 +3243,7 @@ "Precisions": [ { "int4": "", - "int8": 22.16, + "int8": 133.51, "fp16": "", "fp32": "", "bf16": "" @@ -3255,9 +3255,9 @@ } }, { - "Platform": "Intel® Core™ Ultra 7 processor 155H CPU-only", - "Model": "mask_rcnn_resnet50_atrous_coco", - "featured_SKU": true, + "Platform": "Intel® Core™ i5-13600K CPU-only", + "Model": "ssd_mobilenet_v1_coco", + "featured_SKU": false, "whats_new_model": false, "PlatformType": "Intel® Core™, CPU-only", "Parameters": { @@ -3265,9 +3265,9 @@ "Precisions": [ { "int4": "", - "int8": 1.2, + "int8": 1076.91, "fp16": "", - "fp32": 0.3, + "fp32": 378.11, "bf16": "" } ], @@ -3278,7 +3278,7 @@ "Precisions": [ { "int4": "", - "int8": 1025.52, + "int8": 1.35, "fp16": "", "fp32": "", "bf16": "" @@ -3290,19 +3290,19 @@ } }, { - "Platform": "Intel® Core™ Ultra 7 processor 155H CPU-only", - "Model": "mobilenet-v2", - "featured_SKU": true, - "whats_new_model": false, + "Platform": "Intel® Core™ i5-13600K CPU-only", + "Model": "yolo11", + "featured_SKU": false, + "whats_new_model": true, "PlatformType": "Intel® Core™, CPU-only", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 1969.75, + "int8": "", "fp16": "", - "fp32": 815.83, + "fp32": 123.4, "bf16": "" } ], @@ -3313,7 +3313,7 @@ "Precisions": [ { "int4": "", - "int8": 1.36, + "int8": "", "fp16": "", "fp32": "", "bf16": "" @@ -3325,9 +3325,9 @@ } }, { - "Platform": "Intel® Core™ Ultra 7 processor 155H CPU-only", - "Model": "resnet-50", - "featured_SKU": true, + "Platform": "Intel® 
Core™ i5-13600K CPU-only", + "Model": "yolo_v8n", + "featured_SKU": false, "whats_new_model": false, "PlatformType": "Intel® Core™, CPU-only", "Parameters": { @@ -3335,9 +3335,9 @@ "Precisions": [ { "int4": "", - "int8": 390.17, + "int8": 266.68, "fp16": "", - "fp32": 94.82, + "fp32": 102.24, "bf16": "" } ], @@ -3348,7 +3348,7 @@ "Precisions": [ { "int4": "", - "int8": 6.23, + "int8": 5.23, "fp16": "", "fp32": "", "bf16": "" @@ -3360,19 +3360,19 @@ } }, { - "Platform": "Intel® Core™ Ultra 7 processor 155H CPU-only", - "Model": "ssd-resnet34-1200", - "featured_SKU": true, + "Platform": "Intel® Core™ i7-1185G7 CPU+iGPU", + "Model": "bert-base-cased", + "featured_SKU": false, "whats_new_model": false, - "PlatformType": "Intel® Core™, CPU-only", + "PlatformType": "Intel® Core™, CPU+iGPU", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 6.38, + "int8": 84.23, "fp16": "", - "fp32": 1.6, + "fp32": 50.76, "bf16": "" } ], @@ -3383,7 +3383,7 @@ "Precisions": [ { "int4": "", - "int8": 209.14, + "int8": "", "fp16": "", "fp32": "", "bf16": "" @@ -3395,19 +3395,19 @@ } }, { - "Platform": "Intel® Core™ Ultra 7 processor 155H CPU-only", - "Model": "ssd_mobilenet_v1_coco", - "featured_SKU": true, + "Platform": "Intel® Core™ i7-1185G7 CPU+iGPU", + "Model": "efficientdet-d0", + "featured_SKU": false, "whats_new_model": false, - "PlatformType": "Intel® Core™, CPU-only", + "PlatformType": "Intel® Core™, CPU+iGPU", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 685.79, + "int8": 99.37, "fp16": "", - "fp32": 242.78, + "fp32": 65.5, "bf16": "" } ], @@ -3418,7 +3418,7 @@ "Precisions": [ { "int4": "", - "int8": 2.71, + "int8": "", "fp16": "", "fp32": "", "bf16": "" @@ -3430,19 +3430,19 @@ } }, { - "Platform": "Intel® Core™ Ultra 7 processor 155H CPU-only", - "Model": "yolo_v8n", - "featured_SKU": true, + "Platform": "Intel® Core™ i7-1185G7 CPU+iGPU", + "Model": "mask_rcnn_resnet50_atrous_coco", + "featured_SKU": false, "whats_new_model": false, - "PlatformType": "Intel® Core™, CPU-only", + "PlatformType": "Intel® Core™, CPU+iGPU", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 166.55, + "int8": 1.15, "fp16": "", - "fp32": 64.31, + "fp32": 0.64, "bf16": "" } ], @@ -3453,7 +3453,7 @@ "Precisions": [ { "int4": "", - "int8": 12.75, + "int8": "", "fp16": "", "fp32": "", "bf16": "" @@ -3465,19 +3465,19 @@ } }, { - "Platform": "Intel® Core™ Ultra 7 processor 155H NPU-only", - "Model": "bert-base-cased", - "featured_SKU": true, + "Platform": "Intel® Core™ i7-1185G7 CPU+iGPU", + "Model": "mobilenet-v2", + "featured_SKU": false, "whats_new_model": false, - "PlatformType": "Intel® Core™, NPU-only", + "PlatformType": "Intel® Core™, CPU+iGPU", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 88.41, - "fp16": 74.04, - "fp32": "", + "int8": 1350.0, + "fp16": "", + "fp32": 680.17, "bf16": "" } ], @@ -3488,7 +3488,7 @@ "Precisions": [ { "int4": "", - "int8": 12.15, + "int8": "", "fp16": "", "fp32": "", "bf16": "" @@ -3500,19 +3500,19 @@ } }, { - "Platform": "Intel® Core™ Ultra 7 processor 155H NPU-only", - "Model": "efficientdet-d0", - "featured_SKU": true, + "Platform": "Intel® Core™ i7-1185G7 CPU+iGPU", + "Model": "resnet-50", + "featured_SKU": false, "whats_new_model": false, - "PlatformType": "Intel® Core™, NPU-only", + "PlatformType": "Intel® Core™, CPU+iGPU", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 37.81, - "fp16": 34.74, - "fp32": "", + "int8": 361.15, + "fp16": "", + "fp32": 162.3, "bf16": "" } ], 
@@ -3523,7 +3523,7 @@ "Precisions": [ { "int4": "", - "int8": 27.47, + "int8": "", "fp16": "", "fp32": "", "bf16": "" @@ -3535,19 +3535,19 @@ } }, { - "Platform": "Intel® Core™ Ultra 7 processor 155H NPU-only", - "Model": "llama-2-7b-chat", - "featured_SKU": true, + "Platform": "Intel® Core™ i7-1185G7 CPU+iGPU", + "Model": "ssd-resnet34-1200", + "featured_SKU": false, "whats_new_model": false, - "PlatformType": "Intel® Core™, NPU-only", + "PlatformType": "Intel® Core™, CPU+iGPU", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 0.27, - "fp16": 2.55, - "fp32": "", + "int8": 8.62, + "fp16": "", + "fp32": 3.78, "bf16": "" } ], @@ -3558,8 +3558,8 @@ "Precisions": [ { "int4": "", - "int8": 3688.24, - "fp16": 390.94, + "int8": "", + "fp16": "", "fp32": "", "bf16": "" } @@ -3570,19 +3570,19 @@ } }, { - "Platform": "Intel® Core™ Ultra 7 processor 155H NPU-only", - "Model": "mobilenet-v2", - "featured_SKU": true, + "Platform": "Intel® Core™ i7-1185G7 CPU+iGPU", + "Model": "ssd_mobilenet_v1_coco", + "featured_SKU": false, "whats_new_model": false, - "PlatformType": "Intel® Core™, NPU-only", + "PlatformType": "Intel® Core™, CPU+iGPU", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 1966.11, - "fp16": 1346.18, - "fp32": "", + "int8": 653.43, + "fp16": "", + "fp32": 290.66, "bf16": "" } ], @@ -3593,7 +3593,7 @@ "Precisions": [ { "int4": "", - "int8": 0.79, + "int8": "", "fp16": "", "fp32": "", "bf16": "" @@ -3605,19 +3605,19 @@ } }, { - "Platform": "Intel® Core™ Ultra 7 processor 155H NPU-only", - "Model": "phi-3-mini-4k-instruct", - "featured_SKU": true, + "Platform": "Intel® Core™ i7-1185G7 CPU+iGPU", + "Model": "yolo11", + "featured_SKU": false, "whats_new_model": true, - "PlatformType": "Intel® Core™, NPU-only", + "PlatformType": "Intel® Core™, CPU+iGPU", "Parameters": { "throughput": { "Precisions": [ { - "int4": 3.7, - "int8": 0.49, - "fp16": 3.91, - "fp32": "", + "int4": "", + "int8": "", + "fp16": "", + "fp32": 106.63, "bf16": "" } ], @@ -3627,9 +3627,9 @@ "latency": { "Precisions": [ { - "int4": 269.82, - "int8": 2003.58, - "fp16": 255.57, + "int4": "", + "int8": "", + "fp16": "", "fp32": "", "bf16": "" } @@ -3640,19 +3640,19 @@ } }, { - "Platform": "Intel® Core™ Ultra 7 processor 155H NPU-only", - "Model": "resnet-50", - "featured_SKU": true, + "Platform": "Intel® Core™ i7-1185G7 CPU+iGPU", + "Model": "yolo_v8n", + "featured_SKU": false, "whats_new_model": false, - "PlatformType": "Intel® Core™, NPU-only", + "PlatformType": "Intel® Core™, CPU+iGPU", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 771.23, - "fp16": 382.83, - "fp32": "", + "int8": 181.26, + "fp16": "", + "fp32": 101.25, "bf16": "" } ], @@ -3663,7 +3663,7 @@ "Precisions": [ { "int4": "", - "int8": 1.58, + "int8": "", "fp16": "", "fp32": "", "bf16": "" @@ -3675,19 +3675,19 @@ } }, { - "Platform": "Intel® Core™ Ultra 7 processor 155H NPU-only", - "Model": "ssd_mobilenet_v1_coco", - "featured_SKU": true, + "Platform": "Intel® Core™ i7-1185G7 CPU-only", + "Model": "bert-base-cased", + "featured_SKU": false, "whats_new_model": false, - "PlatformType": "Intel® Core™, NPU-only", + "PlatformType": "Intel® Core™, CPU-only", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 705.76, - "fp16": 453.35, - "fp32": "", + "int8": 50.33, + "fp16": "", + "fp32": 18.21, "bf16": "" } ], @@ -3698,7 +3698,7 @@ "Precisions": [ { "int4": "", - "int8": 1.67, + "int8": 22.72, "fp16": "", "fp32": "", "bf16": "" @@ -3710,19 +3710,19 @@ } }, { - "Platform": "Intel® 
Core™ Ultra 7 processor 155H NPU-only", - "Model": "yolo_v8n", - "featured_SKU": true, + "Platform": "Intel® Core™ i7-1185G7 CPU-only", + "Model": "efficientdet-d0", + "featured_SKU": false, "whats_new_model": false, - "PlatformType": "Intel® Core™, NPU-only", + "PlatformType": "Intel® Core™, CPU-only", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 126.18, - "fp16": 129.18, - "fp32": "", + "int8": 73.02, + "fp16": "", + "fp32": 41.12, "bf16": "" } ], @@ -3733,7 +3733,7 @@ "Precisions": [ { "int4": "", - "int8": 8.71, + "int8": 14.66, "fp16": "", "fp32": "", "bf16": "" @@ -3745,19 +3745,19 @@ } }, { - "Platform": "Intel® Core™ Ultra 7 processor 155H iGPU-only", - "Model": "bert-base-cased", - "featured_SKU": true, + "Platform": "Intel® Core™ i7-1185G7 CPU-only", + "Model": "mask_rcnn_resnet50_atrous_coco", + "featured_SKU": false, "whats_new_model": false, - "PlatformType": "Intel® Core™, iGPU-only", + "PlatformType": "Intel® Core™, CPU-only", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 164.18, - "fp16": 107.12, - "fp32": "", + "int8": 0.72, + "fp16": "", + "fp32": 0.19, "bf16": "" } ], @@ -3768,7 +3768,7 @@ "Precisions": [ { "int4": "", - "int8": "", + "int8": 1367.22, "fp16": "", "fp32": "", "bf16": "" @@ -3780,19 +3780,19 @@ } }, { - "Platform": "Intel® Core™ Ultra 7 processor 155H iGPU-only", - "Model": "efficientdet-d0", - "featured_SKU": true, + "Platform": "Intel® Core™ i7-1185G7 CPU-only", + "Model": "mobilenet-v2", + "featured_SKU": false, "whats_new_model": false, - "PlatformType": "Intel® Core™, iGPU-only", + "PlatformType": "Intel® Core™, CPU-only", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 195.27, - "fp16": 164.33, - "fp32": "", + "int8": 1279.85, + "fp16": "", + "fp32": 503.42, "bf16": "" } ], @@ -3803,7 +3803,7 @@ "Precisions": [ { "int4": "", - "int8": "", + "int8": 0.96, "fp16": "", "fp32": "", "bf16": "" @@ -3815,19 +3815,19 @@ } }, { - "Platform": "Intel® Core™ Ultra 7 processor 155H iGPU-only", - "Model": "gemma-2-9b", - "featured_SKU": true, - "whats_new_model": true, - "PlatformType": "Intel® Core™, iGPU-only", + "Platform": "Intel® Core™ i7-1185G7 CPU-only", + "Model": "resnet-50", + "featured_SKU": false, + "whats_new_model": false, + "PlatformType": "Intel® Core™, CPU-only", "Parameters": { "throughput": { "Precisions": [ { - "int4": 8.94, - "int8": "", - "fp16": 0.94, - "fp32": "", + "int4": "", + "int8": 224.43, + "fp16": "", + "fp32": 60.83, "bf16": "" } ], @@ -3837,9 +3837,9 @@ "latency": { "Precisions": [ { - "int4": 111.74, - "int8": "", - "fp16": 1056.4, + "int4": "", + "int8": 4.96, + "fp16": "", "fp32": "", "bf16": "" } @@ -3850,19 +3850,19 @@ } }, { - "Platform": "Intel® Core™ Ultra 7 processor 155H iGPU-only", - "Model": "glm-4-9b-chat", - "featured_SKU": true, - "whats_new_model": "false", - "PlatformType": "Intel® Core™, iGPU-only", + "Platform": "Intel® Core™ i7-1185G7 CPU-only", + "Model": "ssd-resnet34-1200", + "featured_SKU": false, + "whats_new_model": false, + "PlatformType": "Intel® Core™, CPU-only", "Parameters": { "throughput": { "Precisions": [ { - "int4": 10.82, - "int8": 6.3, - "fp16": 1.1, - "fp32": "", + "int4": "", + "int8": 3.93, + "fp16": "", + "fp32": 1.01, "bf16": "" } ], @@ -3872,9 +3872,9 @@ "latency": { "Precisions": [ { - "int4": 92.41, - "int8": 158.68, - "fp16": 906.89, + "int4": "", + "int8": 250.5, + "fp16": "", "fp32": "", "bf16": "" } @@ -3885,19 +3885,19 @@ } }, { - "Platform": "Intel® Core™ Ultra 7 processor 155H iGPU-only", - "Model": 
"llama-2-7b-chat", - "featured_SKU": true, + "Platform": "Intel® Core™ i7-1185G7 CPU-only", + "Model": "ssd_mobilenet_v1_coco", + "featured_SKU": false, "whats_new_model": false, - "PlatformType": "Intel® Core™, iGPU-only", + "PlatformType": "Intel® Core™, CPU-only", "Parameters": { "throughput": { "Precisions": [ { - "int4": 14.62, - "int8": 8.53, + "int4": "", + "int8": 492.36, "fp16": "", - "fp32": "", + "fp32": 146.51, "bf16": "" } ], @@ -3907,8 +3907,8 @@ "latency": { "Precisions": [ { - "int4": 68.39, - "int8": 117.1, + "int4": "", + "int8": 2.21, "fp16": "", "fp32": "", "bf16": "" @@ -3920,19 +3920,19 @@ } }, { - "Platform": "Intel® Core™ Ultra 7 processor 155H iGPU-only", - "Model": "llama-3-8b", - "featured_SKU": true, - "whats_new_model": "false", - "PlatformType": "Intel® Core™, iGPU-only", + "Platform": "Intel® Core™ i7-1185G7 CPU-only", + "Model": "yolo11", + "featured_SKU": false, + "whats_new_model": true, + "PlatformType": "Intel® Core™, CPU-only", "Parameters": { "throughput": { "Precisions": [ { - "int4": 14.82, - "int8": 7.84, - "fp16": 4.04, - "fp32": "", + "int4": "", + "int8": "", + "fp16": "", + "fp32": 48.14, "bf16": "" } ], @@ -3942,9 +3942,9 @@ "latency": { "Precisions": [ { - "int4": 67.44, - "int8": 127.51, - "fp16": 247.29, + "int4": "", + "int8": "", + "fp16": "", "fp32": "", "bf16": "" } @@ -3955,19 +3955,19 @@ } }, { - "Platform": "Intel® Core™ Ultra 7 processor 155H iGPU-only", - "Model": "llama-3.2-3b-instruct", - "featured_SKU": true, - "whats_new_model": true, - "PlatformType": "Intel® Core™, iGPU-only", + "Platform": "Intel® Core™ i7-1185G7 CPU-only", + "Model": "yolo_v8n", + "featured_SKU": false, + "whats_new_model": false, + "PlatformType": "Intel® Core™, CPU-only", "Parameters": { "throughput": { "Precisions": [ { - "int4": 26.17, - "int8": 20.38, - "fp16": 10.76, - "fp32": "", + "int4": "", + "int8": 110.17, + "fp16": "", + "fp32": 40.01, "bf16": "" } ], @@ -3977,9 +3977,9 @@ "latency": { "Precisions": [ { - "int4": 38.21, - "int8": 49.06, - "fp16": 92.92, + "int4": "", + "int8": 10.18, + "fp16": "", "fp32": "", "bf16": "" } @@ -3990,19 +3990,19 @@ } }, { - "Platform": "Intel® Core™ Ultra 7 processor 155H iGPU-only", - "Model": "mask_rcnn_resnet50_atrous_coco", - "featured_SKU": true, + "Platform": "Intel® Core™ i7-1185GRE CPU+iGPU", + "Model": "bert-base-cased", + "featured_SKU": false, "whats_new_model": false, - "PlatformType": "Intel® Core™, iGPU-only", + "PlatformType": "Intel® Core™, CPU+iGPU", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 2.35, - "fp16": 1.58, - "fp32": "", + "int8": 51.12, + "fp16": "", + "fp32": 26.48, "bf16": "" } ], @@ -4013,7 +4013,7 @@ "Precisions": [ { "int4": "", - "int8": 421.72, + "int8": "", "fp16": "", "fp32": "", "bf16": "" @@ -4025,19 +4025,19 @@ } }, { - "Platform": "Intel® Core™ Ultra 7 processor 155H iGPU-only", - "Model": "mistral-7b-v0.1", - "featured_SKU": true, + "Platform": "Intel® Core™ i7-1185GRE CPU+iGPU", + "Model": "efficientdet-d0", + "featured_SKU": false, "whats_new_model": false, - "PlatformType": "Intel® Core™, iGPU-only", + "PlatformType": "Intel® Core™, CPU+iGPU", "Parameters": { "throughput": { "Precisions": [ { - "int4": 15.03, - "int8": 8.94, + "int4": "", + "int8": 58.22, "fp16": "", - "fp32": "", + "fp32": 28.25, "bf16": "" } ], @@ -4047,8 +4047,8 @@ "latency": { "Precisions": [ { - "int4": 66.52, - "int8": 111.8, + "int4": "", + "int8": "", "fp16": "", "fp32": "", "bf16": "" @@ -4060,19 +4060,19 @@ } }, { - "Platform": "Intel® Core™ Ultra 7 processor 155H 
iGPU-only", - "Model": "mobilenet-v2", - "featured_SKU": true, + "Platform": "Intel® Core™ i7-1185GRE CPU+iGPU", + "Model": "mask_rcnn_resnet50_atrous_coco", + "featured_SKU": false, "whats_new_model": false, - "PlatformType": "Intel® Core™, iGPU-only", + "PlatformType": "Intel® Core™, CPU+iGPU", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 1293.98, - "fp16": 1371.59, - "fp32": "", + "int8": 0.72, + "fp16": "", + "fp32": 0.32, "bf16": "" } ], @@ -4095,19 +4095,19 @@ } }, { - "Platform": "Intel® Core™ Ultra 7 processor 155H iGPU-only", - "Model": "phi-3-mini-4k-instruct", - "featured_SKU": true, - "whats_new_model": true, - "PlatformType": "Intel® Core™, iGPU-only", + "Platform": "Intel® Core™ i7-1185GRE CPU+iGPU", + "Model": "mobilenet-v2", + "featured_SKU": false, + "whats_new_model": false, + "PlatformType": "Intel® Core™, CPU+iGPU", "Parameters": { "throughput": { "Precisions": [ { - "int4": 23.61, - "int8": 18.01, - "fp16": 9.36, - "fp32": "", + "int4": "", + "int8": 977.87, + "fp16": "", + "fp32": 347.74, "bf16": "" } ], @@ -4117,9 +4117,9 @@ "latency": { "Precisions": [ { - "int4": 42.34, - "int8": 55.51, - "fp16": 106.82, + "int4": "", + "int8": "", + "fp16": "", "fp32": "", "bf16": "" } @@ -4130,19 +4130,19 @@ } }, { - "Platform": "Intel® Core™ Ultra 7 processor 155H iGPU-only", - "Model": "qwen2-7b", - "featured_SKU": true, - "whats_new_model": true, - "PlatformType": "Intel® Core™, iGPU-only", + "Platform": "Intel® Core™ i7-1185GRE CPU+iGPU", + "Model": "resnet-50", + "featured_SKU": false, + "whats_new_model": false, + "PlatformType": "Intel® Core™, CPU+iGPU", "Parameters": { "throughput": { "Precisions": [ { - "int4": 16.68, - "int8": 9.5, + "int4": "", + "int8": 234.93, "fp16": "", - "fp32": "", + "fp32": 87.7, "bf16": "" } ], @@ -4152,8 +4152,8 @@ "latency": { "Precisions": [ { - "int4": 59.95, - "int8": 105.26, + "int4": "", + "int8": "", "fp16": "", "fp32": "", "bf16": "" @@ -4165,19 +4165,19 @@ } }, { - "Platform": "Intel® Core™ Ultra 7 processor 155H iGPU-only", - "Model": "resnet-50", - "featured_SKU": true, + "Platform": "Intel® Core™ i7-1185GRE CPU+iGPU", + "Model": "ssd-resnet34-1200", + "featured_SKU": false, "whats_new_model": false, - "PlatformType": "Intel® Core™, iGPU-only", + "PlatformType": "Intel® Core™, CPU+iGPU", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 563.96, - "fp16": 416.13, - "fp32": "", + "int8": 4.44, + "fp16": "", + "fp32": 1.77, "bf16": "" } ], @@ -4200,19 +4200,19 @@ } }, { - "Platform": "Intel® Core™ Ultra 7 processor 155H iGPU-only", - "Model": "ssd-resnet34-1200", - "featured_SKU": true, + "Platform": "Intel® Core™ i7-1185GRE CPU+iGPU", + "Model": "ssd_mobilenet_v1_coco", + "featured_SKU": false, "whats_new_model": false, - "PlatformType": "Intel® Core™, iGPU-only", + "PlatformType": "Intel® Core™, CPU+iGPU", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 21.26, - "fp16": 12.84, - "fp32": "", + "int8": 457.58, + "fp16": "", + "fp32": 163.76, "bf16": "" } ], @@ -4223,7 +4223,7 @@ "Precisions": [ { "int4": "", - "int8": 47.61, + "int8": "", "fp16": "", "fp32": "", "bf16": "" @@ -4235,19 +4235,19 @@ } }, { - "Platform": "Intel® Core™ Ultra 7 processor 155H iGPU-only", - "Model": "ssd_mobilenet_v1_coco", - "featured_SKU": true, - "whats_new_model": false, - "PlatformType": "Intel® Core™, iGPU-only", + "Platform": "Intel® Core™ i7-1185GRE CPU+iGPU", + "Model": "yolo11", + "featured_SKU": false, + "whats_new_model": true, + "PlatformType": "Intel® Core™, CPU+iGPU", 
"Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 1030.66, - "fp16": 811.13, - "fp32": "", + "int8": "", + "fp16": "", + "fp32": 55.7, "bf16": "" } ], @@ -4270,19 +4270,19 @@ } }, { - "Platform": "Intel® Core™ Ultra 7 processor 155H iGPU-only", + "Platform": "Intel® Core™ i7-1185GRE CPU+iGPU", "Model": "yolo_v8n", - "featured_SKU": true, + "featured_SKU": false, "whats_new_model": false, - "PlatformType": "Intel® Core™, iGPU-only", + "PlatformType": "Intel® Core™, CPU+iGPU", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 403.44, - "fp16": 306.22, - "fp32": "", + "int8": 103.01, + "fp16": "", + "fp32": 53.55, "bf16": "" } ], @@ -4305,19 +4305,19 @@ } }, { - "Platform": "Intel® Core™ Ultra 7 processor 268V CPU+iGPU", + "Platform": "Intel® Core™ i7-1185GRE CPU-only", "Model": "bert-base-cased", - "featured_SKU": true, + "featured_SKU": false, "whats_new_model": false, - "PlatformType": "Intel® Core™, CPU+iGPU", + "PlatformType": "Intel® Core™, CPU-only", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 223.99, + "int8": 38.29, "fp16": "", - "fp32": 189.97, + "fp32": 13.8, "bf16": "" } ], @@ -4328,7 +4328,7 @@ "Precisions": [ { "int4": "", - "int8": "", + "int8": 28.27, "fp16": "", "fp32": "", "bf16": "" @@ -4340,19 +4340,19 @@ } }, { - "Platform": "Intel® Core™ Ultra 7 processor 268V CPU+iGPU", + "Platform": "Intel® Core™ i7-1185GRE CPU-only", "Model": "efficientdet-d0", - "featured_SKU": true, + "featured_SKU": false, "whats_new_model": false, - "PlatformType": "Intel® Core™, CPU+iGPU", + "PlatformType": "Intel® Core™, CPU-only", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 174.87, + "int8": 54.05, "fp16": "", - "fp32": 149.3, + "fp32": 22.12, "bf16": "" } ], @@ -4363,7 +4363,7 @@ "Precisions": [ { "int4": "", - "int8": "", + "int8": 20.1, "fp16": "", "fp32": "", "bf16": "" @@ -4375,19 +4375,19 @@ } }, { - "Platform": "Intel® Core™ Ultra 7 processor 268V CPU+iGPU", + "Platform": "Intel® Core™ i7-1185GRE CPU-only", "Model": "mask_rcnn_resnet50_atrous_coco", - "featured_SKU": true, + "featured_SKU": false, "whats_new_model": false, - "PlatformType": "Intel® Core™, CPU+iGPU", + "PlatformType": "Intel® Core™, CPU-only", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 7.24, + "int8": 0.52, "fp16": "", - "fp32": 3.52, + "fp32": 0.13, "bf16": "" } ], @@ -4398,7 +4398,7 @@ "Precisions": [ { "int4": "", - "int8": "", + "int8": 1784.45, "fp16": "", "fp32": "", "bf16": "" @@ -4410,19 +4410,19 @@ } }, { - "Platform": "Intel® Core™ Ultra 7 processor 268V CPU+iGPU", + "Platform": "Intel® Core™ i7-1185GRE CPU-only", "Model": "mobilenet-v2", - "featured_SKU": true, + "featured_SKU": false, "whats_new_model": false, - "PlatformType": "Intel® Core™, CPU+iGPU", + "PlatformType": "Intel® Core™, CPU-only", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 4846.91, + "int8": 969.46, "fp16": "", - "fp32": 2888.98, + "fp32": 314.77, "bf16": "" } ], @@ -4433,7 +4433,7 @@ "Precisions": [ { "int4": "", - "int8": "", + "int8": 1.2, "fp16": "", "fp32": "", "bf16": "" @@ -4445,19 +4445,19 @@ } }, { - "Platform": "Intel® Core™ Ultra 7 processor 268V CPU+iGPU", + "Platform": "Intel® Core™ i7-1185GRE CPU-only", "Model": "resnet-50", - "featured_SKU": true, + "featured_SKU": false, "whats_new_model": false, - "PlatformType": "Intel® Core™, CPU+iGPU", + "PlatformType": "Intel® Core™, CPU-only", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 1975.45, + "int8": 
174.87, "fp16": "", - "fp32": 922.35, + "fp32": 45.34, "bf16": "" } ], @@ -4468,7 +4468,7 @@ "Precisions": [ { "int4": "", - "int8": "", + "int8": 6.4, "fp16": "", "fp32": "", "bf16": "" @@ -4480,19 +4480,19 @@ } }, { - "Platform": "Intel® Core™ Ultra 7 processor 268V CPU+iGPU", + "Platform": "Intel® Core™ i7-1185GRE CPU-only", "Model": "ssd-resnet34-1200", - "featured_SKU": true, + "featured_SKU": false, "whats_new_model": false, - "PlatformType": "Intel® Core™, CPU+iGPU", + "PlatformType": "Intel® Core™, CPU-only", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": "", + "int8": 2.63, "fp16": "", - "fp32": 20.97, + "fp32": "", "bf16": "" } ], @@ -4503,7 +4503,7 @@ "Precisions": [ { "int4": "", - "int8": "", + "int8": 320.71, "fp16": "", "fp32": "", "bf16": "" @@ -4515,19 +4515,19 @@ } }, { - "Platform": "Intel® Core™ Ultra 7 processor 268V CPU+iGPU", + "Platform": "Intel® Core™ i7-1185GRE CPU-only", "Model": "ssd_mobilenet_v1_coco", - "featured_SKU": true, + "featured_SKU": false, "whats_new_model": false, - "PlatformType": "Intel® Core™, CPU+iGPU", + "PlatformType": "Intel® Core™, CPU-only", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": "", + "int8": 385.99, "fp16": "", - "fp32": 585.46, + "fp32": 100.06, "bf16": "" } ], @@ -4538,7 +4538,7 @@ "Precisions": [ { "int4": "", - "int8": "", + "int8": 2.81, "fp16": "", "fp32": "", "bf16": "" @@ -4550,19 +4550,19 @@ } }, { - "Platform": "Intel® Core™ Ultra 7 processor 268V CPU+iGPU", - "Model": "yolo_v8n", - "featured_SKU": true, - "whats_new_model": false, - "PlatformType": "Intel® Core™, CPU+iGPU", + "Platform": "Intel® Core™ i7-1185GRE CPU-only", + "Model": "yolo11", + "featured_SKU": false, + "whats_new_model": true, + "PlatformType": "Intel® Core™, CPU-only", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 343.07, + "int8": "", "fp16": "", - "fp32": 274.85, + "fp32": 31.93, "bf16": "" } ], @@ -4585,9 +4585,9 @@ } }, { - "Platform": "Intel® Core™ Ultra 7 processor 268V CPU-only", - "Model": "bert-base-cased", - "featured_SKU": true, + "Platform": "Intel® Core™ i7-1185GRE CPU-only", + "Model": "yolo_v8n", + "featured_SKU": false, "whats_new_model": false, "PlatformType": "Intel® Core™, CPU-only", "Parameters": { @@ -4595,9 +4595,9 @@ "Precisions": [ { "int4": "", - "int8": 44.06, + "int8": 77.3, "fp16": "", - "fp32": 16.03, + "fp32": 27.78, "bf16": "" } ], @@ -4608,7 +4608,7 @@ "Precisions": [ { "int4": "", - "int8": 41.27, + "int8": 13.2, "fp16": "", "fp32": "", "bf16": "" @@ -4620,19 +4620,19 @@ } }, { - "Platform": "Intel® Core™ Ultra 7 processor 268V CPU-only", - "Model": "efficientdet-d0", - "featured_SKU": true, + "Platform": "Intel® Core™ i7-12700H CPU+iGPU", + "Model": "bert-base-cased", + "featured_SKU": false, "whats_new_model": false, - "PlatformType": "Intel® Core™, CPU-only", + "PlatformType": "Intel® Core™, CPU+iGPU", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 53.32, + "int8": 113.15, "fp16": "", - "fp32": 38.06, + "fp32": 57.03, "bf16": "" } ], @@ -4643,7 +4643,7 @@ "Precisions": [ { "int4": "", - "int8": 28.44, + "int8": "", "fp16": "", "fp32": "", "bf16": "" @@ -4655,19 +4655,19 @@ } }, { - "Platform": "Intel® Core™ Ultra 7 processor 268V CPU-only", - "Model": "mask_rcnn_resnet50_atrous_coco", - "featured_SKU": true, + "Platform": "Intel® Core™ i7-12700H CPU+iGPU", + "Model": "efficientdet-d0", + "featured_SKU": false, "whats_new_model": false, - "PlatformType": "Intel® Core™, CPU-only", + "PlatformType": "Intel® Core™, 
CPU+iGPU", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 0.65, + "int8": 141.82, "fp16": "", - "fp32": 0.16, + "fp32": 75.09, "bf16": "" } ], @@ -4678,7 +4678,7 @@ "Precisions": [ { "int4": "", - "int8": 2598.78, + "int8": "", "fp16": "", "fp32": "", "bf16": "" @@ -4690,19 +4690,19 @@ } }, { - "Platform": "Intel® Core™ Ultra 7 processor 268V CPU-only", - "Model": "mobilenet-v2", - "featured_SKU": true, + "Platform": "Intel® Core™ i7-12700H CPU+iGPU", + "Model": "mask_rcnn_resnet50_atrous_coco", + "featured_SKU": false, "whats_new_model": false, - "PlatformType": "Intel® Core™, CPU-only", + "PlatformType": "Intel® Core™, CPU+iGPU", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 917.84, + "int8": 1.62, "fp16": "", - "fp32": 490.87, + "fp32": 0.71, "bf16": "" } ], @@ -4713,7 +4713,7 @@ "Precisions": [ { "int4": "", - "int8": 2.07, + "int8": "", "fp16": "", "fp32": "", "bf16": "" @@ -4725,19 +4725,19 @@ } }, { - "Platform": "Intel® Core™ Ultra 7 processor 268V CPU-only", - "Model": "resnet-50", - "featured_SKU": true, + "Platform": "Intel® Core™ i7-12700H CPU+iGPU", + "Model": "mobilenet-v2", + "featured_SKU": false, "whats_new_model": false, - "PlatformType": "Intel® Core™, CPU-only", + "PlatformType": "Intel® Core™, CPU+iGPU", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 194.09, + "int8": 2306.82, "fp16": "", - "fp32": 52.09, + "fp32": 1147.83, "bf16": "" } ], @@ -4748,7 +4748,7 @@ "Precisions": [ { "int4": "", - "int8": 9.58, + "int8": "", "fp16": "", "fp32": "", "bf16": "" @@ -4760,19 +4760,19 @@ } }, { - "Platform": "Intel® Core™ Ultra 7 processor 268V CPU-only", - "Model": "ssd-resnet34-1200", - "featured_SKU": true, + "Platform": "Intel® Core™ i7-12700H CPU+iGPU", + "Model": "resnet-50", + "featured_SKU": false, "whats_new_model": false, - "PlatformType": "Intel® Core™, CPU-only", + "PlatformType": "Intel® Core™, CPU+iGPU", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 3.52, + "int8": 528.4, "fp16": "", - "fp32": 0.87, + "fp32": 183.43, "bf16": "" } ], @@ -4783,7 +4783,7 @@ "Precisions": [ { "int4": "", - "int8": 493.86, + "int8": "", "fp16": "", "fp32": "", "bf16": "" @@ -4795,19 +4795,19 @@ } }, { - "Platform": "Intel® Core™ Ultra 7 processor 268V CPU-only", - "Model": "ssd_mobilenet_v1_coco", - "featured_SKU": true, + "Platform": "Intel® Core™ i7-12700H CPU+iGPU", + "Model": "ssd-resnet34-1200", + "featured_SKU": false, "whats_new_model": false, - "PlatformType": "Intel® Core™, CPU-only", + "PlatformType": "Intel® Core™, CPU+iGPU", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 380.37, + "int8": 10.4, "fp16": "", - "fp32": 135.96, + "fp32": 3.75, "bf16": "" } ], @@ -4818,7 +4818,7 @@ "Precisions": [ { "int4": "", - "int8": 4.64, + "int8": "", "fp16": "", "fp32": "", "bf16": "" @@ -4830,19 +4830,19 @@ } }, { - "Platform": "Intel® Core™ Ultra 7 processor 268V CPU-only", - "Model": "yolo_v8n", - "featured_SKU": true, + "Platform": "Intel® Core™ i7-12700H CPU+iGPU", + "Model": "ssd_mobilenet_v1_coco", + "featured_SKU": false, "whats_new_model": false, - "PlatformType": "Intel® Core™, CPU-only", + "PlatformType": "Intel® Core™, CPU+iGPU", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 80.52, + "int8": 1014.24, "fp16": "", - "fp32": 34.88, + "fp32": 400.36, "bf16": "" } ], @@ -4853,7 +4853,7 @@ "Precisions": [ { "int4": "", - "int8": 20.34, + "int8": "", "fp16": "", "fp32": "", "bf16": "" @@ -4865,19 +4865,19 @@ } }, { - "Platform": 
"Intel® Core™ Ultra 7 processor 268V NPU-only", - "Model": "bert-base-cased", - "featured_SKU": true, - "whats_new_model": false, - "PlatformType": "Intel® Core™, NPU-only", + "Platform": "Intel® Core™ i7-12700H CPU+iGPU", + "Model": "yolo11", + "featured_SKU": false, + "whats_new_model": true, + "PlatformType": "Intel® Core™, CPU+iGPU", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 265.97, - "fp16": 198.16, - "fp32": "", + "int8": "", + "fp16": "", + "fp32": 131.95, "bf16": "" } ], @@ -4888,7 +4888,7 @@ "Precisions": [ { "int4": "", - "int8": 5.25, + "int8": "", "fp16": "", "fp32": "", "bf16": "" @@ -4900,19 +4900,19 @@ } }, { - "Platform": "Intel® Core™ Ultra 7 processor 268V NPU-only", - "Model": "efficientdet-d0", - "featured_SKU": true, + "Platform": "Intel® Core™ i7-12700H CPU+iGPU", + "Model": "yolo_v8n", + "featured_SKU": false, "whats_new_model": false, - "PlatformType": "Intel® Core™, NPU-only", + "PlatformType": "Intel® Core™, CPU+iGPU", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 13.69, - "fp16": 13.65, - "fp32": "", + "int8": 266.22, + "fp16": "", + "fp32": 120.83, "bf16": "" } ], @@ -4923,7 +4923,7 @@ "Precisions": [ { "int4": "", - "int8": 119.56, + "int8": "", "fp16": "", "fp32": "", "bf16": "" @@ -4935,19 +4935,19 @@ } }, { - "Platform": "Intel® Core™ Ultra 7 processor 268V NPU-only", - "Model": "llama-2-7b-chat", - "featured_SKU": true, + "Platform": "Intel® Core™ i7-12700H CPU-only", + "Model": "bert-base-cased", + "featured_SKU": false, "whats_new_model": false, - "PlatformType": "Intel® Core™, NPU-only", + "PlatformType": "Intel® Core™, CPU-only", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 0.24, - "fp16": 4.4, - "fp32": "", + "int8": 87.91, + "fp16": "", + "fp32": 34.69, "bf16": "" } ], @@ -4958,8 +4958,8 @@ "Precisions": [ { "int4": "", - "int8": 4094.9, - "fp16": 226.87, + "int8": 16.29, + "fp16": "", "fp32": "", "bf16": "" } @@ -4970,19 +4970,19 @@ } }, { - "Platform": "Intel® Core™ Ultra 7 processor 268V NPU-only", - "Model": "mobilenet-v2", - "featured_SKU": true, + "Platform": "Intel® Core™ i7-12700H CPU-only", + "Model": "efficientdet-d0", + "featured_SKU": false, "whats_new_model": false, - "PlatformType": "Intel® Core™, NPU-only", + "PlatformType": "Intel® Core™, CPU-only", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 3799.36, - "fp16": 3178.95, - "fp32": "", + "int8": 115.7, + "fp16": "", + "fp32": 61.66, "bf16": "" } ], @@ -4993,7 +4993,7 @@ "Precisions": [ { "int4": "", - "int8": 0.46, + "int8": 11.66, "fp16": "", "fp32": "", "bf16": "" @@ -5005,19 +5005,19 @@ } }, { - "Platform": "Intel® Core™ Ultra 7 processor 268V NPU-only", - "Model": "phi-3-mini-4k-instruct", - "featured_SKU": true, - "whats_new_model": true, - "PlatformType": "Intel® Core™, NPU-only", + "Platform": "Intel® Core™ i7-12700H CPU-only", + "Model": "mask_rcnn_resnet50_atrous_coco", + "featured_SKU": false, + "whats_new_model": false, + "PlatformType": "Intel® Core™, CPU-only", "Parameters": { "throughput": { "Precisions": [ { - "int4": 3.31, - "int8": 0.72, - "fp16": 6.86, - "fp32": "", + "int4": "", + "int8": 1.27, + "fp16": "", + "fp32": 0.36, "bf16": "" } ], @@ -5027,9 +5027,9 @@ "latency": { "Precisions": [ { - "int4": 301.49, - "int8": 1378.29, - "fp16": 145.76, + "int4": "", + "int8": 890.38, + "fp16": "", "fp32": "", "bf16": "" } @@ -5040,19 +5040,19 @@ } }, { - "Platform": "Intel® Core™ Ultra 7 processor 268V NPU-only", - "Model": "resnet-50", - "featured_SKU": true, + 
"Platform": "Intel® Core™ i7-12700H CPU-only", + "Model": "mobilenet-v2", + "featured_SKU": false, "whats_new_model": false, - "PlatformType": "Intel® Core™, NPU-only", + "PlatformType": "Intel® Core™, CPU-only", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 2161.26, - "fp16": 948.32, - "fp32": "", + "int8": 1984.09, + "fp16": "", + "fp32": 970.61, "bf16": "" } ], @@ -5063,7 +5063,7 @@ "Precisions": [ { "int4": "", - "int8": 0.79, + "int8": "", "fp16": "", "fp32": "", "bf16": "" @@ -5075,19 +5075,19 @@ } }, { - "Platform": "Intel® Core™ Ultra 7 processor 268V NPU-only", - "Model": "ssd_mobilenet_v1_coco", - "featured_SKU": true, + "Platform": "Intel® Core™ i7-12700H CPU-only", + "Model": "resnet-50", + "featured_SKU": false, "whats_new_model": false, - "PlatformType": "Intel® Core™, NPU-only", + "PlatformType": "Intel® Core™, CPU-only", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 230.18, - "fp16": 192.78, - "fp32": "", + "int8": 429.37, + "fp16": "", + "fp32": 109.95, "bf16": "" } ], @@ -5098,7 +5098,7 @@ "Precisions": [ { "int4": "", - "int8": 8.29, + "int8": 3.48, "fp16": "", "fp32": "", "bf16": "" @@ -5110,19 +5110,19 @@ } }, { - "Platform": "Intel® Core™ Ultra 7 processor 268V NPU-only", - "Model": "yolo_v8n", - "featured_SKU": true, + "Platform": "Intel® Core™ i7-12700H CPU-only", + "Model": "ssd-resnet34-1200", + "featured_SKU": false, "whats_new_model": false, - "PlatformType": "Intel® Core™, NPU-only", + "PlatformType": "Intel® Core™, CPU-only", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 401.12, - "fp16": 497.56, - "fp32": "", + "int8": 7.12, + "fp16": "", + "fp32": 1.96, "bf16": "" } ], @@ -5133,7 +5133,7 @@ "Precisions": [ { "int4": "", - "int8": 3.97, + "int8": 158.86, "fp16": "", "fp32": "", "bf16": "" @@ -5145,19 +5145,19 @@ } }, { - "Platform": "Intel® Core™ Ultra 7 processor 268V iGPU-only", - "Model": "bert-base-cased", - "featured_SKU": true, + "Platform": "Intel® Core™ i7-12700H CPU-only", + "Model": "ssd_mobilenet_v1_coco", + "featured_SKU": false, "whats_new_model": false, - "PlatformType": "Intel® Core™, iGPU-only", + "PlatformType": "Intel® Core™, CPU-only", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 225.83, - "fp16": 298.39, - "fp32": "", + "int8": 850.51, + "fp16": "", + "fp32": 290.18, "bf16": "" } ], @@ -5168,7 +5168,7 @@ "Precisions": [ { "int4": "", - "int8": 3.93, + "int8": 1.71, "fp16": "", "fp32": "", "bf16": "" @@ -5180,19 +5180,19 @@ } }, { - "Platform": "Intel® Core™ Ultra 7 processor 268V iGPU-only", - "Model": "efficientdet-d0", - "featured_SKU": true, - "whats_new_model": false, - "PlatformType": "Intel® Core™, iGPU-only", + "Platform": "Intel® Core™ i7-12700H CPU-only", + "Model": "yolo11", + "featured_SKU": false, + "whats_new_model": true, + "PlatformType": "Intel® Core™, CPU-only", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 114.57, - "fp16": 121.87, - "fp32": "", + "int8": "", + "fp16": "", + "fp32": 90.98, "bf16": "" } ], @@ -5203,7 +5203,7 @@ "Precisions": [ { "int4": "", - "int8": 10.22, + "int8": "", "fp16": "", "fp32": "", "bf16": "" @@ -5215,19 +5215,19 @@ } }, { - "Platform": "Intel® Core™ Ultra 7 processor 268V iGPU-only", - "Model": "gemma-2-9b", - "featured_SKU": true, - "whats_new_model": true, - "PlatformType": "Intel® Core™, iGPU-only", + "Platform": "Intel® Core™ i7-12700H CPU-only", + "Model": "yolo_v8n", + "featured_SKU": false, + "whats_new_model": false, + "PlatformType": "Intel® Core™, 
CPU-only", "Parameters": { "throughput": { "Precisions": [ { - "int4": 14.49, - "int8": 8.34, - "fp16": 0.59, - "fp32": "", + "int4": "", + "int8": 206.01, + "fp16": "", + "fp32": 77.63, "bf16": "" } ], @@ -5237,9 +5237,9 @@ "latency": { "Precisions": [ { - "int4": 68.99, - "int8": 119.77, - "fp16": 1691.52, + "int4": "", + "int8": 6.48, + "fp16": "", "fp32": "", "bf16": "" } @@ -5250,19 +5250,19 @@ } }, { - "Platform": "Intel® Core™ Ultra 7 processor 268V iGPU-only", - "Model": "glm-4-9b-chat", - "featured_SKU": true, - "whats_new_model": "false", - "PlatformType": "Intel® Core™, iGPU-only", + "Platform": "Intel® Core™ i7-1355U Processor CPU+iGPU", + "Model": "bert-base-cased", + "featured_SKU": false, + "whats_new_model": false, + "PlatformType": "Intel® Core™, CPU+iGPU", "Parameters": { "throughput": { "Precisions": [ { - "int4": 17.63, - "int8": 9.8, - "fp16": 0.71, - "fp32": "", + "int4": "", + "int8": 60.89, + "fp16": "", + "fp32": 32.08, "bf16": "" } ], @@ -5272,9 +5272,9 @@ "latency": { "Precisions": [ { - "int4": 56.72, - "int8": 102.04, - "fp16": 1402.74, + "int4": "", + "int8": "", + "fp16": "", "fp32": "", "bf16": "" } @@ -5285,19 +5285,19 @@ } }, { - "Platform": "Intel® Core™ Ultra 7 processor 268V iGPU-only", - "Model": "llama-2-7b-chat", - "featured_SKU": true, + "Platform": "Intel® Core™ i7-1355U Processor CPU+iGPU", + "Model": "efficientdet-d0", + "featured_SKU": false, "whats_new_model": false, - "PlatformType": "Intel® Core™, iGPU-only", + "PlatformType": "Intel® Core™, CPU+iGPU", "Parameters": { "throughput": { "Precisions": [ { - "int4": 18.48, - "int8": 11.87, - "fp16": 6.44, - "fp32": "", + "int4": "", + "int8": 87.91, + "fp16": "", + "fp32": 58.38, "bf16": "" } ], @@ -5307,9 +5307,9 @@ "latency": { "Precisions": [ { - "int4": 54.09, - "int8": 84.18, - "fp16": 155.17, + "int4": "", + "int8": "", + "fp16": "", "fp32": "", "bf16": "" } @@ -5320,18 +5320,18 @@ } }, { - "Platform": "Intel® Core™ Ultra 7 processor 268V iGPU-only", - "Model": "llama-3-8b", - "featured_SKU": true, - "whats_new_model": "false", - "PlatformType": "Intel® Core™, iGPU-only", + "Platform": "Intel® Core™ i7-1355U Processor CPU+iGPU", + "Model": "mask_rcnn_resnet50_atrous_coco", + "featured_SKU": false, + "whats_new_model": false, + "PlatformType": "Intel® Core™, CPU+iGPU", "Parameters": { "throughput": { "Precisions": [ { - "int4": 20.41, - "int8": 11.07, - "fp16": 5.81, + "int4": "", + "int8": 0.81, + "fp16": "", "fp32": "", "bf16": "" } @@ -5342,9 +5342,9 @@ "latency": { "Precisions": [ { - "int4": 48.98, - "int8": 90.29, - "fp16": 171.98, + "int4": "", + "int8": "", + "fp16": "", "fp32": "", "bf16": "" } @@ -5355,19 +5355,19 @@ } }, { - "Platform": "Intel® Core™ Ultra 7 processor 268V iGPU-only", - "Model": "llama-3.2-3b-instruct", - "featured_SKU": true, - "whats_new_model": true, - "PlatformType": "Intel® Core™, iGPU-only", + "Platform": "Intel® Core™ i7-1355U Processor CPU+iGPU", + "Model": "mobilenet-v2", + "featured_SKU": false, + "whats_new_model": false, + "PlatformType": "Intel® Core™, CPU+iGPU", "Parameters": { "throughput": { "Precisions": [ { - "int4": 36.58, - "int8": 23.94, - "fp16": 12.86, - "fp32": "", + "int4": "", + "int8": 1202.56, + "fp16": "", + "fp32": 636.0, "bf16": "" } ], @@ -5377,9 +5377,9 @@ "latency": { "Precisions": [ { - "int4": 27.33, - "int8": 41.77, - "fp16": 77.71, + "int4": "", + "int8": "", + "fp16": "", "fp32": "", "bf16": "" } @@ -5390,19 +5390,19 @@ } }, { - "Platform": "Intel® Core™ Ultra 7 processor 268V iGPU-only", - "Model": 
"mask_rcnn_resnet50_atrous_coco", - "featured_SKU": true, + "Platform": "Intel® Core™ i7-1355U Processor CPU+iGPU", + "Model": "resnet-50", + "featured_SKU": false, "whats_new_model": false, - "PlatformType": "Intel® Core™, iGPU-only", + "PlatformType": "Intel® Core™, CPU+iGPU", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 10.4, - "fp16": 5.7, - "fp32": "", + "int8": 282.14, + "fp16": "", + "fp32": 108.19, "bf16": "" } ], @@ -5413,7 +5413,7 @@ "Precisions": [ { "int4": "", - "int8": 109.21, + "int8": "", "fp16": "", "fp32": "", "bf16": "" @@ -5425,19 +5425,19 @@ } }, { - "Platform": "Intel® Core™ Ultra 7 processor 268V iGPU-only", - "Model": "mistral-7b-v0.1", - "featured_SKU": true, + "Platform": "Intel® Core™ i7-1355U Processor CPU+iGPU", + "Model": "ssd-resnet34-1200", + "featured_SKU": false, "whats_new_model": false, - "PlatformType": "Intel® Core™, iGPU-only", + "PlatformType": "Intel® Core™, CPU+iGPU", "Parameters": { "throughput": { "Precisions": [ { - "int4": 20.06, - "int8": 11.6, - "fp16": 6.05, - "fp32": "", + "int4": "", + "int8": 5.67, + "fp16": "", + "fp32": 2.14, "bf16": "" } ], @@ -5447,9 +5447,9 @@ "latency": { "Precisions": [ { - "int4": 49.85, - "int8": 86.18, - "fp16": 165.15, + "int4": "", + "int8": "", + "fp16": "", "fp32": "", "bf16": "" } @@ -5460,19 +5460,19 @@ } }, { - "Platform": "Intel® Core™ Ultra 7 processor 268V iGPU-only", - "Model": "mobilenet-v2", - "featured_SKU": true, + "Platform": "Intel® Core™ i7-1355U Processor CPU+iGPU", + "Model": "ssd_mobilenet_v1_coco", + "featured_SKU": false, "whats_new_model": false, - "PlatformType": "Intel® Core™, iGPU-only", + "PlatformType": "Intel® Core™, CPU+iGPU", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 1007.75, - "fp16": 862.8, - "fp32": "", + "int8": 549.37, + "fp16": "", + "fp32": 228.26, "bf16": "" } ], @@ -5483,7 +5483,7 @@ "Precisions": [ { "int4": "", - "int8": 1.2, + "int8": "", "fp16": "", "fp32": "", "bf16": "" @@ -5495,19 +5495,19 @@ } }, { - "Platform": "Intel® Core™ Ultra 7 processor 268V iGPU-only", - "Model": "phi-3-mini-4k-instruct", - "featured_SKU": true, + "Platform": "Intel® Core™ i7-1355U Processor CPU+iGPU", + "Model": "yolo11", + "featured_SKU": false, "whats_new_model": true, - "PlatformType": "Intel® Core™, iGPU-only", + "PlatformType": "Intel® Core™, CPU+iGPU", "Parameters": { "throughput": { "Precisions": [ { - "int4": 31.27, - "int8": 20.55, - "fp16": 11.04, - "fp32": "", + "int4": "", + "int8": "", + "fp16": "", + "fp32": 79.72, "bf16": "" } ], @@ -5517,9 +5517,9 @@ "latency": { "Precisions": [ { - "int4": 31.97, - "int8": 48.66, - "fp16": 90.57, + "int4": "", + "int8": "", + "fp16": "", "fp32": "", "bf16": "" } @@ -5530,19 +5530,19 @@ } }, { - "Platform": "Intel® Core™ Ultra 7 processor 268V iGPU-only", - "Model": "qwen2-7b", - "featured_SKU": true, - "whats_new_model": true, - "PlatformType": "Intel® Core™, iGPU-only", - "Parameters": { - "throughput": { - "Precisions": [ - { - "int4": 20.99, - "int8": 12.69, - "fp16": 6.07, - "fp32": "", + "Platform": "Intel® Core™ i7-1355U Processor CPU+iGPU", + "Model": "yolo_v8n", + "featured_SKU": false, + "whats_new_model": false, + "PlatformType": "Intel® Core™, CPU+iGPU", + "Parameters": { + "throughput": { + "Precisions": [ + { + "int4": "", + "int8": 152.16, + "fp16": "", + "fp32": 71.37, "bf16": "" } ], @@ -5552,9 +5552,9 @@ "latency": { "Precisions": [ { - "int4": 47.64, - "int8": 78.78, - "fp16": 164.54, + "int4": "", + "int8": "", + "fp16": "", "fp32": "", "bf16": "" } @@ -5565,19 
+5565,19 @@ } }, { - "Platform": "Intel® Core™ Ultra 7 processor 268V iGPU-only", - "Model": "resnet-50", - "featured_SKU": true, + "Platform": "Intel® Core™ i7-1355U Processor CPU-only", + "Model": "bert-base-cased", + "featured_SKU": false, "whats_new_model": false, - "PlatformType": "Intel® Core™, iGPU-only", + "PlatformType": "Intel® Core™, CPU-only", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 830.46, - "fp16": 585.38, - "fp32": "", + "int8": 44.38, + "fp16": "", + "fp32": 17.86, "bf16": "" } ], @@ -5588,7 +5588,7 @@ "Precisions": [ { "int4": "", - "int8": 1.23, + "int8": 37.68, "fp16": "", "fp32": "", "bf16": "" @@ -5600,19 +5600,19 @@ } }, { - "Platform": "Intel® Core™ Ultra 7 processor 268V iGPU-only", - "Model": "ssd-resnet34-1200", - "featured_SKU": true, + "Platform": "Intel® Core™ i7-1355U Processor CPU-only", + "Model": "efficientdet-d0", + "featured_SKU": false, "whats_new_model": false, - "PlatformType": "Intel® Core™, iGPU-only", + "PlatformType": "Intel® Core™, CPU-only", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 57.99, - "fp16": 32.18, - "fp32": "", + "int8": 61.43, + "fp16": "", + "fp32": 39.3, "bf16": "" } ], @@ -5623,7 +5623,7 @@ "Precisions": [ { "int4": "", - "int8": 26.21, + "int8": 27.06, "fp16": "", "fp32": "", "bf16": "" @@ -5635,19 +5635,19 @@ } }, { - "Platform": "Intel® Core™ Ultra 7 processor 268V iGPU-only", - "Model": "ssd_mobilenet_v1_coco", - "featured_SKU": true, + "Platform": "Intel® Core™ i7-1355U Processor CPU-only", + "Model": "mask_rcnn_resnet50_atrous_coco", + "featured_SKU": false, "whats_new_model": false, - "PlatformType": "Intel® Core™, iGPU-only", + "PlatformType": "Intel® Core™, CPU-only", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 485.85, - "fp16": 555.71, - "fp32": "", + "int8": 0.63, + "fp16": "", + "fp32": 0.17, "bf16": "" } ], @@ -5658,7 +5658,7 @@ "Precisions": [ { "int4": "", - "int8": 1.75, + "int8": 1970.66, "fp16": "", "fp32": "", "bf16": "" @@ -5670,19 +5670,19 @@ } }, { - "Platform": "Intel® Core™ Ultra 7 processor 268V iGPU-only", - "Model": "yolo_v8n", - "featured_SKU": true, + "Platform": "Intel® Core™ i7-1355U Processor CPU-only", + "Model": "mobilenet-v2", + "featured_SKU": false, "whats_new_model": false, - "PlatformType": "Intel® Core™, iGPU-only", + "PlatformType": "Intel® Core™, CPU-only", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 362.75, - "fp16": 375.06, - "fp32": "", + "int8": 1035.64, + "fp16": "", + "fp32": 515.95, "bf16": "" } ], @@ -5693,7 +5693,7 @@ "Precisions": [ { "int4": "", - "int8": 3.3, + "int8": 1.62, "fp16": "", "fp32": "", "bf16": "" @@ -5705,19 +5705,19 @@ } }, { - "Platform": "Intel® Core™ i5-1235U Processor CPU+iGPU", - "Model": "bert-base-cased", + "Platform": "Intel® Core™ i7-1355U Processor CPU-only", + "Model": "resnet-50", "featured_SKU": false, "whats_new_model": false, - "PlatformType": "Intel® Core™, CPU+iGPU", + "PlatformType": "Intel® Core™, CPU-only", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 34.21, + "int8": 200.99, "fp16": "", - "fp32": 15.71, + "fp32": 58.72, "bf16": "" } ], @@ -5728,7 +5728,7 @@ "Precisions": [ { "int4": "", - "int8": "", + "int8": 9.03, "fp16": "", "fp32": "", "bf16": "" @@ -5740,19 +5740,19 @@ } }, { - "Platform": "Intel® Core™ i5-1235U Processor CPU+iGPU", - "Model": "efficientdet-d0", + "Platform": "Intel® Core™ i7-1355U Processor CPU-only", + "Model": "ssd-resnet34-1200", "featured_SKU": false, "whats_new_model": false, 
- "PlatformType": "Intel® Core™, CPU+iGPU", + "PlatformType": "Intel® Core™, CPU-only", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 47.95, + "int8": 3.43, "fp16": "", - "fp32": 29.38, + "fp32": 1.02, "bf16": "" } ], @@ -5763,7 +5763,7 @@ "Precisions": [ { "int4": "", - "int8": "", + "int8": 445.79, "fp16": "", "fp32": "", "bf16": "" @@ -5775,19 +5775,19 @@ } }, { - "Platform": "Intel® Core™ i5-1235U Processor CPU+iGPU", - "Model": "mask_rcnn_resnet50_atrous_coco", + "Platform": "Intel® Core™ i7-1355U Processor CPU-only", + "Model": "ssd_mobilenet_v1_coco", "featured_SKU": false, "whats_new_model": false, - "PlatformType": "Intel® Core™, CPU+iGPU", + "PlatformType": "Intel® Core™, CPU-only", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 0.5, + "int8": 418.13, "fp16": "", - "fp32": 0.18, + "fp32": 150.89, "bf16": "" } ], @@ -5798,7 +5798,7 @@ "Precisions": [ { "int4": "", - "int8": "", + "int8": 3.89, "fp16": "", "fp32": "", "bf16": "" @@ -5810,19 +5810,19 @@ } }, { - "Platform": "Intel® Core™ i5-1235U Processor CPU+iGPU", - "Model": "mobilenet-v2", + "Platform": "Intel® Core™ i7-1355U Processor CPU-only", + "Model": "yolo11", "featured_SKU": false, - "whats_new_model": false, - "PlatformType": "Intel® Core™, CPU+iGPU", + "whats_new_model": true, + "PlatformType": "Intel® Core™, CPU-only", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 742.67, + "int8": "", "fp16": "", - "fp32": 331.98, + "fp32": 48.48, "bf16": "" } ], @@ -5845,19 +5845,19 @@ } }, { - "Platform": "Intel® Core™ i5-1235U Processor CPU+iGPU", - "Model": "resnet-50", + "Platform": "Intel® Core™ i7-1355U Processor CPU-only", + "Model": "yolo_v8n", "featured_SKU": false, "whats_new_model": false, - "PlatformType": "Intel® Core™, CPU+iGPU", + "PlatformType": "Intel® Core™, CPU-only", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 162.84, + "int8": 100.45, "fp16": "", - "fp32": 51.66, + "fp32": 40.52, "bf16": "" } ], @@ -5868,7 +5868,7 @@ "Precisions": [ { "int4": "", - "int8": "", + "int8": 17.12, "fp16": "", "fp32": "", "bf16": "" @@ -5880,19 +5880,19 @@ } }, { - "Platform": "Intel® Core™ i5-1235U Processor CPU+iGPU", - "Model": "ssd-resnet34-1200", + "Platform": "Intel® Core™ i9-13900K CPU-only", + "Model": "bert-base-cased", "featured_SKU": false, "whats_new_model": false, - "PlatformType": "Intel® Core™, CPU+iGPU", + "PlatformType": "Intel® Core™, CPU-only", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": "", + "int8": 169.91, "fp16": "", - "fp32": 1.03, + "fp32": 67.83, "bf16": "" } ], @@ -5903,7 +5903,7 @@ "Precisions": [ { "int4": "", - "int8": "", + "int8": 10.74, "fp16": "", "fp32": "", "bf16": "" @@ -5915,19 +5915,19 @@ } }, { - "Platform": "Intel® Core™ i5-1235U Processor CPU+iGPU", - "Model": "ssd_mobilenet_v1_coco", + "Platform": "Intel® Core™ i9-13900K CPU-only", + "Model": "efficientdet-d0", "featured_SKU": false, "whats_new_model": false, - "PlatformType": "Intel® Core™, CPU+iGPU", + "PlatformType": "Intel® Core™, CPU-only", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 328.29, + "int8": 222.19, "fp16": "", - "fp32": 115.41, + "fp32": 126.81, "bf16": "" } ], @@ -5938,7 +5938,7 @@ "Precisions": [ { "int4": "", - "int8": "", + "int8": 7.32, "fp16": "", "fp32": "", "bf16": "" @@ -5950,31 +5950,31 @@ } }, { - "Platform": "Intel® Core™ i5-1235U Processor CPU+iGPU", - "Model": "yolo11", + "Platform": "Intel® Core™ i9-13900K CPU-only", + "Model": "gemma-2-9b", 
"featured_SKU": false, - "whats_new_model": "false", - "PlatformType": "Intel® Core™, CPU+iGPU", + "whats_new_model": true, + "PlatformType": "Intel® Core™, CPU-only", "Parameters": { "throughput": { "Precisions": [ { - "int4": "", - "int8": "", - "fp16": "", - "fp32": 41.68, + "int4": 9.57, + "int8": 6.99, + "fp16": 3.59, + "fp32": "", "bf16": "" } ], - "Unit": "FPS", + "Unit": "Tokens per sec", "UnitDesc": "higher is better" }, "latency": { "Precisions": [ { - "int4": "", - "int8": "", - "fp16": "", + "int4": 104.39, + "int8": 143.05, + "fp16": 277.85, "fp32": "", "bf16": "" } @@ -5985,31 +5985,31 @@ } }, { - "Platform": "Intel® Core™ i5-1235U Processor CPU+iGPU", - "Model": "yolo_v8n", + "Platform": "Intel® Core™ i9-13900K CPU-only", + "Model": "glm-4-9b-chat", "featured_SKU": false, - "whats_new_model": false, - "PlatformType": "Intel® Core™, CPU+iGPU", + "whats_new_model": "false", + "PlatformType": "Intel® Core™, CPU-only", "Parameters": { "throughput": { "Precisions": [ { - "int4": "", - "int8": 79.4, - "fp16": "", - "fp32": 35.44, + "int4": 10.66, + "int8": 7.49, + "fp16": 3.82, + "fp32": "", "bf16": "" } ], - "Unit": "FPS", + "Unit": "Tokens per sec", "UnitDesc": "higher is better" }, "latency": { "Precisions": [ { - "int4": "", - "int8": "", - "fp16": "", + "int4": 93.74, + "int8": 133.46, + "fp16": 261.55, "fp32": "", "bf16": "" } @@ -6020,8 +6020,8 @@ } }, { - "Platform": "Intel® Core™ i5-1235U Processor CPU-only", - "Model": "bert-base-cased", + "Platform": "Intel® Core™ i9-13900K CPU-only", + "Model": "llama-2-7b-chat", "featured_SKU": false, "whats_new_model": false, "PlatformType": "Intel® Core™, CPU-only", @@ -6029,22 +6029,22 @@ "throughput": { "Precisions": [ { - "int4": "", - "int8": 31.55, - "fp16": "", - "fp32": 12.38, + "int4": 14.22, + "int8": 9.66, + "fp16": 4.96, + "fp32": "", "bf16": "" } ], - "Unit": "FPS", + "Unit": "Tokens per sec", "UnitDesc": "higher is better" }, "latency": { "Precisions": [ { - "int4": "", - "int8": 46.55, - "fp16": "", + "int4": 70.32, + "int8": 103.45, + "fp16": 201.53, "fp32": "", "bf16": "" } @@ -6055,31 +6055,31 @@ } }, { - "Platform": "Intel® Core™ i5-1235U Processor CPU-only", - "Model": "efficientdet-d0", + "Platform": "Intel® Core™ i9-13900K CPU-only", + "Model": "llama-3-8b", "featured_SKU": false, - "whats_new_model": false, + "whats_new_model": "false", "PlatformType": "Intel® Core™, CPU-only", "Parameters": { "throughput": { "Precisions": [ { - "int4": "", - "int8": 43.39, - "fp16": "", - "fp32": 23.14, + "int4": 12.04, + "int8": 8.7, + "fp16": 4.48, + "fp32": "", "bf16": "" } ], - "Unit": "FPS", + "Unit": "Tokens per sec", "UnitDesc": "higher is better" }, "latency": { "Precisions": [ { - "int4": "", - "int8": 33.09, - "fp16": "", + "int4": 83.05, + "int8": 114.86, + "fp16": 223.06, "fp32": "", "bf16": "" } @@ -6090,31 +6090,31 @@ } }, { - "Platform": "Intel® Core™ i5-1235U Processor CPU-only", - "Model": "mask_rcnn_resnet50_atrous_coco", + "Platform": "Intel® Core™ i9-13900K CPU-only", + "Model": "llama-3.2-3b-instruct", "featured_SKU": false, - "whats_new_model": false, + "whats_new_model": true, "PlatformType": "Intel® Core™, CPU-only", "Parameters": { "throughput": { "Precisions": [ { - "int4": "", - "int8": 0.45, - "fp16": "", - "fp32": 0.12, + "int4": 27.06, + "int8": 19.19, + "fp16": 10.21, + "fp32": "", "bf16": "" } ], - "Unit": "FPS", + "Unit": "Tokens per sec", "UnitDesc": "higher is better" }, "latency": { "Precisions": [ { - "int4": "", - "int8": 2440.72, - "fp16": "", + "int4": 36.95, + "int8": 52.11, + "fp16": 
97.91, "fp32": "", "bf16": "" } @@ -6125,8 +6125,8 @@ } }, { - "Platform": "Intel® Core™ i5-1235U Processor CPU-only", - "Model": "mobilenet-v2", + "Platform": "Intel® Core™ i9-13900K CPU-only", + "Model": "mask_rcnn_resnet50_atrous_coco", "featured_SKU": false, "whats_new_model": false, "PlatformType": "Intel® Core™, CPU-only", @@ -6135,9 +6135,9 @@ "Precisions": [ { "int4": "", - "int8": 789.02, + "int8": 2.49, "fp16": "", - "fp32": "", + "fp32": 0.71, "bf16": "" } ], @@ -6148,7 +6148,7 @@ "Precisions": [ { "int4": "", - "int8": 1.9, + "int8": 563.36, "fp16": "", "fp32": "", "bf16": "" @@ -6160,8 +6160,8 @@ } }, { - "Platform": "Intel® Core™ i5-1235U Processor CPU-only", - "Model": "resnet-50", + "Platform": "Intel® Core™ i9-13900K CPU-only", + "Model": "mistral-7b-v0.1", "featured_SKU": false, "whats_new_model": false, "PlatformType": "Intel® Core™, CPU-only", @@ -6169,22 +6169,22 @@ "throughput": { "Precisions": [ { - "int4": "", - "int8": 147.74, - "fp16": "", - "fp32": 38.84, + "int4": 14.65, + "int8": 9.16, + "fp16": 4.72, + "fp32": "", "bf16": "" } ], - "Unit": "FPS", + "Unit": "Tokens per sec", "UnitDesc": "higher is better" }, "latency": { "Precisions": [ { - "int4": "", - "int8": 11.4, - "fp16": "", + "int4": 68.23, + "int8": 109.06, + "fp16": 211.82, "fp32": "", "bf16": "" } @@ -6195,8 +6195,8 @@ } }, { - "Platform": "Intel® Core™ i5-1235U Processor CPU-only", - "Model": "ssd-resnet34-1200", + "Platform": "Intel® Core™ i9-13900K CPU-only", + "Model": "mobilenet-v2", "featured_SKU": false, "whats_new_model": false, "PlatformType": "Intel® Core™, CPU-only", @@ -6205,9 +6205,9 @@ "Precisions": [ { "int4": "", - "int8": 2.66, + "int8": 4254.4, "fp16": "", - "fp32": 0.77, + "fp32": 2049.21, "bf16": "" } ], @@ -6218,7 +6218,7 @@ "Precisions": [ { "int4": "", - "int8": 511.09, + "int8": 0.6, "fp16": "", "fp32": "", "bf16": "" @@ -6230,31 +6230,31 @@ } }, { - "Platform": "Intel® Core™ i5-1235U Processor CPU-only", - "Model": "ssd_mobilenet_v1_coco", + "Platform": "Intel® Core™ i9-13900K CPU-only", + "Model": "phi-3-mini-4k-instruct", "featured_SKU": false, - "whats_new_model": false, + "whats_new_model": true, "PlatformType": "Intel® Core™, CPU-only", "Parameters": { "throughput": { "Precisions": [ { - "int4": "", - "int8": 313.17, - "fp16": "", - "fp32": 95.81, + "int4": 19.2, + "int8": 16.44, + "fp16": 8.59, + "fp32": "", "bf16": "" } ], - "Unit": "FPS", + "Unit": "Tokens per sec", "UnitDesc": "higher is better" }, "latency": { "Precisions": [ { - "int4": "", - "int8": 4.81, - "fp16": "", + "int4": 52.06, + "int8": 60.81, + "fp16": 116.35, "fp32": "", "bf16": "" } @@ -6265,31 +6265,31 @@ } }, { - "Platform": "Intel® Core™ i5-1235U Processor CPU-only", - "Model": "yolo11", + "Platform": "Intel® Core™ i9-13900K CPU-only", + "Model": "qwen2-7b", "featured_SKU": false, - "whats_new_model": "false", + "whats_new_model": true, "PlatformType": "Intel® Core™, CPU-only", "Parameters": { "throughput": { "Precisions": [ { - "int4": "", - "int8": "", - "fp16": "", - "fp32": 31.84, + "int4": 13.12, + "int8": 9.29, + "fp16": 4.76, + "fp32": "", "bf16": "" } ], - "Unit": "FPS", + "Unit": "Tokens per sec", "UnitDesc": "higher is better" }, "latency": { "Precisions": [ { - "int4": "", - "int8": "", - "fp16": "", + "int4": 76.19, + "int8": 107.6, + "fp16": 209.76, "fp32": "", "bf16": "" } @@ -6300,8 +6300,8 @@ } }, { - "Platform": "Intel® Core™ i5-1235U Processor CPU-only", - "Model": "yolo_v8n", + "Platform": "Intel® Core™ i9-13900K CPU-only", + "Model": "resnet-50", "featured_SKU": false, 
"whats_new_model": false, "PlatformType": "Intel® Core™, CPU-only", @@ -6310,9 +6310,9 @@ "Precisions": [ { "int4": "", - "int8": 67.43, + "int8": 762.92, "fp16": "", - "fp32": 26.68, + "fp32": 233.76, "bf16": "" } ], @@ -6323,7 +6323,7 @@ "Precisions": [ { "int4": "", - "int8": 20.62, + "int8": 2.18, "fp16": "", "fp32": "", "bf16": "" @@ -6335,19 +6335,19 @@ } }, { - "Platform": "Intel® Core™ i5-1235U Processor iGPU-only", - "Model": "bert-base-cased", + "Platform": "Intel® Core™ i9-13900K CPU-only", + "Model": "ssd-resnet34-1200", "featured_SKU": false, "whats_new_model": false, - "PlatformType": "Intel® Core™, iGPU-only", + "PlatformType": "Intel® Core™, CPU-only", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 46.15, - "fp16": 38.3, - "fp32": "", + "int8": 13.03, + "fp16": "", + "fp32": 3.84, "bf16": "" } ], @@ -6358,7 +6358,7 @@ "Precisions": [ { "int4": "", - "int8": 19.82, + "int8": 102.76, "fp16": "", "fp32": "", "bf16": "" @@ -6370,19 +6370,19 @@ } }, { - "Platform": "Intel® Core™ i5-1235U Processor iGPU-only", - "Model": "efficientdet-d0", + "Platform": "Intel® Core™ i9-13900K CPU-only", + "Model": "ssd_mobilenet_v1_coco", "featured_SKU": false, "whats_new_model": false, - "PlatformType": "Intel® Core™, iGPU-only", + "PlatformType": "Intel® Core™, CPU-only", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 64.24, - "fp16": 50.43, - "fp32": "", + "int8": 1607.47, + "fp16": "", + "fp32": 590.49, "bf16": "" } ], @@ -6393,7 +6393,7 @@ "Precisions": [ { "int4": "", - "int8": 20.17, + "int8": 1.1, "fp16": "", "fp32": "", "bf16": "" @@ -6405,19 +6405,19 @@ } }, { - "Platform": "Intel® Core™ i5-1235U Processor iGPU-only", - "Model": "mask_rcnn_resnet50_atrous_coco", + "Platform": "Intel® Core™ i9-13900K CPU-only", + "Model": "yolo11", "featured_SKU": false, - "whats_new_model": false, - "PlatformType": "Intel® Core™, iGPU-only", + "whats_new_model": true, + "PlatformType": "Intel® Core™, CPU-only", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 0.5, - "fp16": 0.51, - "fp32": "", + "int8": "", + "fp16": "", + "fp32": 185.48, "bf16": "" } ], @@ -6428,7 +6428,7 @@ "Precisions": [ { "int4": "", - "int8": 1499.27, + "int8": "", "fp16": "", "fp32": "", "bf16": "" @@ -6440,19 +6440,19 @@ } }, { - "Platform": "Intel® Core™ i5-1235U Processor iGPU-only", - "Model": "mobilenet-v2", + "Platform": "Intel® Core™ i9-13900K CPU-only", + "Model": "yolo_v8n", "featured_SKU": false, "whats_new_model": false, - "PlatformType": "Intel® Core™, iGPU-only", + "PlatformType": "Intel® Core™, CPU-only", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 768.31, - "fp16": 485.7, - "fp32": "", + "int8": 389.46, + "fp16": "", + "fp32": 155.36, "bf16": "" } ], @@ -6463,7 +6463,7 @@ "Precisions": [ { "int4": "", - "int8": 1.7, + "int8": 4.07, "fp16": "", "fp32": "", "bf16": "" @@ -6475,19 +6475,19 @@ } }, { - "Platform": "Intel® Core™ i5-1235U Processor iGPU-only", - "Model": "resnet-50", + "Platform": "Intel® Processor N100 CPU+iGPU", + "Model": "efficientdet-d0", "featured_SKU": false, "whats_new_model": false, - "PlatformType": "Intel® Core™, iGPU-only", + "PlatformType": "Intel® Atom™, CPU+iGPU", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 208.55, - "fp16": 117.84, - "fp32": "", + "int8": 36.73, + "fp16": "", + "fp32": 27.62, "bf16": "" } ], @@ -6498,7 +6498,7 @@ "Precisions": [ { "int4": "", - "int8": 5.0, + "int8": "", "fp16": "", "fp32": "", "bf16": "" @@ -6510,19 +6510,19 @@ } }, { - 
"Platform": "Intel® Core™ i5-1235U Processor iGPU-only", - "Model": "ssd-resnet34-1200", + "Platform": "Intel® Processor N100 CPU+iGPU", + "Model": "mobilenet-v2", "featured_SKU": false, "whats_new_model": false, - "PlatformType": "Intel® Core™, iGPU-only", + "PlatformType": "Intel® Atom™, CPU+iGPU", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 5.64, - "fp16": 2.72, - "fp32": "", + "int8": 486.76, + "fp16": "", + "fp32": 276.51, "bf16": "" } ], @@ -6533,7 +6533,7 @@ "Precisions": [ { "int4": "", - "int8": 172.69, + "int8": "", "fp16": "", "fp32": "", "bf16": "" @@ -6545,19 +6545,19 @@ } }, { - "Platform": "Intel® Core™ i5-1235U Processor iGPU-only", - "Model": "ssd_mobilenet_v1_coco", + "Platform": "Intel® Processor N100 CPU+iGPU", + "Model": "resnet-50", "featured_SKU": false, "whats_new_model": false, - "PlatformType": "Intel® Core™, iGPU-only", + "PlatformType": "Intel® Atom™, CPU+iGPU", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 382.92, - "fp16": 223.39, - "fp32": "", + "int8": 111.75, + "fp16": "", + "fp32": 42.11, "bf16": "" } ], @@ -6568,7 +6568,7 @@ "Precisions": [ { "int4": "", - "int8": 3.11, + "int8": "", "fp16": "", "fp32": "", "bf16": "" @@ -6580,19 +6580,19 @@ } }, { - "Platform": "Intel® Core™ i5-1235U Processor iGPU-only", - "Model": "yolo_v8n", + "Platform": "Intel® Processor N100 CPU+iGPU", + "Model": "ssd-resnet34-1200", "featured_SKU": false, "whats_new_model": false, - "PlatformType": "Intel® Core™, iGPU-only", + "PlatformType": "Intel® Atom™, CPU+iGPU", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 126.83, - "fp16": 77.91, - "fp32": "", + "int8": 2.03, + "fp16": "", + "fp32": 0.59, "bf16": "" } ], @@ -6603,7 +6603,7 @@ "Precisions": [ { "int4": "", - "int8": 8.1, + "int8": "", "fp16": "", "fp32": "", "bf16": "" @@ -6615,19 +6615,19 @@ } }, { - "Platform": "Intel® Core™ i5-1335U Processor CPU+iGPU", - "Model": "bert-base-cased", + "Platform": "Intel® Processor N100 CPU+iGPU", + "Model": "ssd_mobilenet_v1_coco", "featured_SKU": false, "whats_new_model": false, - "PlatformType": "Intel® Core™, CPU+iGPU", + "PlatformType": "Intel® Atom™, CPU+iGPU", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 49.68, + "int8": 216.17, "fp16": "", - "fp32": 26.85, + "fp32": 94.47, "bf16": "" } ], @@ -6650,19 +6650,19 @@ } }, { - "Platform": "Intel® Core™ i5-1335U Processor CPU+iGPU", - "Model": "efficientdet-d0", + "Platform": "Intel® Processor N100 CPU+iGPU", + "Model": "yolo11", "featured_SKU": false, - "whats_new_model": false, - "PlatformType": "Intel® Core™, CPU+iGPU", + "whats_new_model": true, + "PlatformType": "Intel® Atom™, CPU+iGPU", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 73.94, + "int8": "", "fp16": "", - "fp32": 48.63, + "fp32": 34.22, "bf16": "" } ], @@ -6685,19 +6685,19 @@ } }, { - "Platform": "Intel® Core™ i5-1335U Processor CPU+iGPU", - "Model": "mask_rcnn_resnet50_atrous_coco", + "Platform": "Intel® Processor N100 CPU+iGPU", + "Model": "yolo_v8n", "featured_SKU": false, "whats_new_model": false, - "PlatformType": "Intel® Core™, CPU+iGPU", + "PlatformType": "Intel® Atom™, CPU+iGPU", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 0.69, + "int8": 60.9, "fp16": "", - "fp32": 0.3, + "fp32": 28.4, "bf16": "" } ], @@ -6719,20 +6719,20 @@ } } }, - { - "Platform": "Intel® Core™ i5-1335U Processor CPU+iGPU", - "Model": "mobilenet-v2", + { + "Platform": "Intel® Processor N100 iGPU-only", + "Model": "efficientdet-d0", 
"featured_SKU": false, "whats_new_model": false, - "PlatformType": "Intel® Core™, CPU+iGPU", + "PlatformType": "Intel® Atom™, iGPU-only", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 1050.26, - "fp16": "", - "fp32": 535.0, + "int8": 33.68, + "fp16": 30.76, + "fp32": "", "bf16": "" } ], @@ -6743,7 +6743,7 @@ "Precisions": [ { "int4": "", - "int8": "", + "int8": 38.43, "fp16": "", "fp32": "", "bf16": "" @@ -6755,19 +6755,19 @@ } }, { - "Platform": "Intel® Core™ i5-1335U Processor CPU+iGPU", - "Model": "resnet-50", + "Platform": "Intel® Processor N100 iGPU-only", + "Model": "mobilenet-v2", "featured_SKU": false, "whats_new_model": false, - "PlatformType": "Intel® Core™, CPU+iGPU", + "PlatformType": "Intel® Atom™, iGPU-only", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 234.19, - "fp16": "", - "fp32": 87.89, + "int8": 338.82, + "fp16": 267.31, + "fp32": "", "bf16": "" } ], @@ -6778,7 +6778,7 @@ "Precisions": [ { "int4": "", - "int8": "", + "int8": 3.87, "fp16": "", "fp32": "", "bf16": "" @@ -6790,19 +6790,19 @@ } }, { - "Platform": "Intel® Core™ i5-1335U Processor CPU+iGPU", - "Model": "ssd-resnet34-1200", + "Platform": "Intel® Processor N100 iGPU-only", + "Model": "resnet-50", "featured_SKU": false, "whats_new_model": false, - "PlatformType": "Intel® Core™, CPU+iGPU", + "PlatformType": "Intel® Atom™, iGPU-only", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 4.74, - "fp16": "", - "fp32": 1.74, + "int8": 81.79, + "fp16": 49.77, + "fp32": "", "bf16": "" } ], @@ -6813,7 +6813,7 @@ "Precisions": [ { "int4": "", - "int8": "", + "int8": 13.19, "fp16": "", "fp32": "", "bf16": "" @@ -6825,19 +6825,19 @@ } }, { - "Platform": "Intel® Core™ i5-1335U Processor CPU+iGPU", - "Model": "ssd_mobilenet_v1_coco", + "Platform": "Intel® Processor N100 iGPU-only", + "Model": "ssd-resnet34-1200", "featured_SKU": false, "whats_new_model": false, - "PlatformType": "Intel® Core™, CPU+iGPU", + "PlatformType": "Intel® Atom™, iGPU-only", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 466.65, - "fp16": "", - "fp32": 188.83, + "int8": 1.62, + "fp16": 1.01, + "fp32": "", "bf16": "" } ], @@ -6848,7 +6848,7 @@ "Precisions": [ { "int4": "", - "int8": "", + "int8": 623.09, "fp16": "", "fp32": "", "bf16": "" @@ -6860,19 +6860,19 @@ } }, { - "Platform": "Intel® Core™ i5-1335U Processor CPU+iGPU", - "Model": "yolo11", + "Platform": "Intel® Processor N100 iGPU-only", + "Model": "ssd_mobilenet_v1_coco", "featured_SKU": false, - "whats_new_model": "false", - "PlatformType": "Intel® Core™, CPU+iGPU", + "whats_new_model": false, + "PlatformType": "Intel® Atom™, iGPU-only", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": "", - "fp16": "", - "fp32": 65.34, + "int8": 164.45, + "fp16": 106.87, + "fp32": "", "bf16": "" } ], @@ -6883,7 +6883,7 @@ "Precisions": [ { "int4": "", - "int8": "", + "int8": 7.39, "fp16": "", "fp32": "", "bf16": "" @@ -6895,19 +6895,19 @@ } }, { - "Platform": "Intel® Core™ i5-1335U Processor CPU+iGPU", + "Platform": "Intel® Processor N100 iGPU-only", "Model": "yolo_v8n", "featured_SKU": false, "whats_new_model": false, - "PlatformType": "Intel® Core™, CPU+iGPU", + "PlatformType": "Intel® Atom™, iGPU-only", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 125.18, - "fp16": "", - "fp32": 58.13, + "int8": 47.02, + "fp16": 34.98, + "fp32": "", "bf16": "" } ], @@ -6918,7 +6918,7 @@ "Precisions": [ { "int4": "", - "int8": "", + "int8": 23.09, "fp16": "", "fp32": "", "bf16": 
"" @@ -6930,19 +6930,19 @@ } }, { - "Platform": "Intel® Core™ i5-1335U Processor CPU-only", - "Model": "bert-base-cased", + "Platform": "Intel® Processor N100 CPU-only", + "Model": "efficientdet-d0", "featured_SKU": false, "whats_new_model": false, - "PlatformType": "Intel® Core™, CPU-only", + "PlatformType": "Intel® Atom™, CPU-only", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 39.97, + "int8": 15.43, "fp16": "", - "fp32": 15.97, + "fp32": 12.72, "bf16": "" } ], @@ -6953,7 +6953,7 @@ "Precisions": [ { "int4": "", - "int8": 40.14, + "int8": 66.37, "fp16": "", "fp32": "", "bf16": "" @@ -6965,19 +6965,19 @@ } }, { - "Platform": "Intel® Core™ i5-1335U Processor CPU-only", - "Model": "efficientdet-d0", + "Platform": "Intel® Processor N100 CPU-only", + "Model": "mobilenet-v2", "featured_SKU": false, "whats_new_model": false, - "PlatformType": "Intel® Core™, CPU-only", + "PlatformType": "Intel® Atom™, CPU-only", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 56.15, + "int8": 296.05, "fp16": "", - "fp32": 35.76, + "fp32": 182.7, "bf16": "" } ], @@ -6988,7 +6988,7 @@ "Precisions": [ { "int4": "", - "int8": 28.73, + "int8": 3.82, "fp16": "", "fp32": "", "bf16": "" @@ -7000,19 +7000,19 @@ } }, { - "Platform": "Intel® Core™ i5-1335U Processor CPU-only", - "Model": "mask_rcnn_resnet50_atrous_coco", + "Platform": "Intel® Processor N100 CPU-only", + "Model": "resnet-50", "featured_SKU": false, "whats_new_model": false, - "PlatformType": "Intel® Core™, CPU-only", + "PlatformType": "Intel® Atom™, CPU-only", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 0.57, + "int8": 48.69, "fp16": "", - "fp32": 0.16, + "fp32": 20.16, "bf16": "" } ], @@ -7023,7 +7023,7 @@ "Precisions": [ { "int4": "", - "int8": 2069.28, + "int8": 21.9, "fp16": "", "fp32": "", "bf16": "" @@ -7035,19 +7035,19 @@ } }, { - "Platform": "Intel® Core™ i5-1335U Processor CPU-only", - "Model": "mobilenet-v2", + "Platform": "Intel® Processor N100 CPU-only", + "Model": "ssd-resnet34-1200", "featured_SKU": false, "whats_new_model": false, - "PlatformType": "Intel® Core™, CPU-only", + "PlatformType": "Intel® Atom™, CPU-only", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 951.93, + "int8": 0.81, "fp16": "", - "fp32": 463.06, + "fp32": 0.31, "bf16": "" } ], @@ -7058,7 +7058,7 @@ "Precisions": [ { "int4": "", - "int8": 1.74, + "int8": 1223.75, "fp16": "", "fp32": "", "bf16": "" @@ -7070,19 +7070,19 @@ } }, { - "Platform": "Intel® Core™ i5-1335U Processor CPU-only", - "Model": "resnet-50", + "Platform": "Intel® Processor N100 CPU-only", + "Model": "ssd_mobilenet_v1_coco", "featured_SKU": false, "whats_new_model": false, - "PlatformType": "Intel® Core™, CPU-only", + "PlatformType": "Intel® Atom™, CPU-only", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 184.54, + "int8": 105.91, "fp16": "", - "fp32": 52.88, + "fp32": 49.22, "bf16": "" } ], @@ -7093,7 +7093,7 @@ "Precisions": [ { "int4": "", - "int8": 9.61, + "int8": 9.72, "fp16": "", "fp32": "", "bf16": "" @@ -7105,19 +7105,19 @@ } }, { - "Platform": "Intel® Core™ i5-1335U Processor CPU-only", - "Model": "ssd-resnet34-1200", + "Platform": "Intel® Processor N100 CPU-only", + "Model": "yolo11", "featured_SKU": false, - "whats_new_model": false, - "PlatformType": "Intel® Core™, CPU-only", + "whats_new_model": true, + "PlatformType": "Intel® Atom™, CPU-only", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 3.16, + "int8": "", "fp16": "", - "fp32": 0.92, + 
"fp32": 15.38, "bf16": "" } ], @@ -7128,7 +7128,7 @@ "Precisions": [ { "int4": "", - "int8": 466.34, + "int8": "", "fp16": "", "fp32": "", "bf16": "" @@ -7140,19 +7140,19 @@ } }, { - "Platform": "Intel® Core™ i5-1335U Processor CPU-only", - "Model": "ssd_mobilenet_v1_coco", + "Platform": "Intel® Processor N100 CPU-only", + "Model": "yolo_v8n", "featured_SKU": false, "whats_new_model": false, - "PlatformType": "Intel® Core™, CPU-only", + "PlatformType": "Intel® Atom™, CPU-only", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 383.62, + "int8": 23.61, "fp16": "", - "fp32": 134.93, + "fp32": 12.8, "bf16": "" } ], @@ -7163,7 +7163,7 @@ "Precisions": [ { "int4": "", - "int8": 4.16, + "int8": 43.43, "fp16": "", "fp32": "", "bf16": "" @@ -7175,19 +7175,19 @@ } }, { - "Platform": "Intel® Core™ i5-1335U Processor CPU-only", - "Model": "yolo11", + "Platform": "Intel® Xeon® Gold 5218T CPU-only", + "Model": "bert-base-cased", "featured_SKU": false, - "whats_new_model": "false", - "PlatformType": "Intel® Core™, CPU-only", + "whats_new_model": false, + "PlatformType": "Intel® Xeon®, CPU-only", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": "", + "int8": 220.43, "fp16": "", - "fp32": 43.64, + "fp32": 80.48, "bf16": "" } ], @@ -7198,7 +7198,7 @@ "Precisions": [ { "int4": "", - "int8": "", + "int8": 14.26, "fp16": "", "fp32": "", "bf16": "" @@ -7210,19 +7210,19 @@ } }, { - "Platform": "Intel® Core™ i5-1335U Processor CPU-only", - "Model": "yolo_v8n", + "Platform": "Intel® Xeon® Gold 5218T CPU-only", + "Model": "efficientdet-d0", "featured_SKU": false, "whats_new_model": false, - "PlatformType": "Intel® Core™, CPU-only", + "PlatformType": "Intel® Xeon®, CPU-only", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 91.3, + "int8": 271.82, "fp16": "", - "fp32": 36.39, + "fp32": 167.4, "bf16": "" } ], @@ -7233,7 +7233,7 @@ "Precisions": [ { "int4": "", - "int8": 18.15, + "int8": 11.27, "fp16": "", "fp32": "", "bf16": "" @@ -7245,19 +7245,19 @@ } }, { - "Platform": "Intel® Core™ i5-1335U Processor iGPU-only", - "Model": "bert-base-cased", + "Platform": "Intel® Xeon® Gold 5218T CPU-only", + "Model": "mask_rcnn_resnet50_atrous_coco", "featured_SKU": false, "whats_new_model": false, - "PlatformType": "Intel® Core™, iGPU-only", + "PlatformType": "Intel® Xeon®, CPU-only", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 47.17, - "fp16": 39.79, - "fp32": "", + "int8": 3.27, + "fp16": "", + "fp32": 0.91, "bf16": "" } ], @@ -7268,7 +7268,7 @@ "Precisions": [ { "int4": "", - "int8": 18.45, + "int8": 636.88, "fp16": "", "fp32": "", "bf16": "" @@ -7280,19 +7280,19 @@ } }, { - "Platform": "Intel® Core™ i5-1335U Processor iGPU-only", - "Model": "efficientdet-d0", + "Platform": "Intel® Xeon® Gold 5218T CPU-only", + "Model": "mobilenet-v2", "featured_SKU": false, "whats_new_model": false, - "PlatformType": "Intel® Core™, iGPU-only", + "PlatformType": "Intel® Xeon®, CPU-only", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 80.6, - "fp16": 59.92, - "fp32": "", + "int8": 5423.43, + "fp16": "", + "fp32": 1935.82, "bf16": "" } ], @@ -7303,7 +7303,7 @@ "Precisions": [ { "int4": "", - "int8": 14.61, + "int8": 1.5, "fp16": "", "fp32": "", "bf16": "" @@ -7315,19 +7315,19 @@ } }, { - "Platform": "Intel® Core™ i5-1335U Processor iGPU-only", - "Model": "mask_rcnn_resnet50_atrous_coco", + "Platform": "Intel® Xeon® Gold 5218T CPU-only", + "Model": "resnet-50", "featured_SKU": false, "whats_new_model": false, - 
"PlatformType": "Intel® Core™, iGPU-only", + "PlatformType": "Intel® Xeon®, CPU-only", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 0.52, - "fp16": 0.58, - "fp32": "", + "int8": 974.84, + "fp16": "", + "fp32": 269.18, "bf16": "" } ], @@ -7338,7 +7338,7 @@ "Precisions": [ { "int4": "", - "int8": 1506.76, + "int8": 3.11, "fp16": "", "fp32": "", "bf16": "" @@ -7350,19 +7350,19 @@ } }, { - "Platform": "Intel® Core™ i5-1335U Processor iGPU-only", - "Model": "mobilenet-v2", + "Platform": "Intel® Xeon® Gold 5218T CPU-only", + "Model": "ssd-resnet34-1200", "featured_SKU": false, "whats_new_model": false, - "PlatformType": "Intel® Core™, iGPU-only", + "PlatformType": "Intel® Xeon®, CPU-only", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 778.4, - "fp16": 509.56, - "fp32": "", + "int8": 17.65, + "fp16": "", + "fp32": 4.59, "bf16": "" } ], @@ -7373,7 +7373,7 @@ "Precisions": [ { "int4": "", - "int8": 1.48, + "int8": 116.16, "fp16": "", "fp32": "", "bf16": "" @@ -7385,19 +7385,19 @@ } }, { - "Platform": "Intel® Core™ i5-1335U Processor iGPU-only", - "Model": "resnet-50", + "Platform": "Intel® Xeon® Gold 5218T CPU-only", + "Model": "ssd_mobilenet_v1_coco", "featured_SKU": false, "whats_new_model": false, - "PlatformType": "Intel® Core™, iGPU-only", + "PlatformType": "Intel® Xeon®, CPU-only", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 225.12, - "fp16": 127.27, - "fp32": "", + "int8": 2101.22, + "fp16": "", + "fp32": 639.42, "bf16": "" } ], @@ -7408,7 +7408,7 @@ "Precisions": [ { "int4": "", - "int8": 4.31, + "int8": 1.6, "fp16": "", "fp32": "", "bf16": "" @@ -7420,54 +7420,19 @@ } }, { - "Platform": "Intel® Core™ i5-1335U Processor iGPU-only", - "Model": "ssd-resnet34-1200", + "Platform": "Intel® Xeon® Gold 5218T CPU-only", + "Model": "yolo11", "featured_SKU": false, - "whats_new_model": false, - "PlatformType": "Intel® Core™, iGPU-only", + "whats_new_model": true, + "PlatformType": "Intel® Xeon®, CPU-only", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 5.79, - "fp16": 2.86, - "fp32": "", - "bf16": "" - } - ], - "Unit": "FPS", - "UnitDesc": "higher is better" - }, - "latency": { - "Precisions": [ - { - "int4": "", - "int8": 144.71, + "int8": "", "fp16": "", - "fp32": "", - "bf16": "" - } - ], - "Unit": "ms", - "UnitDesc": "lower is better" - } - } - }, - { - "Platform": "Intel® Core™ i5-1335U Processor iGPU-only", - "Model": "ssd_mobilenet_v1_coco", - "featured_SKU": false, - "whats_new_model": false, - "PlatformType": "Intel® Core™, iGPU-only", - "Parameters": { - "throughput": { - "Precisions": [ - { - "int4": "", - "int8": 404.76, - "fp16": 237.61, - "fp32": "", + "fp32": 206.74, "bf16": "" } ], @@ -7478,7 +7443,7 @@ "Precisions": [ { "int4": "", - "int8": 2.75, + "int8": "", "fp16": "", "fp32": "", "bf16": "" @@ -7490,19 +7455,19 @@ } }, { - "Platform": "Intel® Core™ i5-1335U Processor iGPU-only", + "Platform": "Intel® Xeon® Gold 5218T CPU-only", "Model": "yolo_v8n", "featured_SKU": false, "whats_new_model": false, - "PlatformType": "Intel® Core™, iGPU-only", + "PlatformType": "Intel® Xeon®, CPU-only", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 131.89, - "fp16": 83.17, - "fp32": "", + "int8": 440.68, + "fp16": "", + "fp32": 173.86, "bf16": "" } ], @@ -7513,7 +7478,7 @@ "Precisions": [ { "int4": "", - "int8": 7.11, + "int8": 6.0, "fp16": "", "fp32": "", "bf16": "" @@ -7525,19 +7490,19 @@ } }, { - "Platform": "Intel® Core™ i5-13600K CPU-only", + "Platform": "Intel® 
Xeon® Gold 6238L CPU-only", "Model": "bert-base-cased", "featured_SKU": false, "whats_new_model": false, - "PlatformType": "Intel® Core™, CPU-only", + "PlatformType": "Intel® Xeon®, CPU-only", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 120.44, + "int8": 425.03, "fp16": "", - "fp32": 47.21, + "fp32": 163.81, "bf16": "" } ], @@ -7548,7 +7513,7 @@ "Precisions": [ { "int4": "", - "int8": 13.32, + "int8": 11.11, "fp16": "", "fp32": "", "bf16": "" @@ -7560,19 +7525,19 @@ } }, { - "Platform": "Intel® Core™ i5-13600K CPU-only", + "Platform": "Intel® Xeon® Gold 6238L CPU-only", "Model": "efficientdet-d0", "featured_SKU": false, "whats_new_model": false, - "PlatformType": "Intel® Core™, CPU-only", + "PlatformType": "Intel® Xeon®, CPU-only", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 148.91, + "int8": 411.76, "fp16": "", - "fp32": 93.08, + "fp32": 254.31, "bf16": "" } ], @@ -7583,7 +7548,7 @@ "Precisions": [ { "int4": "", - "int8": 9.22, + "int8": 8.6, "fp16": "", "fp32": "", "bf16": "" @@ -7595,19 +7560,19 @@ } }, { - "Platform": "Intel® Core™ i5-13600K CPU-only", + "Platform": "Intel® Xeon® Gold 6238L CPU-only", "Model": "mask_rcnn_resnet50_atrous_coco", "featured_SKU": false, "whats_new_model": false, - "PlatformType": "Intel® Core™, CPU-only", + "PlatformType": "Intel® Xeon®, CPU-only", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": "", + "int8": 6.56, "fp16": "", - "fp32": 0.49, + "fp32": 1.64, "bf16": "" } ], @@ -7618,7 +7583,7 @@ "Precisions": [ { "int4": "", - "int8": 733.91, + "int8": 324.41, "fp16": "", "fp32": "", "bf16": "" @@ -7630,19 +7595,19 @@ } }, { - "Platform": "Intel® Core™ i5-13600K CPU-only", + "Platform": "Intel® Xeon® Gold 6238L CPU-only", "Model": "mobilenet-v2", "featured_SKU": false, "whats_new_model": false, - "PlatformType": "Intel® Core™, CPU-only", + "PlatformType": "Intel® Xeon®, CPU-only", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 2974.41, + "int8": 10309.85, "fp16": "", - "fp32": 1317.04, + "fp32": 3324.77, "bf16": "" } ], @@ -7653,7 +7618,7 @@ "Precisions": [ { "int4": "", - "int8": 0.69, + "int8": 1.18, "fp16": "", "fp32": "", "bf16": "" @@ -7665,19 +7630,19 @@ } }, { - "Platform": "Intel® Core™ i5-13600K CPU-only", + "Platform": "Intel® Xeon® Gold 6238L CPU-only", "Model": "resnet-50", "featured_SKU": false, "whats_new_model": false, - "PlatformType": "Intel® Core™, CPU-only", + "PlatformType": "Intel® Xeon®, CPU-only", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 537.98, + "int8": 2132.35, "fp16": "", - "fp32": 148.85, + "fp32": 569.56, "bf16": "" } ], @@ -7688,7 +7653,7 @@ "Precisions": [ { "int4": "", - "int8": 2.82, + "int8": 1.85, "fp16": "", "fp32": "", "bf16": "" @@ -7700,19 +7665,19 @@ } }, { - "Platform": "Intel® Core™ i5-13600K CPU-only", + "Platform": "Intel® Xeon® Gold 6238L CPU-only", "Model": "ssd-resnet34-1200", "featured_SKU": false, "whats_new_model": false, - "PlatformType": "Intel® Core™, CPU-only", + "PlatformType": "Intel® Xeon®, CPU-only", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 8.8, + "int8": 41.73, "fp16": "", - "fp32": 2.47, + "fp32": 10.88, "bf16": "" } ], @@ -7723,7 +7688,7 @@ "Precisions": [ { "int4": "", - "int8": 133.73, + "int8": 49.6, "fp16": "", "fp32": "", "bf16": "" @@ -7735,19 +7700,19 @@ } }, { - "Platform": "Intel® Core™ i5-13600K CPU-only", + "Platform": "Intel® Xeon® Gold 6238L CPU-only", "Model": "ssd_mobilenet_v1_coco", "featured_SKU": false, 
"whats_new_model": false, - "PlatformType": "Intel® Core™, CPU-only", + "PlatformType": "Intel® Xeon®, CPU-only", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 1068.19, + "int8": 4373.09, "fp16": "", - "fp32": 379.85, + "fp32": 1243.88, "bf16": "" } ], @@ -7758,7 +7723,7 @@ "Precisions": [ { "int4": "", - "int8": 1.33, + "int8": 1.28, "fp16": "", "fp32": "", "bf16": "" @@ -7770,11 +7735,11 @@ } }, { - "Platform": "Intel® Core™ i5-13600K CPU-only", + "Platform": "Intel® Xeon® Gold 6238L CPU-only", "Model": "yolo11", "featured_SKU": false, - "whats_new_model": "false", - "PlatformType": "Intel® Core™, CPU-only", + "whats_new_model": true, + "PlatformType": "Intel® Xeon®, CPU-only", "Parameters": { "throughput": { "Precisions": [ @@ -7782,7 +7747,7 @@ "int4": "", "int8": "", "fp16": "", - "fp32": 122.62, + "fp32": 383.5, "bf16": "" } ], @@ -7805,19 +7770,19 @@ } }, { - "Platform": "Intel® Core™ i5-13600K CPU-only", + "Platform": "Intel® Xeon® Gold 6238L CPU-only", "Model": "yolo_v8n", "featured_SKU": false, "whats_new_model": false, - "PlatformType": "Intel® Core™, CPU-only", + "PlatformType": "Intel® Xeon®, CPU-only", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 266.57, + "int8": 736.96, "fp16": "", - "fp32": 102.14, + "fp32": 338.37, "bf16": "" } ], @@ -7828,7 +7793,7 @@ "Precisions": [ { "int4": "", - "int8": 5.27, + "int8": 4.23, "fp16": "", "fp32": "", "bf16": "" @@ -7840,19 +7805,19 @@ } }, { - "Platform": "Intel® Core™ i7-1185G7 CPU+iGPU", + "Platform": "Intel® Xeon® Gold 6338N CPU-only", "Model": "bert-base-cased", "featured_SKU": false, "whats_new_model": false, - "PlatformType": "Intel® Core™, CPU+iGPU", + "PlatformType": "Intel® Xeon®, CPU-only", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 84.71, + "int8": 622.89, "fp16": "", - "fp32": 51.06, + "fp32": 241.06, "bf16": "" } ], @@ -7863,7 +7828,7 @@ "Precisions": [ { "int4": "", - "int8": "", + "int8": 6.67, "fp16": "", "fp32": "", "bf16": "" @@ -7875,19 +7840,19 @@ } }, { - "Platform": "Intel® Core™ i7-1185G7 CPU+iGPU", + "Platform": "Intel® Xeon® Gold 6338N CPU-only", "Model": "efficientdet-d0", "featured_SKU": false, "whats_new_model": false, - "PlatformType": "Intel® Core™, CPU+iGPU", + "PlatformType": "Intel® Xeon®, CPU-only", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 98.02, + "int8": 722.69, "fp16": "", - "fp32": 65.51, + "fp32": 422.11, "bf16": "" } ], @@ -7898,7 +7863,7 @@ "Precisions": [ { "int4": "", - "int8": "", + "int8": 4.91, "fp16": "", "fp32": "", "bf16": "" @@ -7910,19 +7875,19 @@ } }, { - "Platform": "Intel® Core™ i7-1185G7 CPU+iGPU", + "Platform": "Intel® Xeon® Gold 6338N CPU-only", "Model": "mask_rcnn_resnet50_atrous_coco", "featured_SKU": false, "whats_new_model": false, - "PlatformType": "Intel® Core™, CPU+iGPU", + "PlatformType": "Intel® Xeon®, CPU-only", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 1.16, + "int8": 10.35, "fp16": "", - "fp32": 0.64, + "fp32": 2.43, "bf16": "" } ], @@ -7933,7 +7898,7 @@ "Precisions": [ { "int4": "", - "int8": "", + "int8": 226.02, "fp16": "", "fp32": "", "bf16": "" @@ -7945,19 +7910,19 @@ } }, { - "Platform": "Intel® Core™ i7-1185G7 CPU+iGPU", + "Platform": "Intel® Xeon® Gold 6338N CPU-only", "Model": "mobilenet-v2", "featured_SKU": false, "whats_new_model": false, - "PlatformType": "Intel® Core™, CPU+iGPU", + "PlatformType": "Intel® Xeon®, CPU-only", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 1353.32, + "int8": 
16483.42, "fp16": "", - "fp32": 683.15, + "fp32": 5183.13, "bf16": "" } ], @@ -7968,7 +7933,7 @@ "Precisions": [ { "int4": "", - "int8": "", + "int8": 0.6, "fp16": "", "fp32": "", "bf16": "" @@ -7980,19 +7945,19 @@ } }, { - "Platform": "Intel® Core™ i7-1185G7 CPU+iGPU", + "Platform": "Intel® Xeon® Gold 6338N CPU-only", "Model": "resnet-50", "featured_SKU": false, "whats_new_model": false, - "PlatformType": "Intel® Core™, CPU+iGPU", + "PlatformType": "Intel® Xeon®, CPU-only", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 365.63, + "int8": 3360.26, "fp16": "", - "fp32": 164.12, + "fp32": 826.72, "bf16": "" } ], @@ -8003,7 +7968,7 @@ "Precisions": [ { "int4": "", - "int8": "", + "int8": 1.35, "fp16": "", "fp32": "", "bf16": "" @@ -8015,19 +7980,19 @@ } }, { - "Platform": "Intel® Core™ i7-1185G7 CPU+iGPU", + "Platform": "Intel® Xeon® Gold 6338N CPU-only", "Model": "ssd-resnet34-1200", "featured_SKU": false, "whats_new_model": false, - "PlatformType": "Intel® Core™, CPU+iGPU", + "PlatformType": "Intel® Xeon®, CPU-only", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 8.65, + "int8": 60.79, "fp16": "", - "fp32": 3.77, + "fp32": 15.05, "bf16": "" } ], @@ -8038,7 +8003,7 @@ "Precisions": [ { "int4": "", - "int8": "", + "int8": 36.9, "fp16": "", "fp32": "", "bf16": "" @@ -8050,19 +8015,19 @@ } }, { - "Platform": "Intel® Core™ i7-1185G7 CPU+iGPU", + "Platform": "Intel® Xeon® Gold 6338N CPU-only", "Model": "ssd_mobilenet_v1_coco", "featured_SKU": false, "whats_new_model": false, - "PlatformType": "Intel® Core™, CPU+iGPU", + "PlatformType": "Intel® Xeon®, CPU-only", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 657.26, + "int8": 6964.97, "fp16": "", - "fp32": 293.93, + "fp32": 1758.94, "bf16": "" } ], @@ -8073,7 +8038,7 @@ "Precisions": [ { "int4": "", - "int8": "", + "int8": 0.77, "fp16": "", "fp32": "", "bf16": "" @@ -8085,11 +8050,11 @@ } }, { - "Platform": "Intel® Core™ i7-1185G7 CPU+iGPU", + "Platform": "Intel® Xeon® Gold 6338N CPU-only", "Model": "yolo11", "featured_SKU": false, - "whats_new_model": "false", - "PlatformType": "Intel® Core™, CPU+iGPU", + "whats_new_model": true, + "PlatformType": "Intel® Xeon®, CPU-only", "Parameters": { "throughput": { "Precisions": [ @@ -8097,7 +8062,7 @@ "int4": "", "int8": "", "fp16": "", - "fp32": 107.24, + "fp32": 570.41, "bf16": "" } ], @@ -8120,19 +8085,19 @@ } }, { - "Platform": "Intel® Core™ i7-1185G7 CPU+iGPU", + "Platform": "Intel® Xeon® Gold 6338N CPU-only", "Model": "yolo_v8n", "featured_SKU": false, "whats_new_model": false, - "PlatformType": "Intel® Core™, CPU+iGPU", + "PlatformType": "Intel® Xeon®, CPU-only", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 182.9, + "int8": 1225.71, "fp16": "", - "fp32": 101.97, + "fp32": 494.95, "bf16": "" } ], @@ -8143,7 +8108,7 @@ "Precisions": [ { "int4": "", - "int8": "", + "int8": 3.02, "fp16": "", "fp32": "", "bf16": "" @@ -8155,19 +8120,19 @@ } }, { - "Platform": "Intel® Core™ i7-1185G7 CPU-only", + "Platform": "Intel® Xeon® Platinum 8280 CPU-only", "Model": "bert-base-cased", "featured_SKU": false, "whats_new_model": false, - "PlatformType": "Intel® Core™, CPU-only", + "PlatformType": "Intel® Xeon®, CPU-only", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 50.21, + "int8": 590.91, "fp16": "", - "fp32": 18.33, + "fp32": 225.85, "bf16": "" } ], @@ -8178,7 +8143,7 @@ "Precisions": [ { "int4": "", - "int8": 22.66, + "int8": 9.06, "fp16": "", "fp32": "", "bf16": "" @@ -8190,19 +8155,19 
@@ } }, { - "Platform": "Intel® Core™ i7-1185G7 CPU-only", + "Platform": "Intel® Xeon® Platinum 8280 CPU-only", "Model": "efficientdet-d0", "featured_SKU": false, "whats_new_model": false, - "PlatformType": "Intel® Core™, CPU-only", + "PlatformType": "Intel® Xeon®, CPU-only", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 71.27, + "int8": 583.69, "fp16": "", - "fp32": 41.39, + "fp32": 344.05, "bf16": "" } ], @@ -8213,7 +8178,7 @@ "Precisions": [ { "int4": "", - "int8": 14.62, + "int8": 6.88, "fp16": "", "fp32": "", "bf16": "" @@ -8225,19 +8190,19 @@ } }, { - "Platform": "Intel® Core™ i7-1185G7 CPU-only", + "Platform": "Intel® Xeon® Platinum 8280 CPU-only", "Model": "mask_rcnn_resnet50_atrous_coco", "featured_SKU": false, "whats_new_model": false, - "PlatformType": "Intel® Core™, CPU-only", + "PlatformType": "Intel® Xeon®, CPU-only", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 0.71, + "int8": 8.61, "fp16": "", - "fp32": 0.19, + "fp32": 2.26, "bf16": "" } ], @@ -8248,7 +8213,7 @@ "Precisions": [ { "int4": "", - "int8": 1361.21, + "int8": 251.68, "fp16": "", "fp32": "", "bf16": "" @@ -8260,19 +8225,19 @@ } }, { - "Platform": "Intel® Core™ i7-1185G7 CPU-only", + "Platform": "Intel® Xeon® Platinum 8280 CPU-only", "Model": "mobilenet-v2", "featured_SKU": false, "whats_new_model": false, - "PlatformType": "Intel® Core™, CPU-only", + "PlatformType": "Intel® Xeon®, CPU-only", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 1291.06, + "int8": 14974.1, "fp16": "", - "fp32": 507.09, + "fp32": 4631.52, "bf16": "" } ], @@ -8283,7 +8248,7 @@ "Precisions": [ { "int4": "", - "int8": 0.95, + "int8": 0.93, "fp16": "", "fp32": "", "bf16": "" @@ -8295,19 +8260,19 @@ } }, { - "Platform": "Intel® Core™ i7-1185G7 CPU-only", + "Platform": "Intel® Xeon® Platinum 8280 CPU-only", "Model": "resnet-50", "featured_SKU": false, "whats_new_model": false, - "PlatformType": "Intel® Core™, CPU-only", + "PlatformType": "Intel® Xeon®, CPU-only", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 224.68, + "int8": 2951.09, "fp16": "", - "fp32": 60.81, + "fp32": 758.44, "bf16": "" } ], @@ -8318,7 +8283,7 @@ "Precisions": [ { "int4": "", - "int8": 4.95, + "int8": 1.58, "fp16": "", "fp32": "", "bf16": "" @@ -8330,19 +8295,19 @@ } }, { - "Platform": "Intel® Core™ i7-1185G7 CPU-only", + "Platform": "Intel® Xeon® Platinum 8280 CPU-only", "Model": "ssd-resnet34-1200", "featured_SKU": false, "whats_new_model": false, - "PlatformType": "Intel® Core™, CPU-only", + "PlatformType": "Intel® Xeon®, CPU-only", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 3.84, + "int8": 57.9, "fp16": "", - "fp32": 1.01, + "fp32": 14.99, "bf16": "" } ], @@ -8353,7 +8318,7 @@ "Precisions": [ { "int4": "", - "int8": 250.45, + "int8": 37.3, "fp16": "", "fp32": "", "bf16": "" @@ -8365,19 +8330,19 @@ } }, { - "Platform": "Intel® Core™ i7-1185G7 CPU-only", + "Platform": "Intel® Xeon® Platinum 8280 CPU-only", "Model": "ssd_mobilenet_v1_coco", "featured_SKU": false, "whats_new_model": false, - "PlatformType": "Intel® Core™, CPU-only", + "PlatformType": "Intel® Xeon®, CPU-only", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 491.99, + "int8": 6130.79, "fp16": "", - "fp32": 146.3, + "fp32": 1659.32, "bf16": "" } ], @@ -8388,7 +8353,7 @@ "Precisions": [ { "int4": "", - "int8": 2.2, + "int8": 1.19, "fp16": "", "fp32": "", "bf16": "" @@ -8400,11 +8365,11 @@ } }, { - "Platform": "Intel® Core™ i7-1185G7 CPU-only", + "Platform": 
"Intel® Xeon® Platinum 8280 CPU-only", "Model": "yolo11", "featured_SKU": false, - "whats_new_model": "false", - "PlatformType": "Intel® Core™, CPU-only", + "whats_new_model": true, + "PlatformType": "Intel® Xeon®, CPU-only", "Parameters": { "throughput": { "Precisions": [ @@ -8412,7 +8377,7 @@ "int4": "", "int8": "", "fp16": "", - "fp32": 48.0, + "fp32": 512.68, "bf16": "" } ], @@ -8435,19 +8400,19 @@ } }, { - "Platform": "Intel® Core™ i7-1185G7 CPU-only", + "Platform": "Intel® Xeon® Platinum 8280 CPU-only", "Model": "yolo_v8n", "featured_SKU": false, "whats_new_model": false, - "PlatformType": "Intel® Core™, CPU-only", + "PlatformType": "Intel® Xeon®, CPU-only", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 106.45, + "int8": 1008.45, "fp16": "", - "fp32": 40.14, + "fp32": 451.77, "bf16": "" } ], @@ -8458,7 +8423,7 @@ "Precisions": [ { "int4": "", - "int8": 10.2, + "int8": 3.6, "fp16": "", "fp32": "", "bf16": "" @@ -8470,19 +8435,19 @@ } }, { - "Platform": "Intel® Core™ i7-1185G7 iGPU-only", + "Platform": "Intel® Xeon® Platinum 8380 CPU-only", "Model": "bert-base-cased", "featured_SKU": false, "whats_new_model": false, - "PlatformType": "Intel® Core™, iGPU-only", + "PlatformType": "Intel® Xeon®, CPU-only", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 68.4, - "fp16": 53.22, - "fp32": "", + "int8": 877.5, + "fp16": "", + "fp32": 337.07, "bf16": "" } ], @@ -8493,7 +8458,7 @@ "Precisions": [ { "int4": "", - "int8": 17.09, + "int8": 5.18, "fp16": "", "fp32": "", "bf16": "" @@ -8505,19 +8470,19 @@ } }, { - "Platform": "Intel® Core™ i7-1185G7 iGPU-only", + "Platform": "Intel® Xeon® Platinum 8380 CPU-only", "Model": "efficientdet-d0", "featured_SKU": false, "whats_new_model": false, - "PlatformType": "Intel® Core™, iGPU-only", + "PlatformType": "Intel® Xeon®, CPU-only", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 91.46, - "fp16": 72.22, - "fp32": "", + "int8": 1004.95, + "fp16": "", + "fp32": 577.33, "bf16": "" } ], @@ -8528,7 +8493,7 @@ "Precisions": [ { "int4": "", - "int8": 17.92, + "int8": 4.33, "fp16": "", "fp32": "", "bf16": "" @@ -8540,31 +8505,31 @@ } }, { - "Platform": "Intel® Core™ i7-1185G7 iGPU-only", - "Model": "mask_rcnn_resnet50_atrous_coco", + "Platform": "Intel® Xeon® Platinum 8380 CPU-only", + "Model": "gemma-2-9b", "featured_SKU": false, - "whats_new_model": false, - "PlatformType": "Intel® Core™, iGPU-only", + "whats_new_model": true, + "PlatformType": "Intel® Xeon®, CPU-only", "Parameters": { "throughput": { "Precisions": [ { - "int4": "", - "int8": 0.82, - "fp16": 0.88, + "int4": 20.93, + "int8": 14.24, + "fp16": 7.69, "fp32": "", "bf16": "" } ], - "Unit": "FPS", + "Unit": "Tokens per sec", "UnitDesc": "higher is better" }, "latency": { "Precisions": [ { - "int4": "", - "int8": 1113.84, - "fp16": "", + "int4": 47.77, + "int8": 70.2, + "fp16": 130.02, "fp32": "", "bf16": "" } @@ -8575,31 +8540,31 @@ } }, { - "Platform": "Intel® Core™ i7-1185G7 iGPU-only", - "Model": "mobilenet-v2", + "Platform": "Intel® Xeon® Platinum 8380 CPU-only", + "Model": "glm-4-9b-chat", "featured_SKU": false, - "whats_new_model": false, - "PlatformType": "Intel® Core™, iGPU-only", + "whats_new_model": "false", + "PlatformType": "Intel® Xeon®, CPU-only", "Parameters": { "throughput": { "Precisions": [ { - "int4": "", - "int8": 729.72, - "fp16": 569.2, + "int4": 23.5, + "int8": 15.51, + "fp16": 8.49, "fp32": "", "bf16": "" } ], - "Unit": "FPS", + "Unit": "Tokens per sec", "UnitDesc": "higher is better" }, "latency": { 
"Precisions": [ { - "int4": "", - "int8": 2.05, - "fp16": "", + "int4": 42.54, + "int8": 64.45, + "fp16": 117.72, "fp32": "", "bf16": "" } @@ -8610,31 +8575,31 @@ } }, { - "Platform": "Intel® Core™ i7-1185G7 iGPU-only", - "Model": "resnet-50", + "Platform": "Intel® Xeon® Platinum 8380 CPU-only", + "Model": "llama-2-7b-chat", "featured_SKU": false, "whats_new_model": false, - "PlatformType": "Intel® Core™, iGPU-only", + "PlatformType": "Intel® Xeon®, CPU-only", "Parameters": { "throughput": { "Precisions": [ { - "int4": "", - "int8": 262.94, - "fp16": 174.98, + "int4": 26.42, + "int8": 19.3, + "fp16": 10.54, "fp32": "", "bf16": "" } ], - "Unit": "FPS", + "Unit": "Tokens per sec", "UnitDesc": "higher is better" }, "latency": { "Precisions": [ { - "int4": "", - "int8": 4.82, - "fp16": "", + "int4": 37.84, + "int8": 51.81, + "fp16": 94.85, "fp32": "", "bf16": "" } @@ -8645,31 +8610,31 @@ } }, { - "Platform": "Intel® Core™ i7-1185G7 iGPU-only", - "Model": "ssd-resnet34-1200", + "Platform": "Intel® Xeon® Platinum 8380 CPU-only", + "Model": "llama-3-8b", "featured_SKU": false, - "whats_new_model": false, - "PlatformType": "Intel® Core™, iGPU-only", + "whats_new_model": "false", + "PlatformType": "Intel® Xeon®, CPU-only", "Parameters": { "throughput": { "Precisions": [ { - "int4": "", - "int8": 8.29, - "fp16": 4.67, + "int4": 26.48, + "int8": 17.82, + "fp16": 9.62, "fp32": "", "bf16": "" } ], - "Unit": "FPS", + "Unit": "Tokens per sec", "UnitDesc": "higher is better" }, "latency": { "Precisions": [ { - "int4": "", - "int8": 118.28, - "fp16": "", + "int4": 37.76, + "int8": 56.1, + "fp16": 103.89, "fp32": "", "bf16": "" } @@ -8680,31 +8645,31 @@ } }, { - "Platform": "Intel® Core™ i7-1185G7 iGPU-only", - "Model": "ssd_mobilenet_v1_coco", + "Platform": "Intel® Xeon® Platinum 8380 CPU-only", + "Model": "llama-3.2-3b-instruct", "featured_SKU": false, - "whats_new_model": false, - "PlatformType": "Intel® Core™, iGPU-only", + "whats_new_model": true, + "PlatformType": "Intel® Xeon®, CPU-only", "Parameters": { "throughput": { "Precisions": [ { - "int4": "", - "int8": 447.59, - "fp16": 299.29, + "int4": 49.95, + "int8": 34.9, + "fp16": 19.58, "fp32": "", "bf16": "" } ], - "Unit": "FPS", + "Unit": "Tokens per sec", "UnitDesc": "higher is better" }, "latency": { "Precisions": [ { - "int4": "", - "int8": 3.33, - "fp16": "", + "int4": 20.02, + "int8": 28.65, + "fp16": 51.05, "fp32": "", "bf16": "" } @@ -8715,19 +8680,19 @@ } }, { - "Platform": "Intel® Core™ i7-1185G7 iGPU-only", - "Model": "yolo_v8n", + "Platform": "Intel® Xeon® Platinum 8380 CPU-only", + "Model": "mask_rcnn_resnet50_atrous_coco", "featured_SKU": false, "whats_new_model": false, - "PlatformType": "Intel® Core™, iGPU-only", + "PlatformType": "Intel® Xeon®, CPU-only", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 161.26, - "fp16": 111.45, - "fp32": "", + "int8": 14.28, + "fp16": "", + "fp32": 3.4, "bf16": "" } ], @@ -8738,7 +8703,7 @@ "Precisions": [ { "int4": "", - "int8": 8.1, + "int8": 172.99, "fp16": "", "fp32": "", "bf16": "" @@ -8750,31 +8715,31 @@ } }, { - "Platform": "Intel® Core™ i7-1185GRE CPU+iGPU", - "Model": "bert-base-cased", + "Platform": "Intel® Xeon® Platinum 8380 CPU-only", + "Model": "mistral-7b-v0.1", "featured_SKU": false, "whats_new_model": false, - "PlatformType": "Intel® Core™, CPU+iGPU", + "PlatformType": "Intel® Xeon®, CPU-only", "Parameters": { "throughput": { "Precisions": [ { - "int4": "", - "int8": 50.01, - "fp16": "", - "fp32": 25.82, + "int4": 27.2, + "int8": 18.7, + "fp16": 10.15, + 
"fp32": "", "bf16": "" } ], - "Unit": "FPS", + "Unit": "Tokens per sec", "UnitDesc": "higher is better" }, "latency": { "Precisions": [ { - "int4": "", - "int8": "", - "fp16": "", + "int4": 36.76, + "int8": 53.45, + "fp16": 98.43, "fp32": "", "bf16": "" } @@ -8785,19 +8750,19 @@ } }, { - "Platform": "Intel® Core™ i7-1185GRE CPU+iGPU", - "Model": "efficientdet-d0", + "Platform": "Intel® Xeon® Platinum 8380 CPU-only", + "Model": "mobilenet-v2", "featured_SKU": false, "whats_new_model": false, - "PlatformType": "Intel® Core™, CPU+iGPU", + "PlatformType": "Intel® Xeon®, CPU-only", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 57.69, + "int8": 22593.46, "fp16": "", - "fp32": 28.41, + "fp32": 6937.81, "bf16": "" } ], @@ -8808,7 +8773,7 @@ "Precisions": [ { "int4": "", - "int8": "", + "int8": 0.58, "fp16": "", "fp32": "", "bf16": "" @@ -8820,31 +8785,31 @@ } }, { - "Platform": "Intel® Core™ i7-1185GRE CPU+iGPU", - "Model": "mask_rcnn_resnet50_atrous_coco", + "Platform": "Intel® Xeon® Platinum 8380 CPU-only", + "Model": "phi-3-mini-4k-instruct", "featured_SKU": false, - "whats_new_model": false, - "PlatformType": "Intel® Core™, CPU+iGPU", + "whats_new_model": true, + "PlatformType": "Intel® Xeon®, CPU-only", "Parameters": { "throughput": { "Precisions": [ { - "int4": "", - "int8": 0.69, - "fp16": "", + "int4": 40.17, + "int8": 30.4, + "fp16": 17.19, "fp32": "", "bf16": "" } ], - "Unit": "FPS", + "Unit": "Tokens per sec", "UnitDesc": "higher is better" }, "latency": { "Precisions": [ { - "int4": "", - "int8": "", - "fp16": "", + "int4": 24.89, + "int8": 32.89, + "fp16": 58.17, "fp32": "", "bf16": "" } @@ -8855,31 +8820,31 @@ } }, { - "Platform": "Intel® Core™ i7-1185GRE CPU+iGPU", - "Model": "mobilenet-v2", + "Platform": "Intel® Xeon® Platinum 8380 CPU-only", + "Model": "qwen2-7b", "featured_SKU": false, - "whats_new_model": false, - "PlatformType": "Intel® Core™, CPU+iGPU", + "whats_new_model": true, + "PlatformType": "Intel® Xeon®, CPU-only", "Parameters": { "throughput": { "Precisions": [ { - "int4": "", - "int8": 958.94, - "fp16": "", - "fp32": 350.53, + "int4": 29.5, + "int8": 19.45, + "fp16": 10.14, + "fp32": "", "bf16": "" } ], - "Unit": "FPS", + "Unit": "Tokens per sec", "UnitDesc": "higher is better" }, "latency": { "Precisions": [ { - "int4": "", - "int8": "", - "fp16": "", + "int4": 33.89, + "int8": 51.39, + "fp16": 98.53, "fp32": "", "bf16": "" } @@ -8890,19 +8855,19 @@ } }, { - "Platform": "Intel® Core™ i7-1185GRE CPU+iGPU", + "Platform": "Intel® Xeon® Platinum 8380 CPU-only", "Model": "resnet-50", "featured_SKU": false, "whats_new_model": false, - "PlatformType": "Intel® Core™, CPU+iGPU", + "PlatformType": "Intel® Xeon®, CPU-only", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 230.4, + "int8": 4874.83, "fp16": "", - "fp32": 85.03, + "fp32": 1145.82, "bf16": "" } ], @@ -8913,7 +8878,7 @@ "Precisions": [ { "int4": "", - "int8": "", + "int8": 1.07, "fp16": "", "fp32": "", "bf16": "" @@ -8925,19 +8890,19 @@ } }, { - "Platform": "Intel® Core™ i7-1185GRE CPU+iGPU", + "Platform": "Intel® Xeon® Platinum 8380 CPU-only", "Model": "ssd-resnet34-1200", "featured_SKU": false, "whats_new_model": false, - "PlatformType": "Intel® Core™, CPU+iGPU", + "PlatformType": "Intel® Xeon®, CPU-only", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 4.44, + "int8": 84.49, "fp16": "", - "fp32": 1.75, + "fp32": 20.86, "bf16": "" } ], @@ -8948,7 +8913,7 @@ "Precisions": [ { "int4": "", - "int8": "", + "int8": 26.86, "fp16": "", "fp32": "", 
"bf16": "" @@ -8960,19 +8925,19 @@ } }, { - "Platform": "Intel® Core™ i7-1185GRE CPU+iGPU", + "Platform": "Intel® Xeon® Platinum 8380 CPU-only", "Model": "ssd_mobilenet_v1_coco", "featured_SKU": false, "whats_new_model": false, - "PlatformType": "Intel® Core™, CPU+iGPU", + "PlatformType": "Intel® Xeon®, CPU-only", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 456.16, + "int8": 10144.91, "fp16": "", - "fp32": 162.16, + "fp32": 2524.18, "bf16": "" } ], @@ -8983,7 +8948,7 @@ "Precisions": [ { "int4": "", - "int8": "", + "int8": 0.69, "fp16": "", "fp32": "", "bf16": "" @@ -8995,11 +8960,11 @@ } }, { - "Platform": "Intel® Core™ i7-1185GRE CPU+iGPU", + "Platform": "Intel® Xeon® Platinum 8380 CPU-only", "Model": "yolo11", "featured_SKU": false, - "whats_new_model": "false", - "PlatformType": "Intel® Core™, CPU+iGPU", + "whats_new_model": true, + "PlatformType": "Intel® Xeon®, CPU-only", "Parameters": { "throughput": { "Precisions": [ @@ -9007,7 +8972,7 @@ "int4": "", "int8": "", "fp16": "", - "fp32": 55.98, + "fp32": 803.23, "bf16": "" } ], @@ -9030,19 +8995,19 @@ } }, { - "Platform": "Intel® Core™ i7-1185GRE CPU+iGPU", + "Platform": "Intel® Xeon® Platinum 8380 CPU-only", "Model": "yolo_v8n", "featured_SKU": false, "whats_new_model": false, - "PlatformType": "Intel® Core™, CPU+iGPU", + "PlatformType": "Intel® Xeon®, CPU-only", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 103.63, + "int8": 1701.66, "fp16": "", - "fp32": 53.56, + "fp32": 696.4, "bf16": "" } ], @@ -9053,7 +9018,7 @@ "Precisions": [ { "int4": "", - "int8": "", + "int8": 2.35, "fp16": "", "fp32": "", "bf16": "" @@ -9065,20 +9030,20 @@ } }, { - "Platform": "Intel® Core™ i7-1185GRE CPU-only", + "Platform": "Intel® Xeon® Platinum 8480+ CPU-only", "Model": "bert-base-cased", - "featured_SKU": false, + "featured_SKU": true, "whats_new_model": false, - "PlatformType": "Intel® Core™, CPU-only", + "PlatformType": "Intel® Xeon®, CPU-only", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 38.28, + "int8": 3014.41, "fp16": "", - "fp32": 13.87, - "bf16": "" + "fp32": 482.59, + "bf16": 1959.08 } ], "Unit": "FPS", @@ -9088,10 +9053,10 @@ "Precisions": [ { "int4": "", - "int8": 28.41, + "int8": 3.76, "fp16": "", "fp32": "", - "bf16": "" + "bf16": 4.83 } ], "Unit": "ms", @@ -9100,20 +9065,20 @@ } }, { - "Platform": "Intel® Core™ i7-1185GRE CPU-only", + "Platform": "Intel® Xeon® Platinum 8480+ CPU-only", "Model": "efficientdet-d0", - "featured_SKU": false, + "featured_SKU": true, "whats_new_model": false, - "PlatformType": "Intel® Core™, CPU-only", + "PlatformType": "Intel® Xeon®, CPU-only", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 53.34, + "int8": 1451.22, "fp16": "", - "fp32": 22.26, - "bf16": "" + "fp32": 859.98, + "bf16": 1016.97 } ], "Unit": "FPS", @@ -9123,10 +9088,10 @@ "Precisions": [ { "int4": "", - "int8": 20.12, + "int8": 4.76, "fp16": "", "fp32": "", - "bf16": "" + "bf16": 5.09 } ], "Unit": "ms", @@ -9135,31 +9100,31 @@ } }, { - "Platform": "Intel® Core™ i7-1185GRE CPU-only", - "Model": "mask_rcnn_resnet50_atrous_coco", - "featured_SKU": false, - "whats_new_model": false, - "PlatformType": "Intel® Core™, CPU-only", + "Platform": "Intel® Xeon® Platinum 8480+ CPU-only", + "Model": "gemma-2-9b", + "featured_SKU": true, + "whats_new_model": true, + "PlatformType": "Intel® Xeon®, CPU-only", "Parameters": { "throughput": { "Precisions": [ { - "int4": "", - "int8": 0.52, - "fp16": "", - "fp32": 0.14, + "int4": 22.73, + "int8": 16.68, + 
"fp16": 10.79, + "fp32": "", "bf16": "" } ], - "Unit": "FPS", + "Unit": "Tokens per sec", "UnitDesc": "higher is better" }, "latency": { "Precisions": [ { - "int4": "", - "int8": 1805.69, - "fp16": "", + "int4": 43.98, + "int8": 59.92, + "fp16": 92.62, "fp32": "", "bf16": "" } @@ -9170,31 +9135,31 @@ } }, { - "Platform": "Intel® Core™ i7-1185GRE CPU-only", - "Model": "mobilenet-v2", - "featured_SKU": false, - "whats_new_model": false, - "PlatformType": "Intel® Core™, CPU-only", + "Platform": "Intel® Xeon® Platinum 8480+ CPU-only", + "Model": "glm-4-9b-chat", + "featured_SKU": true, + "whats_new_model": "false", + "PlatformType": "Intel® Xeon®, CPU-only", "Parameters": { "throughput": { "Precisions": [ { - "int4": "", - "int8": 972.25, - "fp16": "", - "fp32": 311.82, + "int4": 23.18, + "int8": 16.86, + "fp16": 11.29, + "fp32": "", "bf16": "" } ], - "Unit": "FPS", + "Unit": "Tokens per sec", "UnitDesc": "higher is better" }, "latency": { "Precisions": [ { - "int4": "", - "int8": 1.2, - "fp16": "", + "int4": 43.14, + "int8": 59.31, + "fp16": 88.5, "fp32": "", "bf16": "" } @@ -9205,31 +9170,31 @@ } }, { - "Platform": "Intel® Core™ i7-1185GRE CPU-only", - "Model": "resnet-50", - "featured_SKU": false, + "Platform": "Intel® Xeon® Platinum 8480+ CPU-only", + "Model": "llama-2-7b-chat", + "featured_SKU": true, "whats_new_model": false, - "PlatformType": "Intel® Core™, CPU-only", + "PlatformType": "Intel® Xeon®, CPU-only", "Parameters": { "throughput": { "Precisions": [ { - "int4": "", - "int8": 174.69, - "fp16": "", - "fp32": 45.52, + "int4": 28.22, + "int8": 20.58, + "fp16": 14.55, + "fp32": "", "bf16": "" } ], - "Unit": "FPS", + "Unit": "Tokens per sec", "UnitDesc": "higher is better" }, "latency": { "Precisions": [ { - "int4": "", - "int8": 6.4, - "fp16": "", + "int4": 35.43, + "int8": 48.58, + "fp16": 68.7, "fp32": "", "bf16": "" } @@ -9240,31 +9205,31 @@ } }, { - "Platform": "Intel® Core™ i7-1185GRE CPU-only", - "Model": "ssd-resnet34-1200", - "featured_SKU": false, - "whats_new_model": false, - "PlatformType": "Intel® Core™, CPU-only", + "Platform": "Intel® Xeon® Platinum 8480+ CPU-only", + "Model": "llama-3-8b", + "featured_SKU": true, + "whats_new_model": "false", + "PlatformType": "Intel® Xeon®, CPU-only", "Parameters": { "throughput": { "Precisions": [ { - "int4": "", - "int8": 2.72, - "fp16": "", - "fp32": 0.78, + "int4": 26.27, + "int8": 19.11, + "fp16": 13.32, + "fp32": "", "bf16": "" } ], - "Unit": "FPS", + "Unit": "Tokens per sec", "UnitDesc": "higher is better" }, "latency": { "Precisions": [ { - "int4": "", - "int8": 335.04, - "fp16": "", + "int4": 38.06, + "int8": 52.31, + "fp16": 75.04, "fp32": "", "bf16": "" } @@ -9275,31 +9240,31 @@ } }, { - "Platform": "Intel® Core™ i7-1185GRE CPU-only", - "Model": "ssd_mobilenet_v1_coco", - "featured_SKU": false, - "whats_new_model": false, - "PlatformType": "Intel® Core™, CPU-only", + "Platform": "Intel® Xeon® Platinum 8480+ CPU-only", + "Model": "llama-3.2-3b-instruct", + "featured_SKU": true, + "whats_new_model": true, + "PlatformType": "Intel® Xeon®, CPU-only", "Parameters": { "throughput": { "Precisions": [ { - "int4": "", - "int8": 386.67, - "fp16": "", - "fp32": 99.8, + "int4": 48.35, + "int8": 38.34, + "fp16": 27.94, + "fp32": "", "bf16": "" } ], - "Unit": "FPS", + "Unit": "Tokens per sec", "UnitDesc": "higher is better" }, "latency": { "Precisions": [ { - "int4": "", - "int8": 2.82, - "fp16": "", + "int4": 20.68, + "int8": 26.08, + "fp16": 35.79, "fp32": "", "bf16": "" } @@ -9310,20 +9275,20 @@ } }, { - "Platform": "Intel® Core™ 
i7-1185GRE CPU-only", - "Model": "yolo11", - "featured_SKU": false, - "whats_new_model": "false", - "PlatformType": "Intel® Core™, CPU-only", + "Platform": "Intel® Xeon® Platinum 8480+ CPU-only", + "Model": "mask_rcnn_resnet50_atrous_coco", + "featured_SKU": true, + "whats_new_model": false, + "PlatformType": "Intel® Xeon®, CPU-only", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": "", + "int8": 61.27, "fp16": "", - "fp32": 32.19, - "bf16": "" + "fp32": 5.19, + "bf16": 37.47 } ], "Unit": "FPS", @@ -9333,10 +9298,10 @@ "Precisions": [ { "int4": "", - "int8": "", + "int8": 60.02, "fp16": "", "fp32": "", - "bf16": "" + "bf16": 81.99 } ], "Unit": "ms", @@ -9345,31 +9310,31 @@ } }, { - "Platform": "Intel® Core™ i7-1185GRE CPU-only", - "Model": "yolo_v8n", - "featured_SKU": false, + "Platform": "Intel® Xeon® Platinum 8480+ CPU-only", + "Model": "mistral-7b-v0.1", + "featured_SKU": true, "whats_new_model": false, - "PlatformType": "Intel® Core™, CPU-only", + "PlatformType": "Intel® Xeon®, CPU-only", "Parameters": { "throughput": { "Precisions": [ { - "int4": "", - "int8": 76.54, - "fp16": "", - "fp32": 27.6, + "int4": 28.78, + "int8": 20.01, + "fp16": 14.07, + "fp32": "", "bf16": "" } ], - "Unit": "FPS", + "Unit": "Tokens per sec", "UnitDesc": "higher is better" }, "latency": { "Precisions": [ { - "int4": "", - "int8": 13.2, - "fp16": "", + "int4": 34.74, + "int8": 49.96, + "fp16": 71.05, "fp32": "", "bf16": "" } @@ -9380,20 +9345,20 @@ } }, { - "Platform": "Intel® Core™ i7-1185GRE iGPU-only", - "Model": "bert-base-cased", - "featured_SKU": false, + "Platform": "Intel® Xeon® Platinum 8480+ CPU-only", + "Model": "mobilenet-v2", + "featured_SKU": true, "whats_new_model": false, - "PlatformType": "Intel® Core™, iGPU-only", + "PlatformType": "Intel® Xeon®, CPU-only", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 45.77, - "fp16": 40.93, - "fp32": "", - "bf16": "" + "int8": 38249.27, + "fp16": "", + "fp32": 10231.48, + "bf16": 25384.03 } ], "Unit": "FPS", @@ -9403,10 +9368,10 @@ "Precisions": [ { "int4": "", - "int8": 21.21, + "int8": 0.66, "fp16": "", "fp32": "", - "bf16": "" + "bf16": 0.67 } ], "Unit": "ms", @@ -9415,31 +9380,31 @@ } }, { - "Platform": "Intel® Core™ i7-1185GRE iGPU-only", - "Model": "efficientdet-d0", - "featured_SKU": false, - "whats_new_model": false, - "PlatformType": "Intel® Core™, iGPU-only", + "Platform": "Intel® Xeon® Platinum 8480+ CPU-only", + "Model": "phi-3-mini-4k-instruct", + "featured_SKU": true, + "whats_new_model": true, + "PlatformType": "Intel® Xeon®, CPU-only", "Parameters": { "throughput": { "Precisions": [ { - "int4": "", - "int8": 56.2, - "fp16": 41.8, + "int4": 42.19, + "int8": 35.39, + "fp16": 23.71, "fp32": "", "bf16": "" } ], - "Unit": "FPS", + "Unit": "Tokens per sec", "UnitDesc": "higher is better" }, "latency": { "Precisions": [ { - "int4": "", - "int8": 23.38, - "fp16": "", + "int4": 23.7, + "int8": 28.25, + "fp16": 42.17, "fp32": "", "bf16": "" } @@ -9450,31 +9415,31 @@ } }, { - "Platform": "Intel® Core™ i7-1185GRE iGPU-only", - "Model": "mask_rcnn_resnet50_atrous_coco", - "featured_SKU": false, - "whats_new_model": false, - "PlatformType": "Intel® Core™, iGPU-only", + "Platform": "Intel® Xeon® Platinum 8480+ CPU-only", + "Model": "qwen2-7b", + "featured_SKU": true, + "whats_new_model": true, + "PlatformType": "Intel® Xeon®, CPU-only", "Parameters": { "throughput": { "Precisions": [ { - "int4": "", - "int8": 0.56, - "fp16": 0.54, + "int4": 31.0, + "int8": 22.59, + "fp16": 14.26, "fp32": "", "bf16": "" } 
], - "Unit": "FPS", + "Unit": "Tokens per sec", "UnitDesc": "higher is better" }, "latency": { "Precisions": [ { - "int4": "", - "int8": 1606.31, - "fp16": "", + "int4": 32.25, + "int8": 44.25, + "fp16": 70.1, "fp32": "", "bf16": "" } @@ -9485,195 +9450,20 @@ } }, { - "Platform": "Intel® Core™ i7-1185GRE iGPU-only", - "Model": "mobilenet-v2", - "featured_SKU": false, - "whats_new_model": false, - "PlatformType": "Intel® Core™, iGPU-only", - "Parameters": { - "throughput": { - "Precisions": [ - { - "int4": "", - "int8": 648.66, - "fp16": 431.47, - "fp32": "", - "bf16": "" - } - ], - "Unit": "FPS", - "UnitDesc": "higher is better" - }, - "latency": { - "Precisions": [ - { - "int4": "", - "int8": 1.76, - "fp16": "", - "fp32": "", - "bf16": "" - } - ], - "Unit": "ms", - "UnitDesc": "lower is better" - } - } - }, - { - "Platform": "Intel® Core™ i7-1185GRE iGPU-only", + "Platform": "Intel® Xeon® Platinum 8480+ CPU-only", "Model": "resnet-50", - "featured_SKU": false, - "whats_new_model": false, - "PlatformType": "Intel® Core™, iGPU-only", - "Parameters": { - "throughput": { - "Precisions": [ - { - "int4": "", - "int8": 208.21, - "fp16": 122.24, - "fp32": "", - "bf16": "" - } - ], - "Unit": "FPS", - "UnitDesc": "higher is better" - }, - "latency": { - "Precisions": [ - { - "int4": "", - "int8": 5.47, - "fp16": "", - "fp32": "", - "bf16": "" - } - ], - "Unit": "ms", - "UnitDesc": "lower is better" - } - } - }, - { - "Platform": "Intel® Core™ i7-1185GRE iGPU-only", - "Model": "ssd-resnet34-1200", - "featured_SKU": false, - "whats_new_model": false, - "PlatformType": "Intel® Core™, iGPU-only", - "Parameters": { - "throughput": { - "Precisions": [ - { - "int4": "", - "int8": 5.71, - "fp16": 3.09, - "fp32": "", - "bf16": "" - } - ], - "Unit": "FPS", - "UnitDesc": "higher is better" - }, - "latency": { - "Precisions": [ - { - "int4": "", - "int8": 173.5, - "fp16": "", - "fp32": "", - "bf16": "" - } - ], - "Unit": "ms", - "UnitDesc": "lower is better" - } - } - }, - { - "Platform": "Intel® Core™ i7-1185GRE iGPU-only", - "Model": "ssd_mobilenet_v1_coco", - "featured_SKU": false, - "whats_new_model": false, - "PlatformType": "Intel® Core™, iGPU-only", - "Parameters": { - "throughput": { - "Precisions": [ - { - "int4": "", - "int8": 348.95, - "fp16": 224.45, - "fp32": "", - "bf16": "" - } - ], - "Unit": "FPS", - "UnitDesc": "higher is better" - }, - "latency": { - "Precisions": [ - { - "int4": "", - "int8": 3.56, - "fp16": "", - "fp32": "", - "bf16": "" - } - ], - "Unit": "ms", - "UnitDesc": "lower is better" - } - } - }, - { - "Platform": "Intel® Core™ i7-1185GRE iGPU-only", - "Model": "yolo_v8n", - "featured_SKU": false, - "whats_new_model": false, - "PlatformType": "Intel® Core™, iGPU-only", - "Parameters": { - "throughput": { - "Precisions": [ - { - "int4": "", - "int8": 113.89, - "fp16": 78.71, - "fp32": "", - "bf16": "" - } - ], - "Unit": "FPS", - "UnitDesc": "higher is better" - }, - "latency": { - "Precisions": [ - { - "int4": "", - "int8": 9.49, - "fp16": "", - "fp32": "", - "bf16": "" - } - ], - "Unit": "ms", - "UnitDesc": "lower is better" - } - } - }, - { - "Platform": "Intel® Core™ i7-12700H CPU+iGPU", - "Model": "bert-base-cased", - "featured_SKU": false, + "featured_SKU": true, "whats_new_model": false, - "PlatformType": "Intel® Core™, CPU+iGPU", + "PlatformType": "Intel® Xeon®, CPU-only", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 111.58, + "int8": 19160.66, "fp16": "", - "fp32": 57.55, - "bf16": "" + "fp32": 1591.64, + "bf16": 7474.81 } ], "Unit": "FPS", @@ 
-9683,10 +9473,10 @@ "Precisions": [ { "int4": "", - "int8": "", + "int8": 1.01, "fp16": "", "fp32": "", - "bf16": "" + "bf16": 1.26 } ], "Unit": "ms", @@ -9695,20 +9485,20 @@ } }, { - "Platform": "Intel® Core™ i7-12700H CPU+iGPU", - "Model": "efficientdet-d0", - "featured_SKU": false, + "Platform": "Intel® Xeon® Platinum 8480+ CPU-only", + "Model": "ssd-resnet34-1200", + "featured_SKU": true, "whats_new_model": false, - "PlatformType": "Intel® Core™, CPU+iGPU", + "PlatformType": "Intel® Xeon®, CPU-only", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 141.13, + "int8": 432.65, "fp16": "", - "fp32": 75.23, - "bf16": "" + "fp32": 30.54, + "bf16": 208.46 } ], "Unit": "FPS", @@ -9718,10 +9508,10 @@ "Precisions": [ { "int4": "", - "int8": "", + "int8": 8.99, "fp16": "", "fp32": "", - "bf16": "" + "bf16": 15.32 } ], "Unit": "ms", @@ -9730,20 +9520,20 @@ } }, { - "Platform": "Intel® Core™ i7-12700H CPU+iGPU", - "Model": "mask_rcnn_resnet50_atrous_coco", - "featured_SKU": false, + "Platform": "Intel® Xeon® Platinum 8480+ CPU-only", + "Model": "ssd_mobilenet_v1_coco", + "featured_SKU": true, "whats_new_model": false, - "PlatformType": "Intel® Core™, CPU+iGPU", + "PlatformType": "Intel® Xeon®, CPU-only", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 1.63, + "int8": 24068.49, "fp16": "", - "fp32": 0.68, - "bf16": "" + "fp32": 3408.57, + "bf16": 12163.6 } ], "Unit": "FPS", @@ -9753,10 +9543,10 @@ "Precisions": [ { "int4": "", - "int8": "", + "int8": 0.76, "fp16": "", "fp32": "", - "bf16": "" + "bf16": 0.9 } ], "Unit": "ms", @@ -9765,55 +9555,20 @@ } }, { - "Platform": "Intel® Core™ i7-12700H CPU+iGPU", - "Model": "mobilenet-v2", - "featured_SKU": false, - "whats_new_model": false, - "PlatformType": "Intel® Core™, CPU+iGPU", + "Platform": "Intel® Xeon® Platinum 8480+ CPU-only", + "Model": "yolo11", + "featured_SKU": true, + "whats_new_model": true, + "PlatformType": "Intel® Xeon®, CPU-only", "Parameters": { "throughput": { - "Precisions": [ - { - "int4": "", - "int8": 2287.47, - "fp16": "", - "fp32": 1150.08, - "bf16": "" - } - ], - "Unit": "FPS", - "UnitDesc": "higher is better" - }, - "latency": { "Precisions": [ { "int4": "", "int8": "", "fp16": "", - "fp32": "", - "bf16": "" - } - ], - "Unit": "ms", - "UnitDesc": "lower is better" - } - } - }, - { - "Platform": "Intel® Core™ i7-12700H CPU+iGPU", - "Model": "resnet-50", - "featured_SKU": false, - "whats_new_model": false, - "PlatformType": "Intel® Core™, CPU+iGPU", - "Parameters": { - "throughput": { - "Precisions": [ - { - "int4": "", - "int8": 532.56, - "fp16": "", - "fp32": 180.65, - "bf16": "" + "fp32": 1039.51, + "bf16": 2046.23 } ], "Unit": "FPS", @@ -9826,7 +9581,7 @@ "int8": "", "fp16": "", "fp32": "", - "bf16": "" + "bf16": 2.93 } ], "Unit": "ms", @@ -9835,755 +9590,20 @@ } }, { - "Platform": "Intel® Core™ i7-12700H CPU+iGPU", - "Model": "ssd-resnet34-1200", - "featured_SKU": false, - "whats_new_model": false, - "PlatformType": "Intel® Core™, CPU+iGPU", - "Parameters": { - "throughput": { - "Precisions": [ - { - "int4": "", - "int8": 10.33, - "fp16": "", - "fp32": 3.81, - "bf16": "" - } - ], - "Unit": "FPS", - "UnitDesc": "higher is better" - }, - "latency": { - "Precisions": [ - { - "int4": "", - "int8": "", - "fp16": "", - "fp32": "", - "bf16": "" - } - ], - "Unit": "ms", - "UnitDesc": "lower is better" - } - } - }, - { - "Platform": "Intel® Core™ i7-12700H CPU+iGPU", - "Model": "ssd_mobilenet_v1_coco", - "featured_SKU": false, - "whats_new_model": false, - "PlatformType": "Intel® Core™, 
CPU+iGPU", - "Parameters": { - "throughput": { - "Precisions": [ - { - "int4": "", - "int8": 1013.57, - "fp16": "", - "fp32": 403.5, - "bf16": "" - } - ], - "Unit": "FPS", - "UnitDesc": "higher is better" - }, - "latency": { - "Precisions": [ - { - "int4": "", - "int8": "", - "fp16": "", - "fp32": "", - "bf16": "" - } - ], - "Unit": "ms", - "UnitDesc": "lower is better" - } - } - }, - { - "Platform": "Intel® Core™ i7-12700H CPU+iGPU", - "Model": "yolo11", - "featured_SKU": false, - "whats_new_model": "false", - "PlatformType": "Intel® Core™, CPU+iGPU", - "Parameters": { - "throughput": { - "Precisions": [ - { - "int4": "", - "int8": "", - "fp16": "", - "fp32": 133.88, - "bf16": "" - } - ], - "Unit": "FPS", - "UnitDesc": "higher is better" - }, - "latency": { - "Precisions": [ - { - "int4": "", - "int8": "", - "fp16": "", - "fp32": "", - "bf16": "" - } - ], - "Unit": "ms", - "UnitDesc": "lower is better" - } - } - }, - { - "Platform": "Intel® Core™ i7-12700H CPU+iGPU", - "Model": "yolo_v8n", - "featured_SKU": false, - "whats_new_model": false, - "PlatformType": "Intel® Core™, CPU+iGPU", - "Parameters": { - "throughput": { - "Precisions": [ - { - "int4": "", - "int8": 268.57, - "fp16": "", - "fp32": 120.55, - "bf16": "" - } - ], - "Unit": "FPS", - "UnitDesc": "higher is better" - }, - "latency": { - "Precisions": [ - { - "int4": "", - "int8": "", - "fp16": "", - "fp32": "", - "bf16": "" - } - ], - "Unit": "ms", - "UnitDesc": "lower is better" - } - } - }, - { - "Platform": "Intel® Core™ i7-12700H CPU-only", - "Model": "bert-base-cased", - "featured_SKU": false, - "whats_new_model": false, - "PlatformType": "Intel® Core™, CPU-only", - "Parameters": { - "throughput": { - "Precisions": [ - { - "int4": "", - "int8": 87.88, - "fp16": "", - "fp32": 34.76, - "bf16": "" - } - ], - "Unit": "FPS", - "UnitDesc": "higher is better" - }, - "latency": { - "Precisions": [ - { - "int4": "", - "int8": 16.26, - "fp16": "", - "fp32": "", - "bf16": "" - } - ], - "Unit": "ms", - "UnitDesc": "lower is better" - } - } - }, - { - "Platform": "Intel® Core™ i7-12700H CPU-only", - "Model": "efficientdet-d0", - "featured_SKU": false, - "whats_new_model": false, - "PlatformType": "Intel® Core™, CPU-only", - "Parameters": { - "throughput": { - "Precisions": [ - { - "int4": "", - "int8": 113.82, - "fp16": "", - "fp32": 62.45, - "bf16": "" - } - ], - "Unit": "FPS", - "UnitDesc": "higher is better" - }, - "latency": { - "Precisions": [ - { - "int4": "", - "int8": 11.46, - "fp16": "", - "fp32": "", - "bf16": "" - } - ], - "Unit": "ms", - "UnitDesc": "lower is better" - } - } - }, - { - "Platform": "Intel® Core™ i7-12700H CPU-only", - "Model": "mask_rcnn_resnet50_atrous_coco", - "featured_SKU": false, - "whats_new_model": false, - "PlatformType": "Intel® Core™, CPU-only", - "Parameters": { - "throughput": { - "Precisions": [ - { - "int4": "", - "int8": 1.27, - "fp16": "", - "fp32": 0.36, - "bf16": "" - } - ], - "Unit": "FPS", - "UnitDesc": "higher is better" - }, - "latency": { - "Precisions": [ - { - "int4": "", - "int8": 886.78, - "fp16": "", - "fp32": "", - "bf16": "" - } - ], - "Unit": "ms", - "UnitDesc": "lower is better" - } - } - }, - { - "Platform": "Intel® Core™ i7-12700H CPU-only", - "Model": "mobilenet-v2", - "featured_SKU": false, - "whats_new_model": false, - "PlatformType": "Intel® Core™, CPU-only", - "Parameters": { - "throughput": { - "Precisions": [ - { - "int4": "", - "int8": 1982.75, - "fp16": "", - "fp32": 968.72, - "bf16": "" - } - ], - "Unit": "FPS", - "UnitDesc": "higher is better" - }, - "latency": { - 
"Precisions": [ - { - "int4": "", - "int8": 0.89, - "fp16": "", - "fp32": "", - "bf16": "" - } - ], - "Unit": "ms", - "UnitDesc": "lower is better" - } - } - }, - { - "Platform": "Intel® Core™ i7-12700H CPU-only", - "Model": "resnet-50", - "featured_SKU": false, - "whats_new_model": false, - "PlatformType": "Intel® Core™, CPU-only", - "Parameters": { - "throughput": { - "Precisions": [ - { - "int4": "", - "int8": 429.58, - "fp16": "", - "fp32": 107.58, - "bf16": "" - } - ], - "Unit": "FPS", - "UnitDesc": "higher is better" - }, - "latency": { - "Precisions": [ - { - "int4": "", - "int8": 3.47, - "fp16": "", - "fp32": "", - "bf16": "" - } - ], - "Unit": "ms", - "UnitDesc": "lower is better" - } - } - }, - { - "Platform": "Intel® Core™ i7-12700H CPU-only", - "Model": "ssd-resnet34-1200", - "featured_SKU": false, - "whats_new_model": false, - "PlatformType": "Intel® Core™, CPU-only", - "Parameters": { - "throughput": { - "Precisions": [ - { - "int4": "", - "int8": 7.11, - "fp16": "", - "fp32": 1.96, - "bf16": "" - } - ], - "Unit": "FPS", - "UnitDesc": "higher is better" - }, - "latency": { - "Precisions": [ - { - "int4": "", - "int8": 159.25, - "fp16": "", - "fp32": "", - "bf16": "" - } - ], - "Unit": "ms", - "UnitDesc": "lower is better" - } - } - }, - { - "Platform": "Intel® Core™ i7-12700H CPU-only", - "Model": "ssd_mobilenet_v1_coco", - "featured_SKU": false, - "whats_new_model": false, - "PlatformType": "Intel® Core™, CPU-only", - "Parameters": { - "throughput": { - "Precisions": [ - { - "int4": "", - "int8": 854.13, - "fp16": "", - "fp32": 289.32, - "bf16": "" - } - ], - "Unit": "FPS", - "UnitDesc": "higher is better" - }, - "latency": { - "Precisions": [ - { - "int4": "", - "int8": 1.72, - "fp16": "", - "fp32": "", - "bf16": "" - } - ], - "Unit": "ms", - "UnitDesc": "lower is better" - } - } - }, - { - "Platform": "Intel® Core™ i7-12700H CPU-only", - "Model": "yolo11", - "featured_SKU": false, - "whats_new_model": "false", - "PlatformType": "Intel® Core™, CPU-only", - "Parameters": { - "throughput": { - "Precisions": [ - { - "int4": "", - "int8": "", - "fp16": "", - "fp32": 90.72, - "bf16": "" - } - ], - "Unit": "FPS", - "UnitDesc": "higher is better" - }, - "latency": { - "Precisions": [ - { - "int4": "", - "int8": "", - "fp16": "", - "fp32": "", - "bf16": "" - } - ], - "Unit": "ms", - "UnitDesc": "lower is better" - } - } - }, - { - "Platform": "Intel® Core™ i7-12700H CPU-only", - "Model": "yolo_v8n", - "featured_SKU": false, - "whats_new_model": false, - "PlatformType": "Intel® Core™, CPU-only", - "Parameters": { - "throughput": { - "Precisions": [ - { - "int4": "", - "int8": 206.32, - "fp16": "", - "fp32": 78.09, - "bf16": "" - } - ], - "Unit": "FPS", - "UnitDesc": "higher is better" - }, - "latency": { - "Precisions": [ - { - "int4": "", - "int8": 6.49, - "fp16": "", - "fp32": "", - "bf16": "" - } - ], - "Unit": "ms", - "UnitDesc": "lower is better" - } - } - }, - { - "Platform": "Intel® Core™ i7-12700H iGPU-only", - "Model": "bert-base-cased", - "featured_SKU": false, - "whats_new_model": false, - "PlatformType": "Intel® Core™, iGPU-only", - "Parameters": { - "throughput": { - "Precisions": [ - { - "int4": "", - "int8": 89.81, - "fp16": 69.99, - "fp32": "", - "bf16": "" - } - ], - "Unit": "FPS", - "UnitDesc": "higher is better" - }, - "latency": { - "Precisions": [ - { - "int4": "", - "int8": 12.71, - "fp16": "", - "fp32": "", - "bf16": "" - } - ], - "Unit": "ms", - "UnitDesc": "lower is better" - } - } - }, - { - "Platform": "Intel® Core™ i7-12700H iGPU-only", - "Model": 
"efficientdet-d0", - "featured_SKU": false, - "whats_new_model": false, - "PlatformType": "Intel® Core™, iGPU-only", - "Parameters": { - "throughput": { - "Precisions": [ - { - "int4": "", - "int8": 128.07, - "fp16": 97.39, - "fp32": "", - "bf16": "" - } - ], - "Unit": "FPS", - "UnitDesc": "higher is better" - }, - "latency": { - "Precisions": [ - { - "int4": "", - "int8": 12.87, - "fp16": "", - "fp32": "", - "bf16": "" - } - ], - "Unit": "ms", - "UnitDesc": "lower is better" - } - } - }, - { - "Platform": "Intel® Core™ i7-12700H iGPU-only", - "Model": "mask_rcnn_resnet50_atrous_coco", - "featured_SKU": false, - "whats_new_model": false, - "PlatformType": "Intel® Core™, iGPU-only", - "Parameters": { - "throughput": { - "Precisions": [ - { - "int4": "", - "int8": 1.04, - "fp16": 1.15, - "fp32": "", - "bf16": "" - } - ], - "Unit": "FPS", - "UnitDesc": "higher is better" - }, - "latency": { - "Precisions": [ - { - "int4": "", - "int8": 972.87, - "fp16": "", - "fp32": "", - "bf16": "" - } - ], - "Unit": "ms", - "UnitDesc": "lower is better" - } - } - }, - { - "Platform": "Intel® Core™ i7-12700H iGPU-only", - "Model": "mobilenet-v2", - "featured_SKU": false, - "whats_new_model": false, - "PlatformType": "Intel® Core™, iGPU-only", - "Parameters": { - "throughput": { - "Precisions": [ - { - "int4": "", - "int8": 1281.93, - "fp16": 912.69, - "fp32": "", - "bf16": "" - } - ], - "Unit": "FPS", - "UnitDesc": "higher is better" - }, - "latency": { - "Precisions": [ - { - "int4": "", - "int8": 1.08, - "fp16": "", - "fp32": "", - "bf16": "" - } - ], - "Unit": "ms", - "UnitDesc": "lower is better" - } - } - }, - { - "Platform": "Intel® Core™ i7-12700H iGPU-only", - "Model": "resnet-50", - "featured_SKU": false, - "whats_new_model": false, - "PlatformType": "Intel® Core™, iGPU-only", - "Parameters": { - "throughput": { - "Precisions": [ - { - "int4": "", - "int8": 381.27, - "fp16": 226.42, - "fp32": "", - "bf16": "" - } - ], - "Unit": "FPS", - "UnitDesc": "higher is better" - }, - "latency": { - "Precisions": [ - { - "int4": "", - "int8": 3.22, - "fp16": "", - "fp32": "", - "bf16": "" - } - ], - "Unit": "ms", - "UnitDesc": "lower is better" - } - } - }, - { - "Platform": "Intel® Core™ i7-12700H iGPU-only", - "Model": "ssd-resnet34-1200", - "featured_SKU": false, - "whats_new_model": false, - "PlatformType": "Intel® Core™, iGPU-only", - "Parameters": { - "throughput": { - "Precisions": [ - { - "int4": "", - "int8": 10.47, - "fp16": 6.14, - "fp32": "", - "bf16": "" - } - ], - "Unit": "FPS", - "UnitDesc": "higher is better" - }, - "latency": { - "Precisions": [ - { - "int4": "", - "int8": 100.17, - "fp16": "", - "fp32": "", - "bf16": "" - } - ], - "Unit": "ms", - "UnitDesc": "lower is better" - } - } - }, - { - "Platform": "Intel® Core™ i7-12700H iGPU-only", - "Model": "ssd_mobilenet_v1_coco", - "featured_SKU": false, - "whats_new_model": false, - "PlatformType": "Intel® Core™, iGPU-only", - "Parameters": { - "throughput": { - "Precisions": [ - { - "int4": "", - "int8": 744.92, - "fp16": 407.72, - "fp32": "", - "bf16": "" - } - ], - "Unit": "FPS", - "UnitDesc": "higher is better" - }, - "latency": { - "Precisions": [ - { - "int4": "", - "int8": 1.87, - "fp16": "", - "fp32": "", - "bf16": "" - } - ], - "Unit": "ms", - "UnitDesc": "lower is better" - } - } - }, - { - "Platform": "Intel® Core™ i7-12700H iGPU-only", + "Platform": "Intel® Xeon® Platinum 8480+ CPU-only", "Model": "yolo_v8n", - "featured_SKU": false, - "whats_new_model": false, - "PlatformType": "Intel® Core™, iGPU-only", - "Parameters": { - 
"throughput": { - "Precisions": [ - { - "int4": "", - "int8": 215.67, - "fp16": 148.01, - "fp32": "", - "bf16": "" - } - ], - "Unit": "FPS", - "UnitDesc": "higher is better" - }, - "latency": { - "Precisions": [ - { - "int4": "", - "int8": 5.58, - "fp16": "", - "fp32": "", - "bf16": "" - } - ], - "Unit": "ms", - "UnitDesc": "lower is better" - } - } - }, - { - "Platform": "Intel® Core™ i7-1355U Processor CPU+iGPU", - "Model": "bert-base-cased", - "featured_SKU": false, + "featured_SKU": true, "whats_new_model": false, - "PlatformType": "Intel® Core™, CPU+iGPU", + "PlatformType": "Intel® Xeon®, CPU-only", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 61.33, + "int8": 2379.54, "fp16": "", - "fp32": 32.27, - "bf16": "" + "fp32": 948.88, + "bf16": 2377.59 } ], "Unit": "FPS", @@ -10593,10 +9613,10 @@ "Precisions": [ { "int4": "", - "int8": "", + "int8": 3.15, "fp16": "", "fp32": "", - "bf16": "" + "bf16": 2.54 } ], "Unit": "ms", @@ -10605,20 +9625,20 @@ } }, { - "Platform": "Intel® Core™ i7-1355U Processor CPU+iGPU", - "Model": "efficientdet-d0", - "featured_SKU": false, + "Platform": "Intel® Xeon® Platinum 8580 CPU-only", + "Model": "bert-base-cased", + "featured_SKU": true, "whats_new_model": false, - "PlatformType": "Intel® Core™, CPU+iGPU", + "PlatformType": "Intel® Xeon®, CPU-only", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 88.48, + "int8": 4674.83, "fp16": "", - "fp32": 59.03, - "bf16": "" + "fp32": 560.52, + "bf16": 3250.44 } ], "Unit": "FPS", @@ -10628,10 +9648,10 @@ "Precisions": [ { "int4": "", - "int8": "", + "int8": 3.63, "fp16": "", "fp32": "", - "bf16": "" + "bf16": 4.47 } ], "Unit": "ms", @@ -10640,20 +9660,20 @@ } }, { - "Platform": "Intel® Core™ i7-1355U Processor CPU+iGPU", - "Model": "mask_rcnn_resnet50_atrous_coco", - "featured_SKU": false, + "Platform": "Intel® Xeon® Platinum 8580 CPU-only", + "Model": "efficientdet-d0", + "featured_SKU": true, "whats_new_model": false, - "PlatformType": "Intel® Core™, CPU+iGPU", + "PlatformType": "Intel® Xeon®, CPU-only", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 0.81, + "int8": 1730.17, "fp16": "", - "fp32": 0.43, - "bf16": "" + "fp32": 1134.1, + "bf16": 1410.09 } ], "Unit": "FPS", @@ -10663,10 +9683,10 @@ "Precisions": [ { "int4": "", - "int8": "", + "int8": 4.81, "fp16": "", "fp32": "", - "bf16": "" + "bf16": 4.6 } ], "Unit": "ms", @@ -10675,31 +9695,31 @@ } }, { - "Platform": "Intel® Core™ i7-1355U Processor CPU+iGPU", - "Model": "mobilenet-v2", - "featured_SKU": false, - "whats_new_model": false, - "PlatformType": "Intel® Core™, CPU+iGPU", + "Platform": "Intel® Xeon® Platinum 8580 CPU-only", + "Model": "gemma-2-9b", + "featured_SKU": true, + "whats_new_model": true, + "PlatformType": "Intel® Xeon®, CPU-only", "Parameters": { "throughput": { "Precisions": [ { - "int4": "", - "int8": 1218.37, - "fp16": "", - "fp32": 644.91, + "int4": 26.7, + "int8": 19.39, + "fp16": 12.28, + "fp32": "", "bf16": "" } ], - "Unit": "FPS", + "Unit": "Tokens per sec", "UnitDesc": "higher is better" }, "latency": { "Precisions": [ { - "int4": "", - "int8": "", - "fp16": "", + "int4": 37.45, + "int8": 51.57, + "fp16": 81.41, "fp32": "", "bf16": "" } @@ -10710,31 +9730,31 @@ } }, { - "Platform": "Intel® Core™ i7-1355U Processor CPU+iGPU", - "Model": "resnet-50", - "featured_SKU": false, - "whats_new_model": false, - "PlatformType": "Intel® Core™, CPU+iGPU", + "Platform": "Intel® Xeon® Platinum 8580 CPU-only", + "Model": "glm-4-9b-chat", + "featured_SKU": true, + 
"whats_new_model": "false", + "PlatformType": "Intel® Xeon®, CPU-only", "Parameters": { "throughput": { "Precisions": [ { - "int4": "", - "int8": 284.91, - "fp16": "", - "fp32": 109.93, + "int4": 27.89, + "int8": 19.7, + "fp16": 12.99, + "fp32": "", "bf16": "" } ], - "Unit": "FPS", + "Unit": "Tokens per sec", "UnitDesc": "higher is better" }, "latency": { "Precisions": [ { - "int4": "", - "int8": "", - "fp16": "", + "int4": 35.85, + "int8": 50.74, + "fp16": 76.94, "fp32": "", "bf16": "" } @@ -10745,31 +9765,31 @@ } }, { - "Platform": "Intel® Core™ i7-1355U Processor CPU+iGPU", - "Model": "ssd-resnet34-1200", - "featured_SKU": false, + "Platform": "Intel® Xeon® Platinum 8580 CPU-only", + "Model": "llama-2-7b-chat", + "featured_SKU": true, "whats_new_model": false, - "PlatformType": "Intel® Core™, CPU+iGPU", + "PlatformType": "Intel® Xeon®, CPU-only", "Parameters": { "throughput": { "Precisions": [ { - "int4": "", - "int8": 5.67, - "fp16": "", - "fp32": 2.15, + "int4": 34.45, + "int8": 24.47, + "fp16": 16.77, + "fp32": "", "bf16": "" } ], - "Unit": "FPS", + "Unit": "Tokens per sec", "UnitDesc": "higher is better" }, "latency": { "Precisions": [ { - "int4": "", - "int8": "", - "fp16": "", + "int4": 29.02, + "int8": 40.86, + "fp16": 59.6, "fp32": "", "bf16": "" } @@ -10780,31 +9800,31 @@ } }, { - "Platform": "Intel® Core™ i7-1355U Processor CPU+iGPU", - "Model": "ssd_mobilenet_v1_coco", - "featured_SKU": false, - "whats_new_model": false, - "PlatformType": "Intel® Core™, CPU+iGPU", + "Platform": "Intel® Xeon® Platinum 8580 CPU-only", + "Model": "llama-3-8b", + "featured_SKU": true, + "whats_new_model": "false", + "PlatformType": "Intel® Xeon®, CPU-only", "Parameters": { "throughput": { "Precisions": [ { - "int4": "", - "int8": 554.73, - "fp16": "", - "fp32": 228.8, + "int4": 31.52, + "int8": 22.17, + "fp16": 15.17, + "fp32": "", "bf16": "" } ], - "Unit": "FPS", + "Unit": "Tokens per sec", "UnitDesc": "higher is better" }, "latency": { "Precisions": [ { - "int4": "", - "int8": "", - "fp16": "", + "int4": 31.72, + "int8": 45.1, + "fp16": 65.89, "fp32": "", "bf16": "" } @@ -10815,31 +9835,31 @@ } }, { - "Platform": "Intel® Core™ i7-1355U Processor CPU+iGPU", - "Model": "yolo11", - "featured_SKU": false, - "whats_new_model": "false", - "PlatformType": "Intel® Core™, CPU+iGPU", + "Platform": "Intel® Xeon® Platinum 8580 CPU-only", + "Model": "llama-3.2-3b-instruct", + "featured_SKU": true, + "whats_new_model": true, + "PlatformType": "Intel® Xeon®, CPU-only", "Parameters": { "throughput": { "Precisions": [ { - "int4": "", - "int8": "", - "fp16": "", - "fp32": 80.32, + "int4": 58.68, + "int8": 44.62, + "fp16": 32.87, + "fp32": "", "bf16": "" } ], - "Unit": "FPS", + "Unit": "Tokens per sec", "UnitDesc": "higher is better" }, "latency": { "Precisions": [ { - "int4": "", - "int8": "", - "fp16": "", + "int4": 17.04, + "int8": 22.41, + "fp16": 30.42, "fp32": "", "bf16": "" } @@ -10850,20 +9870,20 @@ } }, { - "Platform": "Intel® Core™ i7-1355U Processor CPU+iGPU", - "Model": "yolo_v8n", - "featured_SKU": false, + "Platform": "Intel® Xeon® Platinum 8580 CPU-only", + "Model": "mask_rcnn_resnet50_atrous_coco", + "featured_SKU": true, "whats_new_model": false, - "PlatformType": "Intel® Core™, CPU+iGPU", + "PlatformType": "Intel® Xeon®, CPU-only", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 154.56, + "int8": 75.46, "fp16": "", - "fp32": 72.19, - "bf16": "" + "fp32": 6.42, + "bf16": 48.12 } ], "Unit": "FPS", @@ -10873,10 +9893,10 @@ "Precisions": [ { "int4": "", - "int8": "", + 
"int8": 55.8, "fp16": "", "fp32": "", - "bf16": "" + "bf16": 73.33 } ], "Unit": "ms", @@ -10885,31 +9905,31 @@ } }, { - "Platform": "Intel® Core™ i7-1355U Processor CPU-only", - "Model": "bert-base-cased", - "featured_SKU": false, + "Platform": "Intel® Xeon® Platinum 8580 CPU-only", + "Model": "mistral-7b-v0.1", + "featured_SKU": true, "whats_new_model": false, - "PlatformType": "Intel® Core™, CPU-only", + "PlatformType": "Intel® Xeon®, CPU-only", "Parameters": { "throughput": { "Precisions": [ { - "int4": "", - "int8": 44.62, - "fp16": "", - "fp32": 17.96, + "int4": 33.88, + "int8": 23.05, + "fp16": 15.99, + "fp32": "", "bf16": "" } ], - "Unit": "FPS", + "Unit": "Tokens per sec", "UnitDesc": "higher is better" }, "latency": { "Precisions": [ { - "int4": "", - "int8": 37.64, - "fp16": "", + "int4": 29.51, + "int8": 43.37, + "fp16": 62.53, "fp32": "", "bf16": "" } @@ -10920,20 +9940,20 @@ } }, { - "Platform": "Intel® Core™ i7-1355U Processor CPU-only", - "Model": "efficientdet-d0", - "featured_SKU": false, + "Platform": "Intel® Xeon® Platinum 8580 CPU-only", + "Model": "mobilenet-v2", + "featured_SKU": true, "whats_new_model": false, - "PlatformType": "Intel® Core™, CPU-only", + "PlatformType": "Intel® Xeon®, CPU-only", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 61.85, + "int8": 39819.69, "fp16": "", - "fp32": 39.52, - "bf16": "" + "fp32": 15869.97, + "bf16": 29293.16 } ], "Unit": "FPS", @@ -10943,10 +9963,10 @@ "Precisions": [ { "int4": "", - "int8": 26.95, + "int8": 0.66, "fp16": "", "fp32": "", - "bf16": "" + "bf16": 0.67 } ], "Unit": "ms", @@ -10955,31 +9975,31 @@ } }, { - "Platform": "Intel® Core™ i7-1355U Processor CPU-only", - "Model": "mask_rcnn_resnet50_atrous_coco", - "featured_SKU": false, - "whats_new_model": false, - "PlatformType": "Intel® Core™, CPU-only", + "Platform": "Intel® Xeon® Platinum 8580 CPU-only", + "Model": "phi-3-mini-4k-instruct", + "featured_SKU": true, + "whats_new_model": true, + "PlatformType": "Intel® Xeon®, CPU-only", "Parameters": { "throughput": { "Precisions": [ { - "int4": "", - "int8": 0.64, - "fp16": "", - "fp32": 0.17, + "int4": 50.96, + "int8": 41.58, + "fp16": 27.67, + "fp32": "", "bf16": "" } ], - "Unit": "FPS", + "Unit": "Tokens per sec", "UnitDesc": "higher is better" }, "latency": { "Precisions": [ { - "int4": "", - "int8": 1935.64, - "fp16": "", + "int4": 19.62, + "int8": 24.05, + "fp16": 36.14, "fp32": "", "bf16": "" } @@ -10990,31 +10010,31 @@ } }, { - "Platform": "Intel® Core™ i7-1355U Processor CPU-only", - "Model": "mobilenet-v2", - "featured_SKU": false, - "whats_new_model": false, - "PlatformType": "Intel® Core™, CPU-only", + "Platform": "Intel® Xeon® Platinum 8580 CPU-only", + "Model": "qwen2-7b", + "featured_SKU": true, + "whats_new_model": true, + "PlatformType": "Intel® Xeon®, CPU-only", "Parameters": { "throughput": { "Precisions": [ { - "int4": "", - "int8": 1042.94, - "fp16": "", - "fp32": 515.99, + "int4": 37.77, + "int8": 26.53, + "fp16": 16.25, + "fp32": "", "bf16": "" } ], - "Unit": "FPS", + "Unit": "Tokens per sec", "UnitDesc": "higher is better" }, "latency": { "Precisions": [ { - "int4": "", - "int8": 1.61, - "fp16": "", + "int4": 26.47, + "int8": 37.69, + "fp16": 61.52, "fp32": "", "bf16": "" } @@ -11025,20 +10045,20 @@ } }, { - "Platform": "Intel® Core™ i7-1355U Processor CPU-only", + "Platform": "Intel® Xeon® Platinum 8580 CPU-only", "Model": "resnet-50", - "featured_SKU": false, + "featured_SKU": true, "whats_new_model": false, - "PlatformType": "Intel® Core™, CPU-only", + "PlatformType": 
"Intel® Xeon®, CPU-only", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 203.02, + "int8": 21640.96, "fp16": "", - "fp32": 59.12, - "bf16": "" + "fp32": 1998.64, + "bf16": 13585.61 } ], "Unit": "FPS", @@ -11048,10 +10068,10 @@ "Precisions": [ { "int4": "", - "int8": 9.0, + "int8": 1.0, "fp16": "", "fp32": "", - "bf16": "" + "bf16": 1.21 } ], "Unit": "ms", @@ -11060,20 +10080,20 @@ } }, { - "Platform": "Intel® Core™ i7-1355U Processor CPU-only", + "Platform": "Intel® Xeon® Platinum 8580 CPU-only", "Model": "ssd-resnet34-1200", - "featured_SKU": false, + "featured_SKU": true, "whats_new_model": false, - "PlatformType": "Intel® Core™, CPU-only", + "PlatformType": "Intel® Xeon®, CPU-only", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 3.48, + "int8": 510.69, "fp16": "", - "fp32": 1.03, - "bf16": "" + "fp32": 35.18, + "bf16": 273.78 } ], "Unit": "FPS", @@ -11083,10 +10103,10 @@ "Precisions": [ { "int4": "", - "int8": 439.19, + "int8": 7.68, "fp16": "", "fp32": "", - "bf16": "" + "bf16": 12.33 } ], "Unit": "ms", @@ -11095,20 +10115,20 @@ } }, { - "Platform": "Intel® Core™ i7-1355U Processor CPU-only", + "Platform": "Intel® Xeon® Platinum 8580 CPU-only", "Model": "ssd_mobilenet_v1_coco", - "featured_SKU": false, + "featured_SKU": true, "whats_new_model": false, - "PlatformType": "Intel® Core™, CPU-only", + "PlatformType": "Intel® Xeon®, CPU-only", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 422.9, + "int8": 26761.66, "fp16": "", - "fp32": 151.69, - "bf16": "" + "fp32": 4711.32, + "bf16": 16670.32 } ], "Unit": "FPS", @@ -11118,10 +10138,10 @@ "Precisions": [ { "int4": "", - "int8": 3.87, + "int8": 0.72, "fp16": "", "fp32": "", - "bf16": "" + "bf16": 1.16 } ], "Unit": "ms", @@ -11130,11 +10150,11 @@ } }, { - "Platform": "Intel® Core™ i7-1355U Processor CPU-only", + "Platform": "Intel® Xeon® Platinum 8580 CPU-only", "Model": "yolo11", - "featured_SKU": false, - "whats_new_model": "false", - "PlatformType": "Intel® Core™, CPU-only", + "featured_SKU": true, + "whats_new_model": true, + "PlatformType": "Intel® Xeon®, CPU-only", "Parameters": { "throughput": { "Precisions": [ @@ -11142,8 +10162,8 @@ "int4": "", "int8": "", "fp16": "", - "fp32": 48.93, - "bf16": "" + "fp32": 1455.84, + "bf16": 2965.31 } ], "Unit": "FPS", @@ -11156,7 +10176,7 @@ "int8": "", "fp16": "", "fp32": "", - "bf16": "" + "bf16": 3.11 } ], "Unit": "ms", @@ -11165,20 +10185,20 @@ } }, { - "Platform": "Intel® Core™ i7-1355U Processor CPU-only", + "Platform": "Intel® Xeon® Platinum 8580 CPU-only", "Model": "yolo_v8n", - "featured_SKU": false, + "featured_SKU": true, "whats_new_model": false, - "PlatformType": "Intel® Core™, CPU-only", + "PlatformType": "Intel® Xeon®, CPU-only", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 101.73, + "int8": 3045.87, "fp16": "", - "fp32": 40.76, - "bf16": "" + "fp32": 1259.07, + "bf16": 3431.21 } ], "Unit": "FPS", @@ -11188,10 +10208,10 @@ "Precisions": [ { "int4": "", - "int8": 16.99, + "int8": 3.06, "fp16": "", "fp32": "", - "bf16": "" + "bf16": 2.55 } ], "Unit": "ms", @@ -11200,18 +10220,18 @@ } }, { - "Platform": "Intel® Core™ i7-1355U Processor iGPU-only", + "Platform": "Intel® Arc™ A-Series Graphics dGPU", "Model": "bert-base-cased", - "featured_SKU": false, + "featured_SKU": true, "whats_new_model": false, - "PlatformType": "Intel® Core™, iGPU-only", + "PlatformType": "Accelerator Platforms", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 67.08, - "fp16": 52.9, + 
"int8": 314.28, + "fp16": 348.05, "fp32": "", "bf16": "" } @@ -11223,7 +10243,7 @@ "Precisions": [ { "int4": "", - "int8": 14.38, + "int8": 4.87, "fp16": "", "fp32": "", "bf16": "" @@ -11235,18 +10255,18 @@ } }, { - "Platform": "Intel® Core™ i7-1355U Processor iGPU-only", + "Platform": "Intel® Arc™ A-Series Graphics dGPU", "Model": "efficientdet-d0", - "featured_SKU": false, + "featured_SKU": true, "whats_new_model": false, - "PlatformType": "Intel® Core™, iGPU-only", + "PlatformType": "Accelerator Platforms", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 98.8, - "fp16": 73.53, + "int8": 329.65, + "fp16": 284.77, "fp32": "", "bf16": "" } @@ -11258,7 +10278,7 @@ "Precisions": [ { "int4": "", - "int8": 13.41, + "int8": 5.42, "fp16": "", "fp32": "", "bf16": "" @@ -11270,30 +10290,30 @@ } }, { - "Platform": "Intel® Core™ i7-1355U Processor iGPU-only", + "Platform": "Intel® Arc™ A-Series Graphics dGPU", "Model": "gemma-2-9b", - "featured_SKU": false, + "featured_SKU": true, "whats_new_model": true, - "PlatformType": "Intel® Core™, iGPU-only", + "PlatformType": "Accelerator Platforms", "Parameters": { "throughput": { "Precisions": [ { - "int4": 6.21, - "int8": 3.88, + "int4": 20.28, + "int8": 17.7, "fp16": "", "fp32": "", "bf16": "" } ], - "Unit": "FPS", + "Unit": "Tokens per sec", "UnitDesc": "higher is better" }, "latency": { "Precisions": [ { - "int4": 160.82, - "int8": 257.32, + "int4": 49.3, + "int8": 56.48, "fp16": "", "fp32": "", "bf16": "" @@ -11305,30 +10325,30 @@ } }, { - "Platform": "Intel® Core™ i7-1355U Processor iGPU-only", + "Platform": "Intel® Arc™ A-Series Graphics dGPU", "Model": "glm-4-9b-chat", - "featured_SKU": false, + "featured_SKU": true, "whats_new_model": "false", - "PlatformType": "Intel® Core™, iGPU-only", + "PlatformType": "Accelerator Platforms", "Parameters": { "throughput": { "Precisions": [ { - "int4": 7.25, - "int8": 4.27, + "int4": 37.32, + "int8": 28.17, "fp16": "", "fp32": "", "bf16": "" } ], - "Unit": "FPS", + "Unit": "Tokens per sec", "UnitDesc": "higher is better" }, "latency": { "Precisions": [ { - "int4": 137.82, - "int8": 233.92, + "int4": 26.79, + "int8": 35.49, "fp16": "", "fp32": "", "bf16": "" @@ -11340,31 +10360,31 @@ } }, { - "Platform": "Intel® Core™ i7-1355U Processor iGPU-only", + "Platform": "Intel® Arc™ A-Series Graphics dGPU", "Model": "llama-2-7b-chat", - "featured_SKU": false, + "featured_SKU": true, "whats_new_model": false, - "PlatformType": "Intel® Core™, iGPU-only", + "PlatformType": "Accelerator Platforms", "Parameters": { "throughput": { "Precisions": [ { - "int4": 8.53, - "int8": 5.74, - "fp16": "", + "int4": 42.77, + "int8": 33.5, + "fp16": 22.41, "fp32": "", "bf16": "" } ], - "Unit": "FPS", + "Unit": "Tokens per sec", "UnitDesc": "higher is better" }, "latency": { "Precisions": [ { - "int4": 117.18, - "int8": 174.01, - "fp16": "", + "int4": 23.38, + "int8": 29.85, + "fp16": 44.61, "fp32": "", "bf16": "" } @@ -11375,30 +10395,30 @@ } }, { - "Platform": "Intel® Core™ i7-1355U Processor iGPU-only", + "Platform": "Intel® Arc™ A-Series Graphics dGPU", "Model": "llama-3-8b", - "featured_SKU": false, + "featured_SKU": true, "whats_new_model": "false", - "PlatformType": "Intel® Core™, iGPU-only", + "PlatformType": "Accelerator Platforms", "Parameters": { "throughput": { "Precisions": [ { - "int4": 8.49, - "int8": 5.06, + "int4": 40.04, + "int8": 30.94, "fp16": "", "fp32": "", "bf16": "" } ], - "Unit": "FPS", + "Unit": "Tokens per sec", "UnitDesc": "higher is better" }, "latency": { "Precisions": [ { - "int4": 
117.69, - "int8": 197.3, + "int4": 24.97, + "int8": 32.32, "fp16": "", "fp32": "", "bf16": "" @@ -11410,31 +10430,31 @@ } }, { - "Platform": "Intel® Core™ i7-1355U Processor iGPU-only", + "Platform": "Intel® Arc™ A-Series Graphics dGPU", "Model": "llama-3.2-3b-instruct", - "featured_SKU": false, + "featured_SKU": true, "whats_new_model": true, - "PlatformType": "Intel® Core™, iGPU-only", + "PlatformType": "Accelerator Platforms", "Parameters": { "throughput": { "Precisions": [ { - "int4": 16.79, - "int8": 11.89, - "fp16": 6.7, + "int4": 56.52, + "int8": 52.46, + "fp16": 36.06, "fp32": "", "bf16": "" } ], - "Unit": "FPS", + "Unit": "Tokens per sec", "UnitDesc": "higher is better" }, "latency": { "Precisions": [ { - "int4": 59.54, - "int8": 84.05, - "fp16": 149.13, + "int4": 17.69, + "int8": 19.06, + "fp16": 27.73, "fp32": "", "bf16": "" } @@ -11445,18 +10465,18 @@ } }, { - "Platform": "Intel® Core™ i7-1355U Processor iGPU-only", + "Platform": "Intel® Arc™ A-Series Graphics dGPU", "Model": "mask_rcnn_resnet50_atrous_coco", - "featured_SKU": false, + "featured_SKU": true, "whats_new_model": false, - "PlatformType": "Intel® Core™, iGPU-only", + "PlatformType": "Accelerator Platforms", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 0.73, - "fp16": 0.77, + "int8": 34.83, + "fp16": 19.44, "fp32": "", "bf16": "" } @@ -11468,7 +10488,7 @@ "Precisions": [ { "int4": "", - "int8": 1191.59, + "int8": 47.58, "fp16": "", "fp32": "", "bf16": "" @@ -11480,30 +10500,30 @@ } }, { - "Platform": "Intel® Core™ i7-1355U Processor iGPU-only", + "Platform": "Intel® Arc™ A-Series Graphics dGPU", "Model": "mistral-7b-v0.1", - "featured_SKU": false, + "featured_SKU": true, "whats_new_model": false, - "PlatformType": "Intel® Core™, iGPU-only", + "PlatformType": "Accelerator Platforms", "Parameters": { "throughput": { "Precisions": [ { - "int4": 8.86, - "int8": 5.44, + "int4": 43.17, + "int8": 32.07, "fp16": "", "fp32": "", "bf16": "" } ], - "Unit": "FPS", + "Unit": "Tokens per sec", "UnitDesc": "higher is better" }, "latency": { "Precisions": [ { - "int4": 112.76, - "int8": 183.5, + "int4": 23.16, + "int8": 31.18, "fp16": "", "fp32": "", "bf16": "" @@ -11515,18 +10535,18 @@ } }, { - "Platform": "Intel® Core™ i7-1355U Processor iGPU-only", + "Platform": "Intel® Arc™ A-Series Graphics dGPU", "Model": "mobilenet-v2", - "featured_SKU": false, + "featured_SKU": true, "whats_new_model": false, - "PlatformType": "Intel® Core™, iGPU-only", + "PlatformType": "Accelerator Platforms", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 869.88, - "fp16": 621.94, + "int8": 2380.28, + "fp16": 2106.51, "fp32": "", "bf16": "" } @@ -11538,7 +10558,7 @@ "Precisions": [ { "int4": "", - "int8": 1.38, + "int8": 1.18, "fp16": "", "fp32": "", "bf16": "" @@ -11550,31 +10570,31 @@ } }, { - "Platform": "Intel® Core™ i7-1355U Processor iGPU-only", + "Platform": "Intel® Arc™ A-Series Graphics dGPU", "Model": "phi-3-mini-4k-instruct", - "featured_SKU": false, + "featured_SKU": true, "whats_new_model": true, - "PlatformType": "Intel® Core™, iGPU-only", + "PlatformType": "Accelerator Platforms", "Parameters": { "throughput": { "Precisions": [ { - "int4": 14.78, - "int8": 9.98, - "fp16": 5.45, + "int4": 58.37, + "int8": 47.34, + "fp16": 29.17, "fp32": "", "bf16": "" } ], - "Unit": "FPS", + "Unit": "Tokens per sec", "UnitDesc": "higher is better" }, "latency": { "Precisions": [ { - "int4": 67.65, - "int8": 100.19, - "fp16": 183.48, + "int4": 17.13, + "int8": 21.12, + "fp16": 34.28, "fp32": "", "bf16": "" } 
@@ -11585,30 +10605,30 @@ } }, { - "Platform": "Intel® Core™ i7-1355U Processor iGPU-only", + "Platform": "Intel® Arc™ A-Series Graphics dGPU", "Model": "qwen2-7b", - "featured_SKU": false, + "featured_SKU": true, "whats_new_model": true, - "PlatformType": "Intel® Core™, iGPU-only", + "PlatformType": "Accelerator Platforms", "Parameters": { "throughput": { "Precisions": [ { - "int4": 9.11, - "int8": 5.39, + "int4": 41.51, + "int8": 33.85, "fp16": "", "fp32": "", "bf16": "" } ], - "Unit": "FPS", + "Unit": "Tokens per sec", "UnitDesc": "higher is better" }, "latency": { "Precisions": [ { - "int4": 109.74, - "int8": 185.49, + "int4": 24.09, + "int8": 29.54, "fp16": "", "fp32": "", "bf16": "" @@ -11620,18 +10640,18 @@ } }, { - "Platform": "Intel® Core™ i7-1355U Processor iGPU-only", + "Platform": "Intel® Arc™ A-Series Graphics dGPU", "Model": "resnet-50", - "featured_SKU": false, + "featured_SKU": true, "whats_new_model": false, - "PlatformType": "Intel® Core™, iGPU-only", + "PlatformType": "Accelerator Platforms", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 277.06, - "fp16": 164.27, + "int8": 1407.56, + "fp16": 1035.07, "fp32": "", "bf16": "" } @@ -11643,7 +10663,7 @@ "Precisions": [ { "int4": "", - "int8": 3.85, + "int8": 1.44, "fp16": "", "fp32": "", "bf16": "" @@ -11655,18 +10675,18 @@ } }, { - "Platform": "Intel® Core™ i7-1355U Processor iGPU-only", + "Platform": "Intel® Arc™ A-Series Graphics dGPU", "Model": "ssd-resnet34-1200", - "featured_SKU": false, + "featured_SKU": true, "whats_new_model": false, - "PlatformType": "Intel® Core™, iGPU-only", + "PlatformType": "Accelerator Platforms", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 7.1, - "fp16": 3.99, + "int8": 112.36, + "fp16": 73.09, "fp32": "", "bf16": "" } @@ -11678,7 +10698,7 @@ "Precisions": [ { "int4": "", - "int8": 126.73, + "int8": 15.0, "fp16": "", "fp32": "", "bf16": "" @@ -11690,18 +10710,18 @@ } }, { - "Platform": "Intel® Core™ i7-1355U Processor iGPU-only", + "Platform": "Intel® Arc™ A-Series Graphics dGPU", "Model": "ssd_mobilenet_v1_coco", - "featured_SKU": false, + "featured_SKU": true, "whats_new_model": false, - "PlatformType": "Intel® Core™, iGPU-only", + "PlatformType": "Accelerator Platforms", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 484.13, - "fp16": 298.47, + "int8": 1317.17, + "fp16": 1199.61, "fp32": "", "bf16": "" } @@ -11713,65 +10733,30 @@ "Precisions": [ { "int4": "", - "int8": 2.49, - "fp16": "", - "fp32": "", - "bf16": "" - } - ], - "Unit": "ms", - "UnitDesc": "lower is better" - } - } - }, - { - "Platform": "Intel® Core™ i7-1355U Processor iGPU-only", - "Model": "stable-diffusion-v1-5", - "featured_SKU": false, - "whats_new_model": false, - "PlatformType": "Intel® Core™, iGPU-only", - "Parameters": { - "throughput": { - "Precisions": [ - { - "int4": "", - "int8": "", + "int8": 1.46, "fp16": "", "fp32": "", "bf16": "" } ], - "Unit": "FPS", - "UnitDesc": "higher is better" - }, - "latency": { - "Precisions": [ - { - "int4": "", - "int8": 29.54, - "fp16": 29.97, - "fp32": "", - "bf16": "" - } - ], "Unit": "ms", "UnitDesc": "lower is better" } } }, { - "Platform": "Intel® Core™ i7-1355U Processor iGPU-only", + "Platform": "Intel® Arc™ A-Series Graphics dGPU", "Model": "yolo_v8n", - "featured_SKU": false, + "featured_SKU": true, "whats_new_model": false, - "PlatformType": "Intel® Core™, iGPU-only", + "PlatformType": "Accelerator Platforms", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 162.35, - 
"fp16": 106.83, + "int8": 516.71, + "fp16": 551.12, "fp32": "", "bf16": "" } @@ -11783,7 +10768,7 @@ "Precisions": [ { "int4": "", - "int8": 6.38, + "int8": 3.34, "fp16": "", "fp32": "", "bf16": "" @@ -11795,19 +10780,19 @@ } }, { - "Platform": "Intel® Core™ i9-13900K CPU-only", + "Platform": "Intel® Core™ Ultra 7 processor 155H iGPU-only", "Model": "bert-base-cased", - "featured_SKU": false, + "featured_SKU": true, "whats_new_model": false, - "PlatformType": "Intel® Core™, CPU-only", + "PlatformType": "Intel® Core™, iGPU-only", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 170.14, - "fp16": "", - "fp32": 67.07, + "int8": 166.8, + "fp16": 106.26, + "fp32": "", "bf16": "" } ], @@ -11818,7 +10803,7 @@ "Precisions": [ { "int4": "", - "int8": 10.73, + "int8": 6.48, "fp16": "", "fp32": "", "bf16": "" @@ -11830,19 +10815,19 @@ } }, { - "Platform": "Intel® Core™ i9-13900K CPU-only", + "Platform": "Intel® Core™ Ultra 7 processor 155H iGPU-only", "Model": "efficientdet-d0", - "featured_SKU": false, + "featured_SKU": true, "whats_new_model": false, - "PlatformType": "Intel® Core™, CPU-only", + "PlatformType": "Intel® Core™, iGPU-only", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 219.8, - "fp16": "", - "fp32": 126.91, + "int8": 200.77, + "fp16": 162.18, + "fp32": "", "bf16": "" } ], @@ -11853,7 +10838,7 @@ "Precisions": [ { "int4": "", - "int8": 7.34, + "int8": 8.23, "fp16": "", "fp32": "", "bf16": "" @@ -11865,31 +10850,31 @@ } }, { - "Platform": "Intel® Core™ i9-13900K CPU-only", + "Platform": "Intel® Core™ Ultra 7 processor 155H iGPU-only", "Model": "gemma-2-9b", - "featured_SKU": false, + "featured_SKU": true, "whats_new_model": true, - "PlatformType": "Intel® Core™, CPU-only", + "PlatformType": "Intel® Core™, iGPU-only", "Parameters": { "throughput": { "Precisions": [ { - "int4": 9.43, - "int8": 6.9, - "fp16": 3.59, + "int4": 11.16, + "int8": "", + "fp16": 0.9, "fp32": "", "bf16": "" } ], - "Unit": "FPS", + "Unit": "Tokens per sec", "UnitDesc": "higher is better" }, "latency": { "Precisions": [ { - "int4": 106.06, - "int8": 144.96, - "fp16": 278.42, + "int4": 89.57, + "int8": "", + "fp16": 1105.2, "fp32": "", "bf16": "" } @@ -11900,31 +10885,31 @@ } }, { - "Platform": "Intel® Core™ i9-13900K CPU-only", + "Platform": "Intel® Core™ Ultra 7 processor 155H iGPU-only", "Model": "glm-4-9b-chat", - "featured_SKU": false, + "featured_SKU": true, "whats_new_model": "false", - "PlatformType": "Intel® Core™, CPU-only", + "PlatformType": "Intel® Core™, iGPU-only", "Parameters": { "throughput": { "Precisions": [ { - "int4": 10.66, - "int8": 7.47, - "fp16": 3.84, + "int4": 13.65, + "int8": "", + "fp16": 1.17, "fp32": "", "bf16": "" } ], - "Unit": "FPS", + "Unit": "Tokens per sec", "UnitDesc": "higher is better" }, "latency": { "Precisions": [ { - "int4": 93.82, - "int8": 133.88, - "fp16": 260.67, + "int4": 73.24, + "int8": "", + "fp16": 849.49, "fp32": "", "bf16": "" } @@ -11935,31 +10920,31 @@ } }, { - "Platform": "Intel® Core™ i9-13900K CPU-only", + "Platform": "Intel® Core™ Ultra 7 processor 155H iGPU-only", "Model": "llama-2-7b-chat", - "featured_SKU": false, + "featured_SKU": true, "whats_new_model": false, - "PlatformType": "Intel® Core™, CPU-only", + "PlatformType": "Intel® Core™, iGPU-only", "Parameters": { "throughput": { "Precisions": [ { - "int4": 13.44, - "int8": 9.29, - "fp16": 4.94, + "int4": 15.55, + "int8": 10.59, + "fp16": "", "fp32": "", "bf16": "" } ], - "Unit": "FPS", + "Unit": "Tokens per sec", "UnitDesc": "higher is better" }, 
"latency": { "Precisions": [ { - "int4": 74.39, - "int8": 107.62, - "fp16": 202.32, + "int4": 64.29, + "int8": 94.35, + "fp16": "", "fp32": "", "bf16": "" } @@ -11970,31 +10955,31 @@ } }, { - "Platform": "Intel® Core™ i9-13900K CPU-only", + "Platform": "Intel® Core™ Ultra 7 processor 155H iGPU-only", "Model": "llama-3-8b", - "featured_SKU": false, + "featured_SKU": true, "whats_new_model": "false", - "PlatformType": "Intel® Core™, CPU-only", + "PlatformType": "Intel® Core™, iGPU-only", "Parameters": { "throughput": { "Precisions": [ { - "int4": 11.91, - "int8": 8.66, - "fp16": 4.81, + "int4": 15.53, + "int8": 8.75, + "fp16": "", "fp32": "", "bf16": "" } ], - "Unit": "FPS", + "Unit": "Tokens per sec", "UnitDesc": "higher is better" }, "latency": { "Precisions": [ { - "int4": 83.93, - "int8": 115.49, - "fp16": 223.15, + "int4": 64.39, + "int8": 114.23, + "fp16": "", "fp32": "", "bf16": "" } @@ -12005,31 +10990,31 @@ } }, { - "Platform": "Intel® Core™ i9-13900K CPU-only", + "Platform": "Intel® Core™ Ultra 7 processor 155H iGPU-only", "Model": "llama-3.2-3b-instruct", - "featured_SKU": false, + "featured_SKU": true, "whats_new_model": true, - "PlatformType": "Intel® Core™, CPU-only", + "PlatformType": "Intel® Core™, iGPU-only", "Parameters": { "throughput": { "Precisions": [ { - "int4": 25.41, - "int8": 19.0, - "fp16": 10.18, + "int4": 29.61, + "int8": 20.65, + "fp16": 12.31, "fp32": "", "bf16": "" } ], - "Unit": "FPS", + "Unit": "Tokens per sec", "UnitDesc": "higher is better" }, "latency": { "Precisions": [ { - "int4": 39.36, - "int8": 52.64, - "fp16": 98.24, + "int4": 33.77, + "int8": 48.42, + "fp16": 81.22, "fp32": "", "bf16": "" } @@ -12040,19 +11025,19 @@ } }, { - "Platform": "Intel® Core™ i9-13900K CPU-only", + "Platform": "Intel® Core™ Ultra 7 processor 155H iGPU-only", "Model": "mask_rcnn_resnet50_atrous_coco", - "featured_SKU": false, + "featured_SKU": true, "whats_new_model": false, - "PlatformType": "Intel® Core™, CPU-only", + "PlatformType": "Intel® Core™, iGPU-only", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 2.49, - "fp16": "", - "fp32": 0.71, + "int8": 2.26, + "fp16": 1.57, + "fp32": "", "bf16": "" } ], @@ -12063,7 +11048,7 @@ "Precisions": [ { "int4": "", - "int8": 562.6, + "int8": 422.52, "fp16": "", "fp32": "", "bf16": "" @@ -12075,31 +11060,31 @@ } }, { - "Platform": "Intel® Core™ i9-13900K CPU-only", + "Platform": "Intel® Core™ Ultra 7 processor 155H iGPU-only", "Model": "mistral-7b-v0.1", - "featured_SKU": false, + "featured_SKU": true, "whats_new_model": false, - "PlatformType": "Intel® Core™, CPU-only", + "PlatformType": "Intel® Core™, iGPU-only", "Parameters": { "throughput": { "Precisions": [ { - "int4": 14.41, - "int8": 9.13, - "fp16": 4.72, + "int4": 15.86, + "int8": 10.16, + "fp16": "", "fp32": "", "bf16": "" } ], - "Unit": "FPS", + "Unit": "Tokens per sec", "UnitDesc": "higher is better" }, "latency": { "Precisions": [ { - "int4": 69.4, - "int8": 109.54, - "fp16": 211.92, + "int4": 63.02, + "int8": 98.38, + "fp16": "", "fp32": "", "bf16": "" } @@ -12110,19 +11095,19 @@ } }, { - "Platform": "Intel® Core™ i9-13900K CPU-only", + "Platform": "Intel® Core™ Ultra 7 processor 155H iGPU-only", "Model": "mobilenet-v2", - "featured_SKU": false, + "featured_SKU": true, "whats_new_model": false, - "PlatformType": "Intel® Core™, CPU-only", + "PlatformType": "Intel® Core™, iGPU-only", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 4239.14, - "fp16": "", - "fp32": 2047.2, + "int8": 1303.1, + "fp16": 1365.49, + "fp32": "", 
"bf16": "" } ], @@ -12133,7 +11118,7 @@ "Precisions": [ { "int4": "", - "int8": 0.6, + "int8": 1.14, "fp16": "", "fp32": "", "bf16": "" @@ -12145,31 +11130,31 @@ } }, { - "Platform": "Intel® Core™ i9-13900K CPU-only", + "Platform": "Intel® Core™ Ultra 7 processor 155H iGPU-only", "Model": "phi-3-mini-4k-instruct", - "featured_SKU": false, + "featured_SKU": true, "whats_new_model": true, - "PlatformType": "Intel® Core™, CPU-only", + "PlatformType": "Intel® Core™, iGPU-only", "Parameters": { "throughput": { "Precisions": [ { - "int4": "", - "int8": 15.66, - "fp16": 8.52, + "int4": 25.54, + "int8": 18.45, + "fp16": 10.44, "fp32": "", "bf16": "" } ], - "Unit": "FPS", + "Unit": "Tokens per sec", "UnitDesc": "higher is better" }, "latency": { "Precisions": [ { - "int4": "", - "int8": 63.85, - "fp16": 117.37, + "int4": 39.15, + "int8": 54.19, + "fp16": 95.78, "fp32": "", "bf16": "" } @@ -12180,31 +11165,31 @@ } }, { - "Platform": "Intel® Core™ i9-13900K CPU-only", + "Platform": "Intel® Core™ Ultra 7 processor 155H iGPU-only", "Model": "qwen2-7b", - "featured_SKU": false, + "featured_SKU": true, "whats_new_model": true, - "PlatformType": "Intel® Core™, CPU-only", + "PlatformType": "Intel® Core™, iGPU-only", "Parameters": { "throughput": { "Precisions": [ { - "int4": 13.1, - "int8": 9.24, - "fp16": 4.75, + "int4": 17.1, + "int8": 9.68, + "fp16": "", "fp32": "", "bf16": "" } ], - "Unit": "FPS", + "Unit": "Tokens per sec", "UnitDesc": "higher is better" }, "latency": { "Precisions": [ { - "int4": "", - "int8": 63.85, - "fp16": 117.37, + "int4": 58.45, + "int8": 103.26, + "fp16": "", "fp32": "", "bf16": "" } @@ -12215,19 +11200,19 @@ } }, { - "Platform": "Intel® Core™ i9-13900K CPU-only", + "Platform": "Intel® Core™ Ultra 7 processor 155H iGPU-only", "Model": "resnet-50", - "featured_SKU": false, + "featured_SKU": true, "whats_new_model": false, - "PlatformType": "Intel® Core™, CPU-only", + "PlatformType": "Intel® Core™, iGPU-only", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 762.32, - "fp16": "", - "fp32": 234.53, + "int8": 556.12, + "fp16": 402.2, + "fp32": "", "bf16": "" } ], @@ -12238,7 +11223,7 @@ "Precisions": [ { "int4": "", - "int8": 2.17, + "int8": 2.38, "fp16": "", "fp32": "", "bf16": "" @@ -12250,19 +11235,19 @@ } }, { - "Platform": "Intel® Core™ i9-13900K CPU-only", + "Platform": "Intel® Core™ Ultra 7 processor 155H iGPU-only", "Model": "ssd-resnet34-1200", - "featured_SKU": false, + "featured_SKU": true, "whats_new_model": false, - "PlatformType": "Intel® Core™, CPU-only", + "PlatformType": "Intel® Core™, iGPU-only", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 12.97, - "fp16": "", - "fp32": 3.84, + "int8": 20.62, + "fp16": 12.55, + "fp32": "", "bf16": "" } ], @@ -12273,7 +11258,7 @@ "Precisions": [ { "int4": "", - "int8": 102.02, + "int8": 47.68, "fp16": "", "fp32": "", "bf16": "" @@ -12285,19 +11270,19 @@ } }, { - "Platform": "Intel® Core™ i9-13900K CPU-only", + "Platform": "Intel® Core™ Ultra 7 processor 155H iGPU-only", "Model": "ssd_mobilenet_v1_coco", - "featured_SKU": false, + "featured_SKU": true, "whats_new_model": false, - "PlatformType": "Intel® Core™, CPU-only", + "PlatformType": "Intel® Core™, iGPU-only", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 1606.89, - "fp16": "", - "fp32": 589.62, + "int8": 1027.05, + "fp16": 803.37, + "fp32": "", "bf16": "" } ], @@ -12308,7 +11293,7 @@ "Precisions": [ { "int4": "", - "int8": 1.08, + "int8": 1.45, "fp16": "", "fp32": "", "bf16": "" @@ -12320,11 
+11305,11 @@ } }, { - "Platform": "Intel® Core™ i9-13900K CPU-only", - "Model": "stable-diffusion-v1-5", - "featured_SKU": false, - "whats_new_model": false, - "PlatformType": "Intel® Core™, CPU-only", + "Platform": "Intel® Core™ Ultra 7 processor 155H iGPU-only", + "Model": "yolo11", + "featured_SKU": true, + "whats_new_model": true, + "PlatformType": "Intel® Core™, iGPU-only", "Parameters": { "throughput": { "Precisions": [ @@ -12332,7 +11317,7 @@ "int4": "", "int8": "", "fp16": "", - "fp32": "", + "fp32": 299.66, "bf16": "" } ], @@ -12343,8 +11328,8 @@ "Precisions": [ { "int4": "", - "int8": 40.27, - "fp16": 39.61, + "int8": "", + "fp16": "", "fp32": "", "bf16": "" } @@ -12355,19 +11340,19 @@ } }, { - "Platform": "Intel® Core™ i9-13900K CPU-only", - "Model": "yolo11", - "featured_SKU": false, - "whats_new_model": "false", - "PlatformType": "Intel® Core™, CPU-only", + "Platform": "Intel® Core™ Ultra 7 processor 155H iGPU-only", + "Model": "yolo_v8n", + "featured_SKU": true, + "whats_new_model": false, + "PlatformType": "Intel® Core™, iGPU-only", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": "", - "fp16": "", - "fp32": 187.66, + "int8": 397.54, + "fp16": 297.68, + "fp32": "", "bf16": "" } ], @@ -12378,7 +11363,7 @@ "Precisions": [ { "int4": "", - "int8": "", + "int8": 3.13, "fp16": "", "fp32": "", "bf16": "" @@ -12390,19 +11375,19 @@ } }, { - "Platform": "Intel® Core™ i9-13900K CPU-only", - "Model": "yolo_v8n", - "featured_SKU": false, + "Platform": "Intel® Core™ Ultra 9 processor 288V iGPU-only", + "Model": "bert-base-cased", + "featured_SKU": true, "whats_new_model": false, - "PlatformType": "Intel® Core™, CPU-only", + "PlatformType": "Intel® Core™, iGPU-only", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 389.04, - "fp16": "", - "fp32": 154.4, + "int8": 259.68, + "fp16": 271.46, + "fp32": 262.33, "bf16": "" } ], @@ -12413,7 +11398,7 @@ "Precisions": [ { "int4": "", - "int8": 4.13, + "int8": 5.05, "fp16": "", "fp32": "", "bf16": "" @@ -12425,19 +11410,19 @@ } }, { - "Platform": "Intel® Data Center GPU Flex 170 dGPU", - "Model": "bert-base-cased", - "featured_SKU": "false", + "Platform": "Intel® Core™ Ultra 9 processor 288V iGPU-only", + "Model": "efficientdet-d0", + "featured_SKU": true, "whats_new_model": false, - "PlatformType": "Accelerator Platforms", + "PlatformType": "Intel® Core™, iGPU-only", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 385.87, - "fp16": 420.99, - "fp32": "", + "int8": 167.07, + "fp16": 174.75, + "fp32": 178.07, "bf16": "" } ], @@ -12448,7 +11433,7 @@ "Precisions": [ { "int4": "", - "int8": 2.99, + "int8": 8.26, "fp16": "", "fp32": "", "bf16": "" @@ -12460,18 +11445,18 @@ } }, { - "Platform": "Intel® Data Center GPU Flex 170 dGPU", - "Model": "efficientdet-d0", - "featured_SKU": "false", + "Platform": "Intel® Core™ Ultra 9 processor 288V iGPU-only", + "Model": "falcon-7b-instruct", + "featured_SKU": true, "whats_new_model": false, - "PlatformType": "Accelerator Platforms", + "PlatformType": "Intel® Core™, iGPU-only", "Parameters": { "throughput": { "Precisions": [ { - "int4": "", - "int8": 426.56, - "fp16": 362.73, + "int4": 17.7, + "int8": 10.82, + "fp16": 6.02, "fp32": "", "bf16": "" } @@ -12482,9 +11467,9 @@ "latency": { "Precisions": [ { - "int4": "", - "int8": 2.8, - "fp16": "", + "int4": 56.49, + "int8": 92.37, + "fp16": 166.08, "fp32": "", "bf16": "" } @@ -12495,30 +11480,30 @@ } }, { - "Platform": "Intel® Data Center GPU Flex 170 dGPU", + "Platform": "Intel® Core™ Ultra 9 
processor 288V iGPU-only", "Model": "gemma-2-9b", - "featured_SKU": "false", + "featured_SKU": true, "whats_new_model": true, - "PlatformType": "Accelerator Platforms", + "PlatformType": "Intel® Core™, iGPU-only", "Parameters": { "throughput": { "Precisions": [ { - "int4": 22.66, - "int8": 18.13, + "int4": 13.03, + "int8": 7.64, "fp16": "", "fp32": "", "bf16": "" } ], - "Unit": "FPS", + "Unit": "Tokens per sec", "UnitDesc": "higher is better" }, "latency": { "Precisions": [ { - "int4": 44.13, - "int8": 55.13, + "int4": 76.74, + "int8": 130.86, "fp16": "", "fp32": "", "bf16": "" @@ -12530,30 +11515,30 @@ } }, { - "Platform": "Intel® Data Center GPU Flex 170 dGPU", + "Platform": "Intel® Core™ Ultra 9 processor 288V iGPU-only", "Model": "glm-4-9b-chat", - "featured_SKU": "false", + "featured_SKU": true, "whats_new_model": "false", - "PlatformType": "Accelerator Platforms", + "PlatformType": "Intel® Core™, iGPU-only", "Parameters": { "throughput": { "Precisions": [ { - "int4": 40.04, - "int8": 26.95, + "int4": 15.23, + "int8": 8.11, "fp16": "", "fp32": "", "bf16": "" } ], - "Unit": "FPS", + "Unit": "Tokens per sec", "UnitDesc": "higher is better" }, "latency": { "Precisions": [ { - "int4": 24.97, - "int8": 37.1, + "int4": 65.65, + "int8": 123.23, "fp16": "", "fp32": "", "bf16": "" @@ -12565,31 +11550,31 @@ } }, { - "Platform": "Intel® Data Center GPU Flex 170 dGPU", + "Platform": "Intel® Core™ Ultra 9 processor 288V iGPU-only", "Model": "llama-2-7b-chat", - "featured_SKU": "false", + "featured_SKU": true, "whats_new_model": false, - "PlatformType": "Accelerator Platforms", + "PlatformType": "Intel® Core™, iGPU-only", "Parameters": { "throughput": { "Precisions": [ { - "int4": 45.22, - "int8": 33.88, - "fp16": 21.45, + "int4": 16.8, + "int8": 10.77, + "fp16": 4.57, "fp32": "", "bf16": "" } ], - "Unit": "FPS", + "Unit": "Tokens per sec", "UnitDesc": "higher is better" }, "latency": { "Precisions": [ { - "int4": 22.11, - "int8": 29.51, - "fp16": 46.62, + "int4": 59.5, + "int8": 92.8, + "fp16": 218.59, "fp32": "", "bf16": "" } @@ -12600,30 +11585,30 @@ } }, { - "Platform": "Intel® Data Center GPU Flex 170 dGPU", + "Platform": "Intel® Core™ Ultra 9 processor 288V iGPU-only", "Model": "llama-3-8b", - "featured_SKU": "false", + "featured_SKU": true, "whats_new_model": "false", - "PlatformType": "Accelerator Platforms", + "PlatformType": "Intel® Core™, iGPU-only", "Parameters": { "throughput": { "Precisions": [ { - "int4": 45.55, - "int8": 30.8, + "int4": 17.36, + "int8": 9.55, "fp16": "", "fp32": "", "bf16": "" } ], - "Unit": "FPS", + "Unit": "Tokens per sec", "UnitDesc": "higher is better" }, "latency": { "Precisions": [ { - "int4": 21.95, - "int8": 32.46, + "int4": 57.58, + "int8": 104.65, "fp16": "", "fp32": "", "bf16": "" @@ -12635,31 +11620,31 @@ } }, { - "Platform": "Intel® Data Center GPU Flex 170 dGPU", + "Platform": "Intel® Core™ Ultra 9 processor 288V iGPU-only", "Model": "llama-3.2-3b-instruct", - "featured_SKU": "false", + "featured_SKU": true, "whats_new_model": true, - "PlatformType": "Accelerator Platforms", + "PlatformType": "Intel® Core™, iGPU-only", "Parameters": { "throughput": { "Precisions": [ { - "int4": 69.44, - "int8": 57.9, - "fp16": 37.69, + "int4": 31.23, + "int8": 20.44, + "fp16": 11.56, "fp32": "", "bf16": "" } ], - "Unit": "FPS", + "Unit": "Tokens per sec", "UnitDesc": "higher is better" }, "latency": { "Precisions": [ { - "int4": 14.4, - "int8": 17.27, - "fp16": 26.53, + "int4": 32.02, + "int8": 48.9, + "fp16": 86.45, "fp32": "", "bf16": "" } @@ -12670,19 +11655,19 @@ } 
}, { - "Platform": "Intel® Data Center GPU Flex 170 dGPU", + "Platform": "Intel® Core™ Ultra 9 processor 288V iGPU-only", "Model": "mask_rcnn_resnet50_atrous_coco", - "featured_SKU": "false", + "featured_SKU": true, "whats_new_model": false, - "PlatformType": "Accelerator Platforms", + "PlatformType": "Intel® Core™, iGPU-only", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 33.38, - "fp16": 19.04, - "fp32": "", + "int8": 12.33, + "fp16": 7.29, + "fp32": 7.29, "bf16": "" } ], @@ -12693,7 +11678,7 @@ "Precisions": [ { "int4": "", - "int8": 48.67, + "int8": 91.66, "fp16": "", "fp32": "", "bf16": "" @@ -12705,31 +11690,31 @@ } }, { - "Platform": "Intel® Data Center GPU Flex 170 dGPU", + "Platform": "Intel® Core™ Ultra 9 processor 288V iGPU-only", "Model": "mistral-7b-v0.1", - "featured_SKU": "false", + "featured_SKU": true, "whats_new_model": false, - "PlatformType": "Accelerator Platforms", + "PlatformType": "Intel® Core™, iGPU-only", "Parameters": { "throughput": { "Precisions": [ { - "int4": 45.53, - "int8": 32.37, - "fp16": 20.21, + "int4": 17.13, + "int8": 10.23, + "fp16": 4.99, "fp32": "", "bf16": "" } ], - "Unit": "FPS", + "Unit": "Tokens per sec", "UnitDesc": "higher is better" }, "latency": { "Precisions": [ { - "int4": 21.96, - "int8": 30.89, - "fp16": 49.48, + "int4": 58.35, + "int8": 97.71, + "fp16": 200.07, "fp32": "", "bf16": "" } @@ -12740,19 +11725,19 @@ } }, { - "Platform": "Intel® Data Center GPU Flex 170 dGPU", + "Platform": "Intel® Core™ Ultra 9 processor 288V iGPU-only", "Model": "mobilenet-v2", - "featured_SKU": "false", + "featured_SKU": true, "whats_new_model": false, - "PlatformType": "Accelerator Platforms", + "PlatformType": "Intel® Core™, iGPU-only", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 3134.27, - "fp16": 3004.5, - "fp32": "", + "int8": 941.33, + "fp16": 970.79, + "fp32": 1078.52, "bf16": "" } ], @@ -12763,7 +11748,7 @@ "Precisions": [ { "int4": "", - "int8": 0.57, + "int8": 1.3, "fp16": "", "fp32": "", "bf16": "" @@ -12775,31 +11760,31 @@ } }, { - "Platform": "Intel® Data Center GPU Flex 170 dGPU", + "Platform": "Intel® Core™ Ultra 9 processor 288V iGPU-only", "Model": "phi-3-mini-4k-instruct", - "featured_SKU": "false", + "featured_SKU": true, "whats_new_model": true, - "PlatformType": "Accelerator Platforms", + "PlatformType": "Intel® Core™, iGPU-only", "Parameters": { "throughput": { "Precisions": [ { - "int4": 69.93, - "int8": 51.51, - "fp16": 32.84, + "int4": 26.93, + "int8": 17.85, + "fp16": 10.16, "fp32": "", "bf16": "" } ], - "Unit": "FPS", + "Unit": "Tokens per sec", "UnitDesc": "higher is better" }, "latency": { "Precisions": [ { - "int4": 14.3, - "int8": 19.41, - "fp16": 30.45, + "int4": 37.13, + "int8": 56.01, + "fp16": 98.33, "fp32": "", "bf16": "" } @@ -12810,31 +11795,31 @@ } }, { - "Platform": "Intel® Data Center GPU Flex 170 dGPU", + "Platform": "Intel® Core™ Ultra 9 processor 288V iGPU-only", "Model": "qwen2-7b", - "featured_SKU": "false", + "featured_SKU": true, "whats_new_model": true, - "PlatformType": "Accelerator Platforms", + "PlatformType": "Intel® Core™, iGPU-only", "Parameters": { "throughput": { "Precisions": [ { - "int4": 45.8, - "int8": 32.78, - "fp16": "", + "int4": 18.11, + "int8": 10.86, + "fp16": 5.1, "fp32": "", "bf16": "" } ], - "Unit": "FPS", + "Unit": "Tokens per sec", "UnitDesc": "higher is better" }, "latency": { "Precisions": [ { - "int4": 21.83, - "int8": 30.5, - "fp16": "", + "int4": 55.21, + "int8": 92.08, + "fp16": 195.84, "fp32": "", "bf16": "" } @@ -12845,19 
+11830,19 @@ } }, { - "Platform": "Intel® Data Center GPU Flex 170 dGPU", + "Platform": "Intel® Core™ Ultra 9 processor 288V iGPU-only", "Model": "resnet-50", - "featured_SKU": "false", + "featured_SKU": true, "whats_new_model": false, - "PlatformType": "Accelerator Platforms", + "PlatformType": "Intel® Core™, iGPU-only", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 1921.18, - "fp16": 1329.28, - "fp32": "", + "int8": 856.81, + "fp16": 591.62, + "fp32": 580.46, "bf16": "" } ], @@ -12868,7 +11853,7 @@ "Precisions": [ { "int4": "", - "int8": 0.78, + "int8": 1.56, "fp16": "", "fp32": "", "bf16": "" @@ -12880,19 +11865,19 @@ } }, { - "Platform": "Intel® Data Center GPU Flex 170 dGPU", + "Platform": "Intel® Core™ Ultra 9 processor 288V iGPU-only", "Model": "ssd-resnet34-1200", - "featured_SKU": "false", + "featured_SKU": true, "whats_new_model": false, - "PlatformType": "Accelerator Platforms", + "PlatformType": "Intel® Core™, iGPU-only", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 133.77, - "fp16": "", - "fp32": "", + "int8": 64.32, + "fp16": 38.26, + "fp32": 38.09, "bf16": "" } ], @@ -12903,7 +11888,7 @@ "Precisions": [ { "int4": "", - "int8": 13.93, + "int8": 20.89, "fp16": "", "fp32": "", "bf16": "" @@ -12915,19 +11900,19 @@ } }, { - "Platform": "Intel® Data Center GPU Flex 170 dGPU", + "Platform": "Intel® Core™ Ultra 9 processor 288V iGPU-only", "Model": "ssd_mobilenet_v1_coco", - "featured_SKU": "false", + "featured_SKU": true, "whats_new_model": false, - "PlatformType": "Accelerator Platforms", + "PlatformType": "Intel® Core™, iGPU-only", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 2200.83, - "fp16": 1665.15, - "fp32": "", + "int8": 836.44, + "fp16": 775.11, + "fp32": 866.31, "bf16": "" } ], @@ -12938,7 +11923,7 @@ "Precisions": [ { "int4": "", - "int8": 0.78, + "int8": 1.85, "fp16": "", "fp32": "", "bf16": "" @@ -12950,11 +11935,11 @@ } }, { - "Platform": "Intel® Data Center GPU Flex 170 dGPU", - "Model": "stable-diffusion-v1-5", - "featured_SKU": "false", - "whats_new_model": false, - "PlatformType": "Accelerator Platforms", + "Platform": "Intel® Core™ Ultra 9 processor 288V iGPU-only", + "Model": "yolo11", + "featured_SKU": true, + "whats_new_model": true, + "PlatformType": "Intel® Core™, iGPU-only", "Parameters": { "throughput": { "Precisions": [ @@ -12962,7 +11947,7 @@ "int4": "", "int8": "", "fp16": "", - "fp32": "", + "fp32": 381.29, "bf16": "" } ], @@ -12973,8 +11958,8 @@ "Precisions": [ { "int4": "", - "int8": 2.33, - "fp16": 2.36, + "int8": "", + "fp16": "", "fp32": "", "bf16": "" } @@ -12985,18 +11970,53 @@ } }, { - "Platform": "Intel® Data Center GPU Flex 170 dGPU", + "Platform": "Intel® Core™ Ultra 9 processor 288V iGPU-only", "Model": "yolo_v8n", - "featured_SKU": "false", + "featured_SKU": true, "whats_new_model": false, - "PlatformType": "Accelerator Platforms", + "PlatformType": "Intel® Core™, iGPU-only", + "Parameters": { + "throughput": { + "Precisions": [ + { + "int4": "", + "int8": 395.69, + "fp16": 373.09, + "fp32": 372.46, + "bf16": "" + } + ], + "Unit": "FPS", + "UnitDesc": "higher is better" + }, + "latency": { + "Precisions": [ + { + "int4": "", + "int8": 3.12, + "fp16": "", + "fp32": "", + "bf16": "" + } + ], + "Unit": "ms", + "UnitDesc": "lower is better" + } + } + }, + { + "Platform": "Intel® Core™ i5-1235U Processor iGPU-only", + "Model": "bert-base-cased", + "featured_SKU": false, + "whats_new_model": false, + "PlatformType": "Intel® Core™, iGPU-only", "Parameters": { 
"throughput": { "Precisions": [ { "int4": "", - "int8": 759.93, - "fp16": 694.57, + "int8": 44.81, + "fp16": 37.44, "fp32": "", "bf16": "" } @@ -13008,7 +12028,7 @@ "Precisions": [ { "int4": "", - "int8": 1.96, + "int8": 19.86, "fp16": "", "fp32": "", "bf16": "" @@ -13020,19 +12040,54 @@ } }, { - "Platform": "Intel® Processor N100 CPU+iGPU", + "Platform": "Intel® Core™ i5-1235U Processor iGPU-only", "Model": "efficientdet-d0", "featured_SKU": false, "whats_new_model": false, - "PlatformType": "Intel® Atom™, CPU+iGPU", + "PlatformType": "Intel® Core™, iGPU-only", + "Parameters": { + "throughput": { + "Precisions": [ + { + "int4": "", + "int8": 65.88, + "fp16": 49.29, + "fp32": "", + "bf16": "" + } + ], + "Unit": "FPS", + "UnitDesc": "higher is better" + }, + "latency": { + "Precisions": [ + { + "int4": "", + "int8": 19.48, + "fp16": "", + "fp32": "", + "bf16": "" + } + ], + "Unit": "ms", + "UnitDesc": "lower is better" + } + } + }, + { + "Platform": "Intel® Core™ i5-1235U Processor iGPU-only", + "Model": "mask_rcnn_resnet50_atrous_coco", + "featured_SKU": false, + "whats_new_model": false, + "PlatformType": "Intel® Core™, iGPU-only", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 36.93, + "int8": 0.5, "fp16": "", - "fp32": 27.64, + "fp32": "", "bf16": "" } ], @@ -13055,19 +12110,19 @@ } }, { - "Platform": "Intel® Processor N100 CPU+iGPU", + "Platform": "Intel® Core™ i5-1235U Processor iGPU-only", "Model": "mobilenet-v2", "featured_SKU": false, "whats_new_model": false, - "PlatformType": "Intel® Atom™, CPU+iGPU", + "PlatformType": "Intel® Core™, iGPU-only", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 484.32, - "fp16": "", - "fp32": 278.4, + "int8": 749.72, + "fp16": 471.55, + "fp32": "", "bf16": "" } ], @@ -13078,7 +12133,7 @@ "Precisions": [ { "int4": "", - "int8": "", + "int8": 1.65, "fp16": "", "fp32": "", "bf16": "" @@ -13090,19 +12145,19 @@ } }, { - "Platform": "Intel® Processor N100 CPU+iGPU", + "Platform": "Intel® Core™ i5-1235U Processor iGPU-only", "Model": "resnet-50", "featured_SKU": false, "whats_new_model": false, - "PlatformType": "Intel® Atom™, CPU+iGPU", + "PlatformType": "Intel® Core™, iGPU-only", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 112.23, - "fp16": "", - "fp32": 42.14, + "int8": 202.55, + "fp16": 115.74, + "fp32": "", "bf16": "" } ], @@ -13113,7 +12168,7 @@ "Precisions": [ { "int4": "", - "int8": "", + "int8": 4.94, "fp16": "", "fp32": "", "bf16": "" @@ -13125,19 +12180,19 @@ } }, { - "Platform": "Intel® Processor N100 CPU+iGPU", + "Platform": "Intel® Core™ i5-1235U Processor iGPU-only", "Model": "ssd-resnet34-1200", "featured_SKU": false, "whats_new_model": false, - "PlatformType": "Intel® Atom™, CPU+iGPU", + "PlatformType": "Intel® Core™, iGPU-only", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 2.04, - "fp16": "", - "fp32": 0.6, + "int8": 5.54, + "fp16": 2.64, + "fp32": "", "bf16": "" } ], @@ -13148,7 +12203,7 @@ "Precisions": [ { "int4": "", - "int8": "", + "int8": 173.03, "fp16": "", "fp32": "", "bf16": "" @@ -13160,19 +12215,19 @@ } }, { - "Platform": "Intel® Processor N100 CPU+iGPU", + "Platform": "Intel® Core™ i5-1235U Processor iGPU-only", "Model": "ssd_mobilenet_v1_coco", "featured_SKU": false, "whats_new_model": false, - "PlatformType": "Intel® Atom™, CPU+iGPU", + "PlatformType": "Intel® Core™, iGPU-only", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 216.96, - "fp16": "", - "fp32": 94.92, + "int8": 372.94, + "fp16": 
217.42, + "fp32": "", "bf16": "" } ], @@ -13183,7 +12238,7 @@ "Precisions": [ { "int4": "", - "int8": "", + "int8": 3.1, "fp16": "", "fp32": "", "bf16": "" @@ -13195,19 +12250,19 @@ } }, { - "Platform": "Intel® Processor N100 CPU+iGPU", - "Model": "yolo11", + "Platform": "Intel® Core™ i5-1235U Processor iGPU-only", + "Model": "yolo_v8n", "featured_SKU": false, - "whats_new_model": "false", - "PlatformType": "Intel® Atom™, CPU+iGPU", + "whats_new_model": false, + "PlatformType": "Intel® Core™, iGPU-only", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": "", - "fp16": "", - "fp32": 34.52, + "int8": 122.89, + "fp16": 76.56, + "fp32": "", "bf16": "" } ], @@ -13218,7 +12273,7 @@ "Precisions": [ { "int4": "", - "int8": "", + "int8": 8.19, "fp16": "", "fp32": "", "bf16": "" @@ -13230,19 +12285,19 @@ } }, { - "Platform": "Intel® Processor N100 CPU+iGPU", - "Model": "yolo_v8n", + "Platform": "Intel® Core™ i5-1335U Processor iGPU-only", + "Model": "bert-base-cased", "featured_SKU": false, "whats_new_model": false, - "PlatformType": "Intel® Atom™, CPU+iGPU", + "PlatformType": "Intel® Core™, iGPU-only", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 61.06, - "fp16": "", - "fp32": 28.61, + "int8": 47.22, + "fp16": 39.62, + "fp32": "", "bf16": "" } ], @@ -13253,7 +12308,7 @@ "Precisions": [ { "int4": "", - "int8": "", + "int8": 18.47, "fp16": "", "fp32": "", "bf16": "" @@ -13265,19 +12320,19 @@ } }, { - "Platform": "Intel® Processor N100 CPU-only", + "Platform": "Intel® Core™ i5-1335U Processor iGPU-only", "Model": "efficientdet-d0", "featured_SKU": false, "whats_new_model": false, - "PlatformType": "Intel® Atom™, CPU-only", + "PlatformType": "Intel® Core™, iGPU-only", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 15.44, - "fp16": "", - "fp32": 12.75, + "int8": 80.29, + "fp16": 59.69, + "fp32": "", "bf16": "" } ], @@ -13288,7 +12343,7 @@ "Precisions": [ { "int4": "", - "int8": 66.23, + "int8": 14.57, "fp16": "", "fp32": "", "bf16": "" @@ -13300,19 +12355,19 @@ } }, { - "Platform": "Intel® Processor N100 CPU-only", - "Model": "mobilenet-v2", + "Platform": "Intel® Core™ i5-1335U Processor iGPU-only", + "Model": "mask_rcnn_resnet50_atrous_coco", "featured_SKU": false, "whats_new_model": false, - "PlatformType": "Intel® Atom™, CPU-only", + "PlatformType": "Intel® Core™, iGPU-only", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 296.53, + "int8": 0.53, "fp16": "", - "fp32": 183.3, + "fp32": "", "bf16": "" } ], @@ -13323,7 +12378,7 @@ "Precisions": [ { "int4": "", - "int8": 3.8, + "int8": 1510.69, "fp16": "", "fp32": "", "bf16": "" @@ -13335,19 +12390,19 @@ } }, { - "Platform": "Intel® Processor N100 CPU-only", - "Model": "resnet-50", + "Platform": "Intel® Core™ i5-1335U Processor iGPU-only", + "Model": "mobilenet-v2", "featured_SKU": false, "whats_new_model": false, - "PlatformType": "Intel® Atom™, CPU-only", + "PlatformType": "Intel® Core™, iGPU-only", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 48.77, - "fp16": "", - "fp32": 20.13, + "int8": 771.99, + "fp16": 512.35, + "fp32": "", "bf16": "" } ], @@ -13358,7 +12413,7 @@ "Precisions": [ { "int4": "", - "int8": 21.88, + "int8": 1.49, "fp16": "", "fp32": "", "bf16": "" @@ -13370,19 +12425,19 @@ } }, { - "Platform": "Intel® Processor N100 CPU-only", - "Model": "ssd-resnet34-1200", + "Platform": "Intel® Core™ i5-1335U Processor iGPU-only", + "Model": "resnet-50", "featured_SKU": false, "whats_new_model": false, - "PlatformType": "Intel® 
Atom™, CPU-only", + "PlatformType": "Intel® Core™, iGPU-only", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 0.82, - "fp16": "", - "fp32": 0.31, + "int8": 223.26, + "fp16": 127.1, + "fp32": "", "bf16": "" } ], @@ -13393,7 +12448,7 @@ "Precisions": [ { "int4": "", - "int8": 1224.62, + "int8": 4.32, "fp16": "", "fp32": "", "bf16": "" @@ -13405,19 +12460,19 @@ } }, { - "Platform": "Intel® Processor N100 CPU-only", - "Model": "ssd_mobilenet_v1_coco", + "Platform": "Intel® Core™ i5-1335U Processor iGPU-only", + "Model": "ssd-resnet34-1200", "featured_SKU": false, "whats_new_model": false, - "PlatformType": "Intel® Atom™, CPU-only", + "PlatformType": "Intel® Core™, iGPU-only", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 106.12, - "fp16": "", - "fp32": 49.52, + "int8": 5.8, + "fp16": 2.85, + "fp32": "", "bf16": "" } ], @@ -13428,7 +12483,7 @@ "Precisions": [ { "int4": "", - "int8": 9.72, + "int8": 144.65, "fp16": "", "fp32": "", "bf16": "" @@ -13440,19 +12495,19 @@ } }, { - "Platform": "Intel® Processor N100 CPU-only", - "Model": "yolo11", + "Platform": "Intel® Core™ i5-1335U Processor iGPU-only", + "Model": "ssd_mobilenet_v1_coco", "featured_SKU": false, - "whats_new_model": "false", - "PlatformType": "Intel® Atom™, CPU-only", + "whats_new_model": false, + "PlatformType": "Intel® Core™, iGPU-only", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": "", - "fp16": "", - "fp32": 15.36, + "int8": 407.72, + "fp16": 234.08, + "fp32": "", "bf16": "" } ], @@ -13463,7 +12518,7 @@ "Precisions": [ { "int4": "", - "int8": "", + "int8": 2.76, "fp16": "", "fp32": "", "bf16": "" @@ -13475,19 +12530,19 @@ } }, { - "Platform": "Intel® Processor N100 CPU-only", + "Platform": "Intel® Core™ i5-1335U Processor iGPU-only", "Model": "yolo_v8n", "featured_SKU": false, "whats_new_model": false, - "PlatformType": "Intel® Atom™, CPU-only", + "PlatformType": "Intel® Core™, iGPU-only", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 23.65, - "fp16": "", - "fp32": 12.86, + "int8": 130.6, + "fp16": 82.83, + "fp32": "", "bf16": "" } ], @@ -13498,7 +12553,7 @@ "Precisions": [ { "int4": "", - "int8": 43.43, + "int8": 7.1, "fp16": "", "fp32": "", "bf16": "" @@ -13510,18 +12565,18 @@ } }, { - "Platform": "Intel® Processor N100 iGPU-only", - "Model": "efficientdet-d0", + "Platform": "Intel® Core™ i7-1185G7 iGPU-only", + "Model": "bert-base-cased", "featured_SKU": false, "whats_new_model": false, - "PlatformType": "Intel® Atom™, iGPU-only", + "PlatformType": "Intel® Core™, iGPU-only", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 33.69, - "fp16": 30.91, + "int8": 68.08, + "fp16": 53.55, "fp32": "", "bf16": "" } @@ -13533,7 +12588,7 @@ "Precisions": [ { "int4": "", - "int8": 38.02, + "int8": 17.09, "fp16": "", "fp32": "", "bf16": "" @@ -13545,18 +12600,18 @@ } }, { - "Platform": "Intel® Processor N100 iGPU-only", - "Model": "mobilenet-v2", + "Platform": "Intel® Core™ i7-1185G7 iGPU-only", + "Model": "efficientdet-d0", "featured_SKU": false, "whats_new_model": false, - "PlatformType": "Intel® Atom™, iGPU-only", + "PlatformType": "Intel® Core™, iGPU-only", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 337.95, - "fp16": 267.38, + "int8": 91.72, + "fp16": 72.17, "fp32": "", "bf16": "" } @@ -13568,7 +12623,7 @@ "Precisions": [ { "int4": "", - "int8": 3.84, + "int8": 18.1, "fp16": "", "fp32": "", "bf16": "" @@ -13580,18 +12635,18 @@ } }, { - "Platform": "Intel® Processor N100 iGPU-only", - 
"Model": "resnet-50", + "Platform": "Intel® Core™ i7-1185G7 iGPU-only", + "Model": "mask_rcnn_resnet50_atrous_coco", "featured_SKU": false, "whats_new_model": false, - "PlatformType": "Intel® Atom™, iGPU-only", + "PlatformType": "Intel® Core™, iGPU-only", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 81.72, - "fp16": 49.76, + "int8": 0.82, + "fp16": "", "fp32": "", "bf16": "" } @@ -13603,7 +12658,7 @@ "Precisions": [ { "int4": "", - "int8": 13.15, + "int8": 1130.75, "fp16": "", "fp32": "", "bf16": "" @@ -13615,18 +12670,18 @@ } }, { - "Platform": "Intel® Processor N100 iGPU-only", - "Model": "ssd-resnet34-1200", + "Platform": "Intel® Core™ i7-1185G7 iGPU-only", + "Model": "mobilenet-v2", "featured_SKU": false, "whats_new_model": false, - "PlatformType": "Intel® Atom™, iGPU-only", + "PlatformType": "Intel® Core™, iGPU-only", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 1.62, - "fp16": 1.01, + "int8": 720.79, + "fp16": 566.9, "fp32": "", "bf16": "" } @@ -13638,7 +12693,7 @@ "Precisions": [ { "int4": "", - "int8": 622.97, + "int8": 2.08, "fp16": "", "fp32": "", "bf16": "" @@ -13650,18 +12705,18 @@ } }, { - "Platform": "Intel® Processor N100 iGPU-only", - "Model": "ssd_mobilenet_v1_coco", + "Platform": "Intel® Core™ i7-1185G7 iGPU-only", + "Model": "resnet-50", "featured_SKU": false, "whats_new_model": false, - "PlatformType": "Intel® Atom™, iGPU-only", + "PlatformType": "Intel® Core™, iGPU-only", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 164.31, - "fp16": 106.85, + "int8": 265.78, + "fp16": 174.38, "fp32": "", "bf16": "" } @@ -13673,7 +12728,7 @@ "Precisions": [ { "int4": "", - "int8": 7.35, + "int8": 4.84, "fp16": "", "fp32": "", "bf16": "" @@ -13685,18 +12740,18 @@ } }, { - "Platform": "Intel® Processor N100 iGPU-only", - "Model": "yolo_v8n", + "Platform": "Intel® Core™ i7-1185G7 iGPU-only", + "Model": "ssd-resnet34-1200", "featured_SKU": false, "whats_new_model": false, - "PlatformType": "Intel® Atom™, iGPU-only", + "PlatformType": "Intel® Core™, iGPU-only", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 47.04, - "fp16": 34.97, + "int8": 8.24, + "fp16": 4.65, "fp32": "", "bf16": "" } @@ -13708,7 +12763,7 @@ "Precisions": [ { "int4": "", - "int8": 23.03, + "int8": 118.74, "fp16": "", "fp32": "", "bf16": "" @@ -13720,19 +12775,19 @@ } }, { - "Platform": "Intel® Xeon® Gold 5218T CPU-only", - "Model": "bert-base-cased", + "Platform": "Intel® Core™ i7-1185G7 iGPU-only", + "Model": "ssd_mobilenet_v1_coco", "featured_SKU": false, "whats_new_model": false, - "PlatformType": "Intel® Xeon®, CPU-only", + "PlatformType": "Intel® Core™, iGPU-only", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 218.18, - "fp16": "", - "fp32": 80.36, + "int8": 455.85, + "fp16": 299.6, + "fp32": "", "bf16": "" } ], @@ -13743,7 +12798,7 @@ "Precisions": [ { "int4": "", - "int8": 14.4, + "int8": 3.33, "fp16": "", "fp32": "", "bf16": "" @@ -13755,19 +12810,19 @@ } }, { - "Platform": "Intel® Xeon® Gold 5218T CPU-only", - "Model": "efficientdet-d0", + "Platform": "Intel® Core™ i7-1185G7 iGPU-only", + "Model": "yolo_v8n", "featured_SKU": false, "whats_new_model": false, - "PlatformType": "Intel® Xeon®, CPU-only", + "PlatformType": "Intel® Core™, iGPU-only", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 271.94, - "fp16": "", - "fp32": 167.25, + "int8": 160.91, + "fp16": 111.69, + "fp32": "", "bf16": "" } ], @@ -13778,7 +12833,7 @@ "Precisions": [ { "int4": "", - "int8": 
11.07, + "int8": 8.16, "fp16": "", "fp32": "", "bf16": "" @@ -13790,19 +12845,19 @@ } }, { - "Platform": "Intel® Xeon® Gold 5218T CPU-only", - "Model": "mask_rcnn_resnet50_atrous_coco", + "Platform": "Intel® Core™ i7-1185GRE iGPU-only", + "Model": "bert-base-cased", "featured_SKU": false, "whats_new_model": false, - "PlatformType": "Intel® Xeon®, CPU-only", + "PlatformType": "Intel® Core™, iGPU-only", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 3.26, - "fp16": "", - "fp32": 0.9, + "int8": 47.65, + "fp16": 39.47, + "fp32": "", "bf16": "" } ], @@ -13813,7 +12868,7 @@ "Precisions": [ { "int4": "", - "int8": 637.88, + "int8": 21.58, "fp16": "", "fp32": "", "bf16": "" @@ -13825,19 +12880,19 @@ } }, { - "Platform": "Intel® Xeon® Gold 5218T CPU-only", - "Model": "mobilenet-v2", + "Platform": "Intel® Core™ i7-1185GRE iGPU-only", + "Model": "efficientdet-d0", "featured_SKU": false, "whats_new_model": false, - "PlatformType": "Intel® Xeon®, CPU-only", + "PlatformType": "Intel® Core™, iGPU-only", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 5417.98, - "fp16": "", - "fp32": 1926.0, + "int8": 56.45, + "fp16": 41.79, + "fp32": "", "bf16": "" } ], @@ -13848,7 +12903,7 @@ "Precisions": [ { "int4": "", - "int8": 1.45, + "int8": 23.58, "fp16": "", "fp32": "", "bf16": "" @@ -13860,19 +12915,19 @@ } }, { - "Platform": "Intel® Xeon® Gold 5218T CPU-only", - "Model": "resnet-50", + "Platform": "Intel® Core™ i7-1185GRE iGPU-only", + "Model": "mask_rcnn_resnet50_atrous_coco", "featured_SKU": false, "whats_new_model": false, - "PlatformType": "Intel® Xeon®, CPU-only", + "PlatformType": "Intel® Core™, iGPU-only", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 979.5, - "fp16": "", - "fp32": 267.16, + "int8": 0.54, + "fp16": 0.55, + "fp32": "", "bf16": "" } ], @@ -13883,7 +12938,7 @@ "Precisions": [ { "int4": "", - "int8": 3.06, + "int8": 1632.29, "fp16": "", "fp32": "", "bf16": "" @@ -13895,19 +12950,19 @@ } }, { - "Platform": "Intel® Xeon® Gold 5218T CPU-only", - "Model": "ssd-resnet34-1200", + "Platform": "Intel® Core™ i7-1185GRE iGPU-only", + "Model": "mobilenet-v2", "featured_SKU": false, "whats_new_model": false, - "PlatformType": "Intel® Xeon®, CPU-only", + "PlatformType": "Intel® Core™, iGPU-only", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 17.65, - "fp16": "", - "fp32": 4.58, + "int8": 625.04, + "fp16": 428.1, + "fp32": "", "bf16": "" } ], @@ -13918,7 +12973,7 @@ "Precisions": [ { "int4": "", - "int8": 116.19, + "int8": 1.79, "fp16": "", "fp32": "", "bf16": "" @@ -13930,19 +12985,19 @@ } }, { - "Platform": "Intel® Xeon® Gold 5218T CPU-only", - "Model": "ssd_mobilenet_v1_coco", + "Platform": "Intel® Core™ i7-1185GRE iGPU-only", + "Model": "resnet-50", "featured_SKU": false, "whats_new_model": false, - "PlatformType": "Intel® Xeon®, CPU-only", + "PlatformType": "Intel® Core™, iGPU-only", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 2104.85, - "fp16": "", - "fp32": 639.65, + "int8": 203.94, + "fp16": 116.2, + "fp32": "", "bf16": "" } ], @@ -13953,7 +13008,7 @@ "Precisions": [ { "int4": "", - "int8": 1.56, + "int8": 5.45, "fp16": "", "fp32": "", "bf16": "" @@ -13965,19 +13020,19 @@ } }, { - "Platform": "Intel® Xeon® Gold 5218T CPU-only", - "Model": "yolo11", + "Platform": "Intel® Core™ i7-1185GRE iGPU-only", + "Model": "ssd-resnet34-1200", "featured_SKU": false, - "whats_new_model": "false", - "PlatformType": "Intel® Xeon®, CPU-only", + "whats_new_model": false, + "PlatformType": 
"Intel® Core™, iGPU-only", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": "", - "fp16": "", - "fp32": 206.18, + "int8": 5.47, + "fp16": 3.15, + "fp32": "", "bf16": "" } ], @@ -13988,7 +13043,7 @@ "Precisions": [ { "int4": "", - "int8": "", + "int8": 169.33, "fp16": "", "fp32": "", "bf16": "" @@ -14000,19 +13055,19 @@ } }, { - "Platform": "Intel® Xeon® Gold 5218T CPU-only", - "Model": "yolo_v8n", + "Platform": "Intel® Core™ i7-1185GRE iGPU-only", + "Model": "ssd_mobilenet_v1_coco", "featured_SKU": false, "whats_new_model": false, - "PlatformType": "Intel® Xeon®, CPU-only", + "PlatformType": "Intel® Core™, iGPU-only", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 440.56, - "fp16": "", - "fp32": 173.57, + "int8": 363.85, + "fp16": 227.22, + "fp32": "", "bf16": "" } ], @@ -14023,7 +13078,7 @@ "Precisions": [ { "int4": "", - "int8": 5.93, + "int8": 3.43, "fp16": "", "fp32": "", "bf16": "" @@ -14035,19 +13090,19 @@ } }, { - "Platform": "Intel® Xeon® Gold 6238L CPU-only", - "Model": "bert-base-cased", + "Platform": "Intel® Core™ i7-1185GRE iGPU-only", + "Model": "yolo_v8n", "featured_SKU": false, "whats_new_model": false, - "PlatformType": "Intel® Xeon®, CPU-only", + "PlatformType": "Intel® Core™, iGPU-only", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 426.19, - "fp16": "", - "fp32": 162.63, + "int8": 112.45, + "fp16": 76.19, + "fp32": "", "bf16": "" } ], @@ -14058,7 +13113,7 @@ "Precisions": [ { "int4": "", - "int8": 11.09, + "int8": 9.34, "fp16": "", "fp32": "", "bf16": "" @@ -14070,19 +13125,19 @@ } }, { - "Platform": "Intel® Xeon® Gold 6238L CPU-only", - "Model": "efficientdet-d0", + "Platform": "Intel® Core™ i7-12700H iGPU-only", + "Model": "bert-base-cased", "featured_SKU": false, "whats_new_model": false, - "PlatformType": "Intel® Xeon®, CPU-only", + "PlatformType": "Intel® Core™, iGPU-only", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 411.51, - "fp16": "", - "fp32": 254.65, + "int8": 90.29, + "fp16": 70.31, + "fp32": "", "bf16": "" } ], @@ -14093,7 +13148,7 @@ "Precisions": [ { "int4": "", - "int8": 8.51, + "int8": 12.83, "fp16": "", "fp32": "", "bf16": "" @@ -14105,19 +13160,19 @@ } }, { - "Platform": "Intel® Xeon® Gold 6238L CPU-only", - "Model": "mask_rcnn_resnet50_atrous_coco", + "Platform": "Intel® Core™ i7-12700H iGPU-only", + "Model": "efficientdet-d0", "featured_SKU": false, "whats_new_model": false, - "PlatformType": "Intel® Xeon®, CPU-only", + "PlatformType": "Intel® Core™, iGPU-only", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 6.45, - "fp16": "", - "fp32": 1.65, + "int8": 128.75, + "fp16": 97.8, + "fp32": "", "bf16": "" } ], @@ -14128,7 +13183,7 @@ "Precisions": [ { "int4": "", - "int8": 321.85, + "int8": 12.86, "fp16": "", "fp32": "", "bf16": "" @@ -14140,19 +13195,19 @@ } }, { - "Platform": "Intel® Xeon® Gold 6238L CPU-only", - "Model": "mobilenet-v2", + "Platform": "Intel® Core™ i7-12700H iGPU-only", + "Model": "mask_rcnn_resnet50_atrous_coco", "featured_SKU": false, "whats_new_model": false, - "PlatformType": "Intel® Xeon®, CPU-only", + "PlatformType": "Intel® Core™, iGPU-only", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 10273.19, + "int8": 1.04, "fp16": "", - "fp32": 3342.96, + "fp32": "", "bf16": "" } ], @@ -14163,7 +13218,7 @@ "Precisions": [ { "int4": "", - "int8": 1.21, + "int8": 973.17, "fp16": "", "fp32": "", "bf16": "" @@ -14175,19 +13230,19 @@ } }, { - "Platform": "Intel® Xeon® Gold 6238L CPU-only", - 
"Model": "resnet-50", + "Platform": "Intel® Core™ i7-12700H iGPU-only", + "Model": "mobilenet-v2", "featured_SKU": false, "whats_new_model": false, - "PlatformType": "Intel® Xeon®, CPU-only", + "PlatformType": "Intel® Core™, iGPU-only", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 2125.81, - "fp16": "", - "fp32": 570.61, + "int8": 1284.15, + "fp16": 902.47, + "fp32": "", "bf16": "" } ], @@ -14198,7 +13253,7 @@ "Precisions": [ { "int4": "", - "int8": 1.84, + "int8": 1.11, "fp16": "", "fp32": "", "bf16": "" @@ -14210,19 +13265,19 @@ } }, { - "Platform": "Intel® Xeon® Gold 6238L CPU-only", - "Model": "ssd-resnet34-1200", + "Platform": "Intel® Core™ i7-12700H iGPU-only", + "Model": "resnet-50", "featured_SKU": false, "whats_new_model": false, - "PlatformType": "Intel® Xeon®, CPU-only", + "PlatformType": "Intel® Core™, iGPU-only", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 41.83, - "fp16": "", - "fp32": 10.91, + "int8": 384.22, + "fp16": 227.94, + "fp32": "", "bf16": "" } ], @@ -14233,7 +13288,7 @@ "Precisions": [ { "int4": "", - "int8": 49.53, + "int8": 3.29, "fp16": "", "fp32": "", "bf16": "" @@ -14245,19 +13300,19 @@ } }, { - "Platform": "Intel® Xeon® Gold 6238L CPU-only", - "Model": "ssd_mobilenet_v1_coco", + "Platform": "Intel® Core™ i7-12700H iGPU-only", + "Model": "ssd-resnet34-1200", "featured_SKU": false, "whats_new_model": false, - "PlatformType": "Intel® Xeon®, CPU-only", + "PlatformType": "Intel® Core™, iGPU-only", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 4376.71, - "fp16": "", - "fp32": 1244.57, + "int8": 10.48, + "fp16": 6.14, + "fp32": "", "bf16": "" } ], @@ -14268,7 +13323,7 @@ "Precisions": [ { "int4": "", - "int8": 1.22, + "int8": 100.2, "fp16": "", "fp32": "", "bf16": "" @@ -14280,19 +13335,19 @@ } }, { - "Platform": "Intel® Xeon® Gold 6238L CPU-only", - "Model": "yolo11", + "Platform": "Intel® Core™ i7-12700H iGPU-only", + "Model": "ssd_mobilenet_v1_coco", "featured_SKU": false, - "whats_new_model": "false", - "PlatformType": "Intel® Xeon®, CPU-only", + "whats_new_model": false, + "PlatformType": "Intel® Core™, iGPU-only", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": "", - "fp16": "", - "fp32": 383.86, + "int8": 744.28, + "fp16": 414.98, + "fp32": "", "bf16": "" } ], @@ -14303,7 +13358,7 @@ "Precisions": [ { "int4": "", - "int8": "", + "int8": 1.88, "fp16": "", "fp32": "", "bf16": "" @@ -14315,19 +13370,19 @@ } }, { - "Platform": "Intel® Xeon® Gold 6238L CPU-only", + "Platform": "Intel® Core™ i7-12700H iGPU-only", "Model": "yolo_v8n", "featured_SKU": false, "whats_new_model": false, - "PlatformType": "Intel® Xeon®, CPU-only", + "PlatformType": "Intel® Core™, iGPU-only", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 749.14, - "fp16": "", - "fp32": 338.04, + "int8": 217.24, + "fp16": 148.8, + "fp32": "", "bf16": "" } ], @@ -14338,7 +13393,7 @@ "Precisions": [ { "int4": "", - "int8": 4.21, + "int8": 5.62, "fp16": "", "fp32": "", "bf16": "" @@ -14350,19 +13405,19 @@ } }, { - "Platform": "Intel® Xeon® Gold 6338N CPU-only", + "Platform": "Intel® Core™ i7-1355U Processor iGPU-only", "Model": "bert-base-cased", "featured_SKU": false, "whats_new_model": false, - "PlatformType": "Intel® Xeon®, CPU-only", + "PlatformType": "Intel® Core™, iGPU-only", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 622.71, - "fp16": "", - "fp32": 240.52, + "int8": 66.74, + "fp16": 52.5, + "fp32": "", "bf16": "" } ], @@ -14373,7 +13428,7 @@ 
"Precisions": [ { "int4": "", - "int8": 6.4, + "int8": 14.42, "fp16": "", "fp32": "", "bf16": "" @@ -14385,19 +13440,19 @@ } }, { - "Platform": "Intel® Xeon® Gold 6338N CPU-only", + "Platform": "Intel® Core™ i7-1355U Processor iGPU-only", "Model": "efficientdet-d0", "featured_SKU": false, "whats_new_model": false, - "PlatformType": "Intel® Xeon®, CPU-only", + "PlatformType": "Intel® Core™, iGPU-only", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 721.9, - "fp16": "", - "fp32": 423.3, + "int8": 98.48, + "fp16": 73.51, + "fp32": "", "bf16": "" } ], @@ -14408,7 +13463,7 @@ "Precisions": [ { "int4": "", - "int8": 4.83, + "int8": 13.42, "fp16": "", "fp32": "", "bf16": "" @@ -14420,30 +13475,30 @@ } }, { - "Platform": "Intel® Xeon® Gold 6338N CPU-only", - "Model": "mask_rcnn_resnet50_atrous_coco", + "Platform": "Intel® Core™ i7-1355U Processor iGPU-only", + "Model": "gemma-2-9b", "featured_SKU": false, - "whats_new_model": false, - "PlatformType": "Intel® Xeon®, CPU-only", + "whats_new_model": true, + "PlatformType": "Intel® Core™, iGPU-only", "Parameters": { "throughput": { "Precisions": [ { - "int4": "", - "int8": 10.46, + "int4": 6.2, + "int8": 3.87, "fp16": "", - "fp32": 2.45, + "fp32": "", "bf16": "" } ], - "Unit": "FPS", + "Unit": "Tokens per sec", "UnitDesc": "higher is better" }, "latency": { "Precisions": [ { - "int4": "", - "int8": 221.46, + "int4": 161.18, + "int8": 258.1, "fp16": "", "fp32": "", "bf16": "" @@ -14455,30 +13510,30 @@ } }, { - "Platform": "Intel® Xeon® Gold 6338N CPU-only", - "Model": "mobilenet-v2", + "Platform": "Intel® Core™ i7-1355U Processor iGPU-only", + "Model": "glm-4-9b-chat", "featured_SKU": false, - "whats_new_model": false, - "PlatformType": "Intel® Xeon®, CPU-only", + "whats_new_model": "false", + "PlatformType": "Intel® Core™, iGPU-only", "Parameters": { "throughput": { "Precisions": [ { - "int4": "", - "int8": 16509.95, + "int4": 7.35, + "int8": 4.32, "fp16": "", - "fp32": 5201.56, + "fp32": "", "bf16": "" } ], - "Unit": "FPS", + "Unit": "Tokens per sec", "UnitDesc": "higher is better" }, "latency": { "Precisions": [ { - "int4": "", - "int8": 0.59, + "int4": 135.94, + "int8": 231.26, "fp16": "", "fp32": "", "bf16": "" @@ -14490,30 +13545,30 @@ } }, { - "Platform": "Intel® Xeon® Gold 6338N CPU-only", - "Model": "resnet-50", + "Platform": "Intel® Core™ i7-1355U Processor iGPU-only", + "Model": "llama-2-7b-chat", "featured_SKU": false, "whats_new_model": false, - "PlatformType": "Intel® Xeon®, CPU-only", + "PlatformType": "Intel® Core™, iGPU-only", "Parameters": { "throughput": { "Precisions": [ { - "int4": "", - "int8": 3352.09, + "int4": 8.49, + "int8": 5.7, "fp16": "", - "fp32": 825.5, + "fp32": "", "bf16": "" } ], - "Unit": "FPS", + "Unit": "Tokens per sec", "UnitDesc": "higher is better" }, "latency": { "Precisions": [ { - "int4": "", - "int8": 1.34, + "int4": 117.65, + "int8": 175.21, "fp16": "", "fp32": "", "bf16": "" @@ -14525,30 +13580,30 @@ } }, { - "Platform": "Intel® Xeon® Gold 6338N CPU-only", - "Model": "ssd-resnet34-1200", + "Platform": "Intel® Core™ i7-1355U Processor iGPU-only", + "Model": "llama-3-8b", "featured_SKU": false, - "whats_new_model": false, - "PlatformType": "Intel® Xeon®, CPU-only", + "whats_new_model": "false", + "PlatformType": "Intel® Core™, iGPU-only", "Parameters": { "throughput": { "Precisions": [ { - "int4": "", - "int8": 60.91, + "int4": 8.67, + "int8": 5.1, "fp16": "", - "fp32": 15.11, + "fp32": "", "bf16": "" } ], - "Unit": "FPS", + "Unit": "Tokens per sec", "UnitDesc": "higher is better" }, 
"latency": { "Precisions": [ { - "int4": "", - "int8": 36.91, + "int4": 115.26, + "int8": 195.88, "fp16": "", "fp32": "", "bf16": "" @@ -14560,31 +13615,31 @@ } }, { - "Platform": "Intel® Xeon® Gold 6338N CPU-only", - "Model": "ssd_mobilenet_v1_coco", + "Platform": "Intel® Core™ i7-1355U Processor iGPU-only", + "Model": "llama-3.2-3b-instruct", "featured_SKU": false, - "whats_new_model": false, - "PlatformType": "Intel® Xeon®, CPU-only", + "whats_new_model": true, + "PlatformType": "Intel® Core™, iGPU-only", "Parameters": { "throughput": { "Precisions": [ { - "int4": "", - "int8": 6975.09, - "fp16": "", - "fp32": 1755.62, + "int4": 16.31, + "int8": 11.96, + "fp16": 6.49, + "fp32": "", "bf16": "" } ], - "Unit": "FPS", + "Unit": "Tokens per sec", "UnitDesc": "higher is better" }, "latency": { "Precisions": [ { - "int4": "", - "int8": 0.77, - "fp16": "", + "int4": 61.29, + "int8": 83.56, + "fp16": 153.99, "fp32": "", "bf16": "" } @@ -14595,19 +13650,19 @@ } }, { - "Platform": "Intel® Xeon® Gold 6338N CPU-only", - "Model": "yolo11", + "Platform": "Intel® Core™ i7-1355U Processor iGPU-only", + "Model": "mask_rcnn_resnet50_atrous_coco", "featured_SKU": false, - "whats_new_model": "false", - "PlatformType": "Intel® Xeon®, CPU-only", + "whats_new_model": false, + "PlatformType": "Intel® Core™, iGPU-only", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": "", + "int8": 0.73, "fp16": "", - "fp32": 571.3, + "fp32": "", "bf16": "" } ], @@ -14618,7 +13673,7 @@ "Precisions": [ { "int4": "", - "int8": "", + "int8": 1197.61, "fp16": "", "fp32": "", "bf16": "" @@ -14630,30 +13685,30 @@ } }, { - "Platform": "Intel® Xeon® Gold 6338N CPU-only", - "Model": "yolo_v8n", + "Platform": "Intel® Core™ i7-1355U Processor iGPU-only", + "Model": "mistral-7b-v0.1", "featured_SKU": false, "whats_new_model": false, - "PlatformType": "Intel® Xeon®, CPU-only", + "PlatformType": "Intel® Core™, iGPU-only", "Parameters": { "throughput": { "Precisions": [ { - "int4": "", - "int8": 1224.86, + "int4": 8.95, + "int8": 5.55, "fp16": "", - "fp32": 495.73, + "fp32": "", "bf16": "" } ], - "Unit": "FPS", + "Unit": "Tokens per sec", "UnitDesc": "higher is better" }, "latency": { "Precisions": [ { - "int4": "", - "int8": 2.98, + "int4": 111.61, + "int8": 180.11, "fp16": "", "fp32": "", "bf16": "" @@ -14665,19 +13720,19 @@ } }, { - "Platform": "Intel® Xeon® Platinum 8280 CPU-only", - "Model": "bert-base-cased", + "Platform": "Intel® Core™ i7-1355U Processor iGPU-only", + "Model": "mobilenet-v2", "featured_SKU": false, "whats_new_model": false, - "PlatformType": "Intel® Xeon®, CPU-only", + "PlatformType": "Intel® Core™, iGPU-only", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 587.54, - "fp16": "", - "fp32": 225.64, + "int8": 874.69, + "fp16": 627.62, + "fp32": "", "bf16": "" } ], @@ -14688,7 +13743,7 @@ "Precisions": [ { "int4": "", - "int8": 9.18, + "int8": 1.37, "fp16": "", "fp32": "", "bf16": "" @@ -14700,31 +13755,31 @@ } }, { - "Platform": "Intel® Xeon® Platinum 8280 CPU-only", - "Model": "efficientdet-d0", + "Platform": "Intel® Core™ i7-1355U Processor iGPU-only", + "Model": "phi-3-mini-4k-instruct", "featured_SKU": false, - "whats_new_model": false, - "PlatformType": "Intel® Xeon®, CPU-only", + "whats_new_model": true, + "PlatformType": "Intel® Core™, iGPU-only", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 580.8, - "fp16": "", - "fp32": 343.39, + "int8": 9.98, + "fp16": 5.34, + "fp32": "", "bf16": "" } ], - "Unit": "FPS", + "Unit": "Tokens per sec", 
"UnitDesc": "higher is better" }, "latency": { "Precisions": [ { "int4": "", - "int8": 6.9, - "fp16": "", + "int8": 100.15, + "fp16": 187.11, "fp32": "", "bf16": "" } @@ -14735,30 +13790,30 @@ } }, { - "Platform": "Intel® Xeon® Platinum 8280 CPU-only", - "Model": "mask_rcnn_resnet50_atrous_coco", + "Platform": "Intel® Core™ i7-1355U Processor iGPU-only", + "Model": "qwen2-7b", "featured_SKU": false, - "whats_new_model": false, - "PlatformType": "Intel® Xeon®, CPU-only", + "whats_new_model": true, + "PlatformType": "Intel® Core™, iGPU-only", "Parameters": { "throughput": { "Precisions": [ { - "int4": "", - "int8": 8.58, + "int4": 9.11, + "int8": 5.41, "fp16": "", - "fp32": 2.26, + "fp32": "", "bf16": "" } ], - "Unit": "FPS", + "Unit": "Tokens per sec", "UnitDesc": "higher is better" }, "latency": { "Precisions": [ { - "int4": "", - "int8": 248.72, + "int4": 109.7, + "int8": 184.76, "fp16": "", "fp32": "", "bf16": "" @@ -14770,19 +13825,19 @@ } }, { - "Platform": "Intel® Xeon® Platinum 8280 CPU-only", - "Model": "mobilenet-v2", + "Platform": "Intel® Core™ i7-1355U Processor iGPU-only", + "Model": "resnet-50", "featured_SKU": false, "whats_new_model": false, - "PlatformType": "Intel® Xeon®, CPU-only", + "PlatformType": "Intel® Core™, iGPU-only", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 14930.31, - "fp16": "", - "fp32": 4646.16, + "int8": 276.72, + "fp16": 163.88, + "fp32": "", "bf16": "" } ], @@ -14793,7 +13848,7 @@ "Precisions": [ { "int4": "", - "int8": 0.93, + "int8": 3.86, "fp16": "", "fp32": "", "bf16": "" @@ -14805,19 +13860,19 @@ } }, { - "Platform": "Intel® Xeon® Platinum 8280 CPU-only", - "Model": "resnet-50", + "Platform": "Intel® Core™ i7-1355U Processor iGPU-only", + "Model": "ssd-resnet34-1200", "featured_SKU": false, "whats_new_model": false, - "PlatformType": "Intel® Xeon®, CPU-only", + "PlatformType": "Intel® Core™, iGPU-only", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 2965.31, - "fp16": "", - "fp32": 761.01, + "int8": 7.08, + "fp16": 3.98, + "fp32": "", "bf16": "" } ], @@ -14828,7 +13883,7 @@ "Precisions": [ { "int4": "", - "int8": 1.59, + "int8": 127.65, "fp16": "", "fp32": "", "bf16": "" @@ -14840,19 +13895,19 @@ } }, { - "Platform": "Intel® Xeon® Platinum 8280 CPU-only", - "Model": "ssd-resnet34-1200", + "Platform": "Intel® Core™ i7-1355U Processor iGPU-only", + "Model": "ssd_mobilenet_v1_coco", "featured_SKU": false, "whats_new_model": false, - "PlatformType": "Intel® Xeon®, CPU-only", + "PlatformType": "Intel® Core™, iGPU-only", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 58.15, - "fp16": "", - "fp32": 15.0, + "int8": 484.68, + "fp16": 298.46, + "fp32": "", "bf16": "" } ], @@ -14863,7 +13918,7 @@ "Precisions": [ { "int4": "", - "int8": 37.18, + "int8": 2.46, "fp16": "", "fp32": "", "bf16": "" @@ -14875,19 +13930,19 @@ } }, { - "Platform": "Intel® Xeon® Platinum 8280 CPU-only", - "Model": "ssd_mobilenet_v1_coco", + "Platform": "Intel® Core™ i7-1355U Processor iGPU-only", + "Model": "yolo_v8n", "featured_SKU": false, "whats_new_model": false, - "PlatformType": "Intel® Xeon®, CPU-only", + "PlatformType": "Intel® Core™, iGPU-only", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 6130.48, - "fp16": "", - "fp32": 1654.84, + "int8": 163.2, + "fp16": 106.34, + "fp32": "", "bf16": "" } ], @@ -14898,7 +13953,7 @@ "Precisions": [ { "int4": "", - "int8": 1.2, + "int8": 6.36, "fp16": "", "fp32": "", "bf16": "" @@ -14910,19 +13965,19 @@ } }, { - "Platform": "Intel® Xeon® 
Platinum 8280 CPU-only", - "Model": "yolo11", + "Platform": "Intel® Data Center GPU Flex 140 dGPU", + "Model": "efficientdet-d0", "featured_SKU": false, - "whats_new_model": "false", - "PlatformType": "Intel® Xeon®, CPU-only", + "whats_new_model": false, + "PlatformType": "Accelerator Platforms", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": "", - "fp16": "", - "fp32": 512.57, + "int8": 146.34, + "fp16": 116.04, + "fp32": "", "bf16": "" } ], @@ -14933,7 +13988,7 @@ "Precisions": [ { "int4": "", - "int8": "", + "int8": 9.88, "fp16": "", "fp32": "", "bf16": "" @@ -14945,30 +14000,30 @@ } }, { - "Platform": "Intel® Xeon® Platinum 8280 CPU-only", - "Model": "yolo_v8n", + "Platform": "Intel® Data Center GPU Flex 140 dGPU", + "Model": "llama-3-8b", "featured_SKU": false, - "whats_new_model": false, - "PlatformType": "Intel® Xeon®, CPU-only", + "whats_new_model": "false", + "PlatformType": "Accelerator Platforms", "Parameters": { "throughput": { "Precisions": [ { - "int4": "", - "int8": 996.59, + "int4": 10.75, + "int8": "", "fp16": "", - "fp32": 452.05, + "fp32": "", "bf16": "" } ], - "Unit": "FPS", + "Unit": "Tokens per sec", "UnitDesc": "higher is better" }, "latency": { "Precisions": [ { - "int4": "", - "int8": 3.6, + "int4": 93.02, + "int8": "", "fp16": "", "fp32": "", "bf16": "" @@ -14980,30 +14035,30 @@ } }, { - "Platform": "Intel® Xeon® Platinum 8380 CPU-only", - "Model": "bert-base-cased", + "Platform": "Intel® Data Center GPU Flex 140 dGPU", + "Model": "llama-3.2-3b-instruct", "featured_SKU": false, - "whats_new_model": false, - "PlatformType": "Intel® Xeon®, CPU-only", + "whats_new_model": true, + "PlatformType": "Accelerator Platforms", "Parameters": { "throughput": { "Precisions": [ { - "int4": "", - "int8": 881.04, + "int4": 22.54, + "int8": 16.7, "fp16": "", - "fp32": 338.12, + "fp32": "", "bf16": "" } ], - "Unit": "FPS", + "Unit": "Tokens per sec", "UnitDesc": "higher is better" }, "latency": { "Precisions": [ { - "int4": "", - "int8": 5.18, + "int4": 44.35, + "int8": 59.85, "fp16": "", "fp32": "", "bf16": "" @@ -15015,19 +14070,19 @@ } }, { - "Platform": "Intel® Xeon® Platinum 8380 CPU-only", - "Model": "efficientdet-d0", + "Platform": "Intel® Data Center GPU Flex 140 dGPU", + "Model": "mask_rcnn_resnet50_atrous_coco", "featured_SKU": false, "whats_new_model": false, - "PlatformType": "Intel® Xeon®, CPU-only", + "PlatformType": "Accelerator Platforms", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 1009.71, + "int8": 5.93, "fp16": "", - "fp32": 562.38, + "fp32": "", "bf16": "" } ], @@ -15038,7 +14093,7 @@ "Precisions": [ { "int4": "", - "int8": 4.28, + "int8": 189.11, "fp16": "", "fp32": "", "bf16": "" @@ -15050,31 +14105,31 @@ } }, { - "Platform": "Intel® Xeon® Platinum 8380 CPU-only", - "Model": "gemma-2-9b", + "Platform": "Intel® Data Center GPU Flex 140 dGPU", + "Model": "mistral-7b-v0.1", "featured_SKU": false, - "whats_new_model": true, - "PlatformType": "Intel® Xeon®, CPU-only", + "whats_new_model": false, + "PlatformType": "Accelerator Platforms", "Parameters": { "throughput": { "Precisions": [ { - "int4": 20.78, - "int8": 14.18, - "fp16": 7.72, + "int4": 12.03, + "int8": "", + "fp16": "", "fp32": "", "bf16": "" } ], - "Unit": "FPS", + "Unit": "Tokens per sec", "UnitDesc": "higher is better" }, "latency": { "Precisions": [ { - "int4": 48.12, - "int8": 70.5, - "fp16": 129.51, + "int4": 83.06, + "int8": "", + "fp16": "", "fp32": "", "bf16": "" } @@ -15085,31 +14140,31 @@ } }, { - "Platform": "Intel® Xeon® Platinum 8380 
CPU-only", - "Model": "glm-4-9b-chat", + "Platform": "Intel® Data Center GPU Flex 140 dGPU", + "Model": "phi-3-mini-4k-instruct", "featured_SKU": false, - "whats_new_model": "false", - "PlatformType": "Intel® Xeon®, CPU-only", + "whats_new_model": true, + "PlatformType": "Accelerator Platforms", "Parameters": { "throughput": { "Precisions": [ { - "int4": 22.79, - "int8": 15.56, - "fp16": 8.48, + "int4": 20.08, + "int8": 13.67, + "fp16": "", "fp32": "", "bf16": "" } ], - "Unit": "FPS", + "Unit": "Tokens per sec", "UnitDesc": "higher is better" }, "latency": { "Precisions": [ { - "int4": 43.86, - "int8": 64.26, - "fp16": 117.92, + "int4": 49.78, + "int8": 73.11, + "fp16": "", "fp32": "", "bf16": "" } @@ -15120,31 +14175,31 @@ } }, { - "Platform": "Intel® Xeon® Platinum 8380 CPU-only", - "Model": "llama-2-7b-chat", + "Platform": "Intel® Data Center GPU Flex 140 dGPU", + "Model": "qwen2-7b", "featured_SKU": false, - "whats_new_model": false, - "PlatformType": "Intel® Xeon®, CPU-only", + "whats_new_model": true, + "PlatformType": "Accelerator Platforms", "Parameters": { "throughput": { "Precisions": [ { - "int4": 25.41, - "int8": 18.68, - "fp16": 10.61, + "int4": 11.39, + "int8": "", + "fp16": "", "fp32": "", "bf16": "" } ], - "Unit": "FPS", + "Unit": "Tokens per sec", "UnitDesc": "higher is better" }, "latency": { "Precisions": [ { - "int4": 39.34, - "int8": 53.51, - "fp16": 94.17, + "int4": 87.76, + "int8": "", + "fp16": "", "fp32": "", "bf16": "" } @@ -15155,18 +14210,18 @@ } }, { - "Platform": "Intel® Xeon® Platinum 8380 CPU-only", - "Model": "llama-3-8b", + "Platform": "Intel® Data Center GPU Flex 140 dGPU", + "Model": "resnet-50", "featured_SKU": false, - "whats_new_model": "false", - "PlatformType": "Intel® Xeon®, CPU-only", + "whats_new_model": false, + "PlatformType": "Accelerator Platforms", "Parameters": { "throughput": { "Precisions": [ { - "int4": 26.07, - "int8": 17.66, - "fp16": 9.72, + "int4": "", + "int8": "", + "fp16": 504.64, "fp32": "", "bf16": "" } @@ -15177,9 +14232,9 @@ "latency": { "Precisions": [ { - "int4": 38.35, - "int8": 56.62, - "fp16": 102.88, + "int4": "", + "int8": "", + "fp16": "", "fp32": "", "bf16": "" } @@ -15190,18 +14245,18 @@ } }, { - "Platform": "Intel® Xeon® Platinum 8380 CPU-only", - "Model": "llama-3.2-3b-instruct", + "Platform": "Intel® Data Center GPU Flex 140 dGPU", + "Model": "ssd_mobilenet_v1_coco", "featured_SKU": false, - "whats_new_model": true, - "PlatformType": "Intel® Xeon®, CPU-only", + "whats_new_model": false, + "PlatformType": "Accelerator Platforms", "Parameters": { "throughput": { "Precisions": [ { - "int4": 46.81, - "int8": 33.54, - "fp16": 19.32, + "int4": "", + "int8": 983.44, + "fp16": 762.41, "fp32": "", "bf16": "" } @@ -15212,9 +14267,9 @@ "latency": { "Precisions": [ { - "int4": 21.36, - "int8": 29.81, - "fp16": 51.74, + "int4": "", + "int8": "", + "fp16": "", "fp32": "", "bf16": "" } @@ -15225,19 +14280,19 @@ } }, { - "Platform": "Intel® Xeon® Platinum 8380 CPU-only", - "Model": "mask_rcnn_resnet50_atrous_coco", - "featured_SKU": false, + "Platform": "Intel® Data Center GPU Flex 170 dGPU", + "Model": "bert-base-cased", + "featured_SKU": "false", "whats_new_model": false, - "PlatformType": "Intel® Xeon®, CPU-only", + "PlatformType": "Accelerator Platforms", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 14.73, - "fp16": "", - "fp32": 3.42, + "int8": 385.63, + "fp16": 437.6, + "fp32": "", "bf16": "" } ], @@ -15248,7 +14303,7 @@ "Precisions": [ { "int4": "", - "int8": "", + "int8": 3.0, "fp16": "", 
"fp32": "", "bf16": "" @@ -15260,18 +14315,18 @@ } }, { - "Platform": "Intel® Xeon® Platinum 8380 CPU-only", - "Model": "mistral-7b-v0.1", - "featured_SKU": false, + "Platform": "Intel® Data Center GPU Flex 170 dGPU", + "Model": "efficientdet-d0", + "featured_SKU": "false", "whats_new_model": false, - "PlatformType": "Intel® Xeon®, CPU-only", + "PlatformType": "Accelerator Platforms", "Parameters": { "throughput": { "Precisions": [ { - "int4": 26.89, - "int8": 18.54, - "fp16": 10.22, + "int4": "", + "int8": 425.95, + "fp16": 365.18, "fp32": "", "bf16": "" } @@ -15282,9 +14337,9 @@ "latency": { "Precisions": [ { - "int4": 37.18, - "int8": 53.93, - "fp16": 97.8, + "int4": "", + "int8": 2.79, + "fp16": "", "fp32": "", "bf16": "" } @@ -15295,30 +14350,30 @@ } }, { - "Platform": "Intel® Xeon® Platinum 8380 CPU-only", - "Model": "mobilenet-v2", - "featured_SKU": false, - "whats_new_model": false, - "PlatformType": "Intel® Xeon®, CPU-only", + "Platform": "Intel® Data Center GPU Flex 170 dGPU", + "Model": "gemma-2-9b", + "featured_SKU": "false", + "whats_new_model": true, + "PlatformType": "Accelerator Platforms", "Parameters": { "throughput": { "Precisions": [ { - "int4": "", - "int8": 22703.47, + "int4": 22.54, + "int8": 18.33, "fp16": "", - "fp32": 6937.71, + "fp32": "", "bf16": "" } ], - "Unit": "FPS", + "Unit": "Tokens per sec", "UnitDesc": "higher is better" }, "latency": { "Precisions": [ { - "int4": "", - "int8": 0.58, + "int4": 44.35, + "int8": 54.53, "fp16": "", "fp32": "", "bf16": "" @@ -15330,31 +14385,31 @@ } }, { - "Platform": "Intel® Xeon® Platinum 8380 CPU-only", - "Model": "phi-3-mini-4k-instruct", - "featured_SKU": false, - "whats_new_model": true, - "PlatformType": "Intel® Xeon®, CPU-only", + "Platform": "Intel® Data Center GPU Flex 170 dGPU", + "Model": "glm-4-9b-chat", + "featured_SKU": "false", + "whats_new_model": "false", + "PlatformType": "Accelerator Platforms", "Parameters": { "throughput": { "Precisions": [ { - "int4": 39.41, - "int8": 29.28, - "fp16": 17.35, + "int4": 39.46, + "int8": 26.75, + "fp16": "", "fp32": "", "bf16": "" } ], - "Unit": "FPS", + "Unit": "Tokens per sec", "UnitDesc": "higher is better" }, "latency": { "Precisions": [ { - "int4": 25.37, - "int8": 34.15, - "fp16": 57.61, + "int4": 25.34, + "int8": 37.37, + "fp16": "", "fp32": "", "bf16": "" } @@ -15365,31 +14420,31 @@ } }, { - "Platform": "Intel® Xeon® Platinum 8380 CPU-only", - "Model": "qwen2-7b", - "featured_SKU": false, - "whats_new_model": true, - "PlatformType": "Intel® Xeon®, CPU-only", + "Platform": "Intel® Data Center GPU Flex 170 dGPU", + "Model": "llama-2-7b-chat", + "featured_SKU": "false", + "whats_new_model": false, + "PlatformType": "Accelerator Platforms", "Parameters": { "throughput": { "Precisions": [ { - "int4": 28.26, - "int8": 19.32, - "fp16": 10.27, + "int4": 44.66, + "int8": 33.45, + "fp16": "", "fp32": "", "bf16": "" } ], - "Unit": "FPS", + "Unit": "Tokens per sec", "UnitDesc": "higher is better" }, "latency": { "Precisions": [ { - "int4": 35.38, - "int8": 51.74, - "fp16": 97.35, + "int4": 22.39, + "int8": 29.89, + "fp16": "", "fp32": "", "bf16": "" } @@ -15400,30 +14455,30 @@ } }, { - "Platform": "Intel® Xeon® Platinum 8380 CPU-only", - "Model": "resnet-50", - "featured_SKU": false, - "whats_new_model": false, - "PlatformType": "Intel® Xeon®, CPU-only", + "Platform": "Intel® Data Center GPU Flex 170 dGPU", + "Model": "llama-3-8b", + "featured_SKU": "false", + "whats_new_model": "false", + "PlatformType": "Accelerator Platforms", "Parameters": { "throughput": { "Precisions": 
[ { - "int4": "", - "int8": 4874.95, + "int4": 44.86, + "int8": 30.51, "fp16": "", - "fp32": 1144.73, + "fp32": "", "bf16": "" } ], - "Unit": "FPS", + "Unit": "Tokens per sec", "UnitDesc": "higher is better" }, "latency": { "Precisions": [ { - "int4": "", - "int8": 1.07, + "int4": 22.29, + "int8": 32.77, "fp16": "", "fp32": "", "bf16": "" @@ -15435,31 +14490,31 @@ } }, { - "Platform": "Intel® Xeon® Platinum 8380 CPU-only", - "Model": "ssd-resnet34-1200", - "featured_SKU": false, - "whats_new_model": false, - "PlatformType": "Intel® Xeon®, CPU-only", + "Platform": "Intel® Data Center GPU Flex 170 dGPU", + "Model": "llama-3.2-3b-instruct", + "featured_SKU": "false", + "whats_new_model": true, + "PlatformType": "Accelerator Platforms", "Parameters": { "throughput": { "Precisions": [ { - "int4": "", - "int8": 84.6, - "fp16": "", - "fp32": 20.95, + "int4": 71.27, + "int8": 57.11, + "fp16": 36.83, + "fp32": "", "bf16": "" } ], - "Unit": "FPS", + "Unit": "Tokens per sec", "UnitDesc": "higher is better" }, "latency": { "Precisions": [ { - "int4": "", - "int8": "", - "fp16": "", + "int4": 14.03, + "int8": 17.51, + "fp16": 27.15, "fp32": "", "bf16": "" } @@ -15470,19 +14525,19 @@ } }, { - "Platform": "Intel® Xeon® Platinum 8380 CPU-only", - "Model": "ssd_mobilenet_v1_coco", - "featured_SKU": false, + "Platform": "Intel® Data Center GPU Flex 170 dGPU", + "Model": "mask_rcnn_resnet50_atrous_coco", + "featured_SKU": "false", "whats_new_model": false, - "PlatformType": "Intel® Xeon®, CPU-only", + "PlatformType": "Accelerator Platforms", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 10174.18, - "fp16": "", - "fp32": 2524.59, + "int8": 32.44, + "fp16": 18.63, + "fp32": "", "bf16": "" } ], @@ -15493,7 +14548,7 @@ "Precisions": [ { "int4": "", - "int8": 0.7, + "int8": 47.53, "fp16": "", "fp32": "", "bf16": "" @@ -15505,31 +14560,31 @@ } }, { - "Platform": "Intel® Xeon® Platinum 8380 CPU-only", - "Model": "stable-diffusion-v1-5", - "featured_SKU": false, + "Platform": "Intel® Data Center GPU Flex 170 dGPU", + "Model": "mistral-7b-v0.1", + "featured_SKU": "false", "whats_new_model": false, - "PlatformType": "Intel® Xeon®, CPU-only", + "PlatformType": "Accelerator Platforms", "Parameters": { "throughput": { "Precisions": [ { - "int4": "", - "int8": "", + "int4": 44.42, + "int8": 32.11, "fp16": "", "fp32": "", "bf16": "" } ], - "Unit": "FPS", + "Unit": "Tokens per sec", "UnitDesc": "higher is better" }, "latency": { "Precisions": [ { - "int4": "", - "int8": 13.34, - "fp16": 13.66, + "int4": 22.51, + "int8": 31.14, + "fp16": "", "fp32": "", "bf16": "" } @@ -15540,19 +14595,19 @@ } }, { - "Platform": "Intel® Xeon® Platinum 8380 CPU-only", - "Model": "yolo11", - "featured_SKU": false, - "whats_new_model": "false", - "PlatformType": "Intel® Xeon®, CPU-only", + "Platform": "Intel® Data Center GPU Flex 170 dGPU", + "Model": "mobilenet-v2", + "featured_SKU": "false", + "whats_new_model": false, + "PlatformType": "Accelerator Platforms", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": "", - "fp16": "", - "fp32": 803.12, + "int8": 2978.92, + "fp16": 3132.03, + "fp32": "", "bf16": "" } ], @@ -15563,7 +14618,7 @@ "Precisions": [ { "int4": "", - "int8": "", + "int8": 0.57, "fp16": "", "fp32": "", "bf16": "" @@ -15575,31 +14630,31 @@ } }, { - "Platform": "Intel® Xeon® Platinum 8380 CPU-only", - "Model": "yolo_v8n", - "featured_SKU": false, - "whats_new_model": false, - "PlatformType": "Intel® Xeon®, CPU-only", + "Platform": "Intel® Data Center GPU Flex 170 dGPU", + "Model": 
"phi-3-mini-4k-instruct", + "featured_SKU": "false", + "whats_new_model": true, + "PlatformType": "Accelerator Platforms", "Parameters": { "throughput": { "Precisions": [ { - "int4": "", - "int8": 1704.08, - "fp16": "", - "fp32": 697.23, + "int4": 69.97, + "int8": 50.91, + "fp16": 32.48, + "fp32": "", "bf16": "" } ], - "Unit": "FPS", + "Unit": "Tokens per sec", "UnitDesc": "higher is better" }, "latency": { "Precisions": [ { - "int4": "", - "int8": 2.36, - "fp16": "", + "int4": 14.29, + "int8": 19.64, + "fp16": 30.78, "fp32": "", "bf16": "" } @@ -15610,33 +14665,33 @@ } }, { - "Platform": "Intel® Xeon® Platinum 8480+ CPU-only", - "Model": "bert-base-cased", - "featured_SKU": true, - "whats_new_model": false, - "PlatformType": "Intel® Xeon®, CPU-only", + "Platform": "Intel® Data Center GPU Flex 170 dGPU", + "Model": "qwen2-7b", + "featured_SKU": "false", + "whats_new_model": true, + "PlatformType": "Accelerator Platforms", "Parameters": { "throughput": { "Precisions": [ { - "int4": "", - "int8": 3023.92, + "int4": 45.02, + "int8": 32.48, "fp16": "", - "fp32": 483.11, - "bf16": 1976.63 + "fp32": "", + "bf16": "" } ], - "Unit": "FPS", + "Unit": "Tokens per sec", "UnitDesc": "higher is better" }, "latency": { "Precisions": [ { - "int4": "", - "int8": 3.79, + "int4": 22.21, + "int8": 30.78, "fp16": "", "fp32": "", - "bf16": 4.84 + "bf16": "" } ], "Unit": "ms", @@ -15645,20 +14700,20 @@ } }, { - "Platform": "Intel® Xeon® Platinum 8480+ CPU-only", - "Model": "efficientdet-d0", - "featured_SKU": true, + "Platform": "Intel® Data Center GPU Flex 170 dGPU", + "Model": "resnet-50", + "featured_SKU": "false", "whats_new_model": false, - "PlatformType": "Intel® Xeon®, CPU-only", + "PlatformType": "Accelerator Platforms", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 1445.78, - "fp16": "", - "fp32": 861.51, - "bf16": 1021.75 + "int8": 1971.27, + "fp16": 1355.77, + "fp32": "", + "bf16": "" } ], "Unit": "FPS", @@ -15668,10 +14723,10 @@ "Precisions": [ { "int4": "", - "int8": 4.69, + "int8": 0.78, "fp16": "", "fp32": "", - "bf16": 5.16 + "bf16": "" } ], "Unit": "ms", @@ -15680,18 +14735,18 @@ } }, { - "Platform": "Intel® Xeon® Platinum 8480+ CPU-only", - "Model": "gemma-2-9b", - "featured_SKU": true, - "whats_new_model": true, - "PlatformType": "Intel® Xeon®, CPU-only", + "Platform": "Intel® Data Center GPU Flex 170 dGPU", + "Model": "ssd-resnet34-1200", + "featured_SKU": "false", + "whats_new_model": false, + "PlatformType": "Accelerator Platforms", "Parameters": { "throughput": { "Precisions": [ { - "int4": 22.71, - "int8": 16.83, - "fp16": 10.76, + "int4": "", + "int8": 132.85, + "fp16": 80.37, "fp32": "", "bf16": "" } @@ -15702,9 +14757,9 @@ "latency": { "Precisions": [ { - "int4": 44.03, - "int8": 59.39, - "fp16": 92.87, + "int4": "", + "int8": 13.81, + "fp16": "", "fp32": "", "bf16": "" } @@ -15715,18 +14770,18 @@ } }, { - "Platform": "Intel® Xeon® Platinum 8480+ CPU-only", - "Model": "glm-4-9b-chat", - "featured_SKU": true, - "whats_new_model": "false", - "PlatformType": "Intel® Xeon®, CPU-only", + "Platform": "Intel® Data Center GPU Flex 170 dGPU", + "Model": "ssd_mobilenet_v1_coco", + "featured_SKU": "false", + "whats_new_model": false, + "PlatformType": "Accelerator Platforms", "Parameters": { "throughput": { "Precisions": [ { - "int4": 23.7, - "int8": 16.93, - "fp16": 11.27, + "int4": "", + "int8": 2274.66, + "fp16": 1667.73, "fp32": "", "bf16": "" } @@ -15737,9 +14792,9 @@ "latency": { "Precisions": [ { - "int4": 42.19, - "int8": 59.04, - "fp16": 88.67, + "int4": "", + 
"int8": 0.78, + "fp16": "", "fp32": "", "bf16": "" } @@ -15750,18 +14805,18 @@ } }, { - "Platform": "Intel® Xeon® Platinum 8480+ CPU-only", - "Model": "llama-2-7b-chat", - "featured_SKU": true, + "Platform": "Intel® Data Center GPU Flex 170 dGPU", + "Model": "yolo_v8n", + "featured_SKU": "false", "whats_new_model": false, - "PlatformType": "Intel® Xeon®, CPU-only", + "PlatformType": "Accelerator Platforms", "Parameters": { "throughput": { "Precisions": [ { - "int4": 26.11, - "int8": 20.1, - "fp16": 14.19, + "int4": "", + "int8": 756.78, + "fp16": 691.63, "fp32": "", "bf16": "" } @@ -15772,9 +14827,9 @@ "latency": { "Precisions": [ { - "int4": 38.29, - "int8": 49.73, - "fp16": 70.45, + "int4": "", + "int8": 1.97, + "fp16": "", "fp32": "", "bf16": "" } @@ -15785,20 +14840,20 @@ } }, { - "Platform": "Intel® Xeon® Platinum 8480+ CPU-only", - "Model": "llama-3-8b", + "Platform": "Intel® Xeon® Platinum 6979P", + "Model": "bert-base-cased", "featured_SKU": true, - "whats_new_model": "false", + "whats_new_model": false, "PlatformType": "Intel® Xeon®, CPU-only", "Parameters": { "throughput": { "Precisions": [ { - "int4": 26.02, - "int8": 18.97, - "fp16": 13.23, - "fp32": "", - "bf16": "" + "int4": "", + "int8": 8626.80, + "fp16": "", + "fp32": 1267.91, + "bf16": 6050.76 } ], "Unit": "FPS", @@ -15807,9 +14862,9 @@ "latency": { "Precisions": [ { - "int4": 38.42, - "int8": 52.71, - "fp16": 75.57, + "int4": "", + "int8": 7.68, + "fp16": "", "fp32": "", "bf16": "" } @@ -15820,20 +14875,20 @@ } }, { - "Platform": "Intel® Xeon® Platinum 8480+ CPU-only", - "Model": "llama-3.2-3b-instruct", + "Platform": "Intel® Xeon® Platinum 6979P", + "Model": "efficientdet-d0", "featured_SKU": true, - "whats_new_model": true, + "whats_new_model": false, "PlatformType": "Intel® Xeon®, CPU-only", "Parameters": { "throughput": { "Precisions": [ { - "int4": 45.68, - "int8": 36.96, - "fp16": 27.27, - "fp32": "", - "bf16": "" + "int4": "", + "int8": 3411.17, + "fp16": "", + "fp32": 2334.30, + "bf16": 2889.81 } ], "Unit": "FPS", @@ -15842,9 +14897,9 @@ "latency": { "Precisions": [ { - "int4": 21.89, - "int8": 27.05, - "fp16": 36.67, + "int4": "", + "int8": 9.21, + "fp16": "", "fp32": "", "bf16": "" } @@ -15855,33 +14910,33 @@ } }, { - "Platform": "Intel® Xeon® Platinum 8480+ CPU-only", - "Model": "mask_rcnn_resnet50_atrous_coco", + "Platform": "Intel® Xeon® Platinum 6979P", + "Model": "gemma-2-9b", "featured_SKU": true, - "whats_new_model": false, + "whats_new_model": true, "PlatformType": "Intel® Xeon®, CPU-only", "Parameters": { "throughput": { "Precisions": [ { - "int4": "", - "int8": 62.13, + "int4": 119.07, + "int8": "", "fp16": "", - "fp32": 5.19, - "bf16": 37.54 + "fp32": "", + "bf16": "" } ], - "Unit": "FPS", + "Unit": "Tokens per sec", "UnitDesc": "higher is better" }, "latency": { "Precisions": [ { "int4": "", - "int8": 58.49, + "int8": "", "fp16": "", "fp32": "", - "bf16": 81.95 + "bf16": "" } ], "Unit": "ms", @@ -15890,31 +14945,31 @@ } }, { - "Platform": "Intel® Xeon® Platinum 8480+ CPU-only", - "Model": "mistral-7b-v0.1", + "Platform": "Intel® Xeon® Platinum 6979P", + "Model": "glm-4-9b-chat", "featured_SKU": true, - "whats_new_model": false, + "whats_new_model": "false", "PlatformType": "Intel® Xeon®, CPU-only", "Parameters": { "throughput": { "Precisions": [ { - "int4": 27.42, - "int8": 19.9, - "fp16": 13.72, + "int4": 140.9, + "int8": "", + "fp16": "", "fp32": "", "bf16": "" } ], - "Unit": "FPS", + "Unit": "Tokens per sec", "UnitDesc": "higher is better" }, "latency": { "Precisions": [ { - "int4": 36.46, - 
"int8": 50.24, - "fp16": 72.84, + "int4": "", + "int8": "", + "fp16": "", "fp32": "", "bf16": "" } @@ -15925,8 +14980,8 @@ } }, { - "Platform": "Intel® Xeon® Platinum 8480+ CPU-only", - "Model": "mobilenet-v2", + "Platform": "Intel® Xeon® Platinum 6979P", + "Model": "llama-2-7b-chat", "featured_SKU": true, "whats_new_model": false, "PlatformType": "Intel® Xeon®, CPU-only", @@ -15934,24 +14989,24 @@ "throughput": { "Precisions": [ { - "int4": "", - "int8": 38538.65, + "int4": 149.6, + "int8": "", "fp16": "", - "fp32": 10274.08, - "bf16": 25608.67 + "fp32": "", + "bf16": "" } ], - "Unit": "FPS", + "Unit": "Tokens per sec", "UnitDesc": "higher is better" }, "latency": { "Precisions": [ { "int4": "", - "int8": 0.65, + "int8": "", "fp16": "", "fp32": "", - "bf16": 0.66 + "bf16": "" } ], "Unit": "ms", @@ -15960,31 +15015,31 @@ } }, { - "Platform": "Intel® Xeon® Platinum 8480+ CPU-only", - "Model": "phi-3-mini-4k-instruct", + "Platform": "Intel® Xeon® Platinum 6979P", + "Model": "llama-3-8b", "featured_SKU": true, - "whats_new_model": true, + "whats_new_model": "false", "PlatformType": "Intel® Xeon®, CPU-only", "Parameters": { "throughput": { "Precisions": [ { - "int4": "", - "int8": 33.53, - "fp16": 23.1, + "int4": 153.0, + "int8": "", + "fp16": "", "fp32": "", "bf16": "" } ], - "Unit": "FPS", + "Unit": "Tokens per sec", "UnitDesc": "higher is better" }, "latency": { "Precisions": [ { - "int4": 24.06, - "int8": 29.82, - "fp16": 43.29, + "int4": "", + "int8": "", + "fp16": "", "fp32": "", "bf16": "" } @@ -15995,8 +15050,8 @@ } }, { - "Platform": "Intel® Xeon® Platinum 8480+ CPU-only", - "Model": "qwen2-7b", + "Platform": "Intel® Xeon® Platinum 6979P", + "Model": "llama-3.2-3b-instruct", "featured_SKU": true, "whats_new_model": true, "PlatformType": "Intel® Xeon®, CPU-only", @@ -16004,22 +15059,22 @@ "throughput": { "Precisions": [ { - "int4": 30.03, - "int8": 22.14, - "fp16": 13.95, + "int4": "277.1", + "int8": "", + "fp16": "", "fp32": "", "bf16": "" } ], - "Unit": "FPS", + "Unit": "Tokens per sec", "UnitDesc": "higher is better" }, "latency": { "Precisions": [ { - "int4": 33.3, - "int8": 45.16, - "fp16": 71.68, + "int4": "", + "int8": "", + "fp16": "", "fp32": "", "bf16": "" } @@ -16030,8 +15085,8 @@ } }, { - "Platform": "Intel® Xeon® Platinum 8480+ CPU-only", - "Model": "resnet-50", + "Platform": "Intel® Xeon® Platinum 6979P", + "Model": "mask_rcnn_resnet50_atrous_coco", "featured_SKU": true, "whats_new_model": false, "PlatformType": "Intel® Xeon®, CPU-only", @@ -16040,10 +15095,10 @@ "Precisions": [ { "int4": "", - "int8": 19226.96, + "int8": 144.22, "fp16": "", - "fp32": 1597.37, - "bf16": 7480.12 + "fp32": 12.07, + "bf16": 90.67 } ], "Unit": "FPS", @@ -16053,10 +15108,10 @@ "Precisions": [ { "int4": "", - "int8": 1.01, + "int8": 76.77, "fp16": "", "fp32": "", - "bf16": 1.25 + "bf16": "" } ], "Unit": "ms", @@ -16065,8 +15120,8 @@ } }, { - "Platform": "Intel® Xeon® Platinum 8480+ CPU-only", - "Model": "ssd-resnet34-1200", + "Platform": "Intel® Xeon® Platinum 6979P", + "Model": "mistral-7b-v0.1", "featured_SKU": true, "whats_new_model": false, "PlatformType": "Intel® Xeon®, CPU-only", @@ -16074,14 +15129,14 @@ "throughput": { "Precisions": [ { - "int4": "", - "int8": 434.12, + "int4": 154.9, + "int8": "", "fp16": "", - "fp32": 30.6, - "bf16": 209.11 + "fp32": "", + "bf16": "" } ], - "Unit": "FPS", + "Unit": "Tokens per sec", "UnitDesc": "higher is better" }, "latency": { @@ -16100,8 +15155,8 @@ } }, { - "Platform": "Intel® Xeon® Platinum 8480+ CPU-only", - "Model": "ssd_mobilenet_v1_coco", + 
"Platform": "Intel® Xeon® Platinum 6979P", + "Model": "mobilenet-v2", "featured_SKU": true, "whats_new_model": false, "PlatformType": "Intel® Xeon®, CPU-only", @@ -16110,10 +15165,10 @@ "Precisions": [ { "int4": "", - "int8": 24134.02, + "int8": 37588.55, "fp16": "", - "fp32": 3392.4, - "bf16": 12168.49 + "fp32": 19668.20, + "bf16": 22418.62 } ], "Unit": "FPS", @@ -16123,10 +15178,10 @@ "Precisions": [ { "int4": "", - "int8": 0.74, + "int8": 1.81, "fp16": "", "fp32": "", - "bf16": 0.89 + "bf16": "" } ], "Unit": "ms", @@ -16135,31 +15190,31 @@ } }, { - "Platform": "Intel® Xeon® Platinum 8480+ CPU-only", - "Model": "stable-diffusion-v1-5", + "Platform": "Intel® Xeon® Platinum 6979P", + "Model": "phi-3-mini-4k-instruct", "featured_SKU": true, - "whats_new_model": false, + "whats_new_model": true, "PlatformType": "Intel® Xeon®, CPU-only", "Parameters": { "throughput": { "Precisions": [ { - "int4": "", + "int4": 225.5, "int8": "", "fp16": "", "fp32": "", "bf16": "" } ], - "Unit": "FPS", + "Unit": "Tokens per sec", "UnitDesc": "higher is better" }, "latency": { "Precisions": [ { "int4": "", - "int8": 4.62, - "fp16": 4.55, + "int8": "", + "fp16": "", "fp32": "", "bf16": "" } @@ -16170,23 +15225,23 @@ } }, { - "Platform": "Intel® Xeon® Platinum 8480+ CPU-only", - "Model": "yolo11", + "Platform": "Intel® Xeon® Platinum 6979P", + "Model": "qwen2-7b", "featured_SKU": true, - "whats_new_model": "false", + "whats_new_model": true, "PlatformType": "Intel® Xeon®, CPU-only", "Parameters": { "throughput": { "Precisions": [ { - "int4": "", + "int4": 165.5, "int8": "", "fp16": "", - "fp32": 1034.68, - "bf16": 2068.81 + "fp32": "", + "bf16": "" } ], - "Unit": "FPS", + "Unit": "Tokens per sec", "UnitDesc": "higher is better" }, "latency": { @@ -16205,8 +15260,8 @@ } }, { - "Platform": "Intel® Xeon® Platinum 8480+ CPU-only", - "Model": "yolo_v8n", + "Platform": "Intel® Xeon® Platinum 6979P", + "Model": "resnet-50", "featured_SKU": true, "whats_new_model": false, "PlatformType": "Intel® Xeon®, CPU-only", @@ -16215,10 +15270,10 @@ "Precisions": [ { "int4": "", - "int8": 2380.51, + "int8": 29152.17, "fp16": "", - "fp32": 950.6, - "bf16": 2374.89 + "fp32": 4342.04, + "bf16": 21711.45 } ], "Unit": "FPS", @@ -16228,10 +15283,10 @@ "Precisions": [ { "int4": "", - "int8": 3.13, + "int8": 2.17, "fp16": "", "fp32": "", - "bf16": 2.54 + "bf16": "" } ], "Unit": "ms", @@ -16240,8 +15295,8 @@ } }, { - "Platform": "Intel® Xeon® Platinum 8580 CPU-only", - "Model": "bert-base-cased", + "Platform": "Intel® Xeon® Platinum 6979P", + "Model": "ssd-resnet34-1200", "featured_SKU": true, "whats_new_model": false, "PlatformType": "Intel® Xeon®, CPU-only", @@ -16250,10 +15305,10 @@ "Precisions": [ { "int4": "", - "int8": 4671.04, + "int8": 997.71, "fp16": "", - "fp32": 560.3, - "bf16": 3211.93 + "fp32": 78.49, + "bf16": 517.96 } ], "Unit": "FPS", @@ -16263,10 +15318,10 @@ "Precisions": [ { "int4": "", - "int8": 3.66, + "int8": 10.12, "fp16": "", "fp32": "", - "bf16": 4.77 + "bf16": "" } ], "Unit": "ms", @@ -16275,8 +15330,8 @@ } }, { - "Platform": "Intel® Xeon® Platinum 8580 CPU-only", - "Model": "efficientdet-d0", + "Platform": "Intel® Xeon® Platinum 6979P", + "Model": "ssd_mobilenet_v1_coco", "featured_SKU": true, "whats_new_model": false, "PlatformType": "Intel® Xeon®, CPU-only", @@ -16285,10 +15340,10 @@ "Precisions": [ { "int4": "", - "int8": 1725.13, + "int8": 33085.55, "fp16": "", - "fp32": 1123.04, - "bf16": 1407.69 + "fp32": 9524.47, + "bf16": 16162.95 } ], "Unit": "FPS", @@ -16298,10 +15353,10 @@ "Precisions": [ { "int4": "", 
- "int8": 4.71, + "int8": 1.82, "fp16": "", "fp32": "", - "bf16": 4.84 + "bf16": "" } ], "Unit": "ms", @@ -16310,20 +15365,20 @@ } }, { - "Platform": "Intel® Xeon® Platinum 8580 CPU-only", - "Model": "gemma-2-9b", + "Platform": "Intel® Xeon® Platinum 6979P", + "Model": "yolo_v8n", "featured_SKU": true, - "whats_new_model": true, + "whats_new_model": false, "PlatformType": "Intel® Xeon®, CPU-only", "Parameters": { "throughput": { "Precisions": [ { - "int4": 25.46, - "int8": 18.96, - "fp16": 12.14, - "fp32": "", - "bf16": "" + "int4": "", + "int8": 5975.29, + "fp16": "", + "fp32": 2698.86, + "bf16": 6021.30 } ], "Unit": "FPS", @@ -16332,9 +15387,9 @@ "latency": { "Precisions": [ { - "int4": 39.27, - "int8": 52.74, - "fp16": 82.36, + "int4": "", + "int8": 5.93, + "fp16": "", "fp32": "", "bf16": "" } @@ -16345,18 +15400,18 @@ } }, { - "Platform": "Intel® Xeon® Platinum 8580 CPU-only", - "Model": "glm-4-9b-chat", + "Platform": "Intel® Arc™ B-Series Graphics B580", + "Model": "bert-base-cased", "featured_SKU": true, - "whats_new_model": "false", - "PlatformType": "Intel® Xeon®, CPU-only", + "whats_new_model": false, + "PlatformType": "Accelerator Platforms", "Parameters": { "throughput": { "Precisions": [ { - "int4": 27.1, - "int8": 19.33, - "fp16": 12.69, + "int4": "", + "int8": 848.32, + "fp16": 743.75, "fp32": "", "bf16": "" } @@ -16367,9 +15422,9 @@ "latency": { "Precisions": [ { - "int4": 36.9, - "int8": 51.72, - "fp16": 78.77, + "int4": "", + "int8": 1.37, + "fp16": "", "fp32": "", "bf16": "" } @@ -16380,18 +15435,18 @@ } }, { - "Platform": "Intel® Xeon® Platinum 8580 CPU-only", - "Model": "llama-2-7b-chat", + "Platform": "Intel® Arc™ B-Series Graphics B580", + "Model": "efficientdet-d0", "featured_SKU": true, "whats_new_model": false, - "PlatformType": "Intel® Xeon®, CPU-only", + "PlatformType": "Accelerator Platforms", "Parameters": { "throughput": { "Precisions": [ { - "int4": 31.71, - "int8": 23.05, - "fp16": 16.64, + "int4": "", + "int8": 300.16, + "fp16": 313.15, "fp32": "", "bf16": "" } @@ -16402,9 +15457,9 @@ "latency": { "Precisions": [ { - "int4": 31.53, - "int8": 43.37, - "fp16": 60.07, + "int4": "", + "int8": 5.89, + "fp16": "", "fp32": "", "bf16": "" } @@ -16415,31 +15470,31 @@ } }, { - "Platform": "Intel® Xeon® Platinum 8580 CPU-only", - "Model": "llama-3-8b", + "Platform": "Intel® Arc™ B-Series Graphics B580", + "Model": "gemma-2-9b", "featured_SKU": true, - "whats_new_model": "false", - "PlatformType": "Intel® Xeon®, CPU-only", + "whats_new_model": true, + "PlatformType": "Accelerator Platforms", "Parameters": { "throughput": { "Precisions": [ { - "int4": 30.06, - "int8": 21.73, - "fp16": 14.93, + "int4": 45.3, + "int8": "", + "fp16": "", "fp32": "", "bf16": "" } ], - "Unit": "FPS", + "Unit": "Tokens per sec", "UnitDesc": "higher is better" }, "latency": { "Precisions": [ { - "int4": 33.26, - "int8": 46.01, - "fp16": 66.97, + "int4": 22.05, + "int8": "", + "fp16": "", "fp32": "", "bf16": "" } @@ -16450,31 +15505,31 @@ } }, { - "Platform": "Intel® Xeon® Platinum 8580 CPU-only", - "Model": "llama-3.2-3b-instruct", + "Platform": "Intel® Arc™ B-Series Graphics B580", + "Model": "glm-4-9b-chat", "featured_SKU": true, - "whats_new_model": true, - "PlatformType": "Intel® Xeon®, CPU-only", + "whats_new_model": "false", + "PlatformType": "Accelerator Platforms", "Parameters": { "throughput": { "Precisions": [ { - "int4": 54.73, - "int8": 42.58, - "fp16": 31.51, + "int4": 64.5, + "int8": "", + "fp16": "", "fp32": "", "bf16": "" } ], - "Unit": "FPS", + "Unit": "Tokens per sec", 
"UnitDesc": "higher is better" }, "latency": { "Precisions": [ { - "int4": 18.27, - "int8": 23.48, - "fp16": 31.73, + "int4": 15.5, + "int8": "", + "fp16": "", "fp32": "", "bf16": "" } @@ -16485,33 +15540,33 @@ } }, { - "Platform": "Intel® Xeon® Platinum 8580 CPU-only", - "Model": "mask_rcnn_resnet50_atrous_coco", + "Platform": "Intel® Arc™ B-Series Graphics B580", + "Model": "llama-2-7b-chat", "featured_SKU": true, "whats_new_model": false, - "PlatformType": "Intel® Xeon®, CPU-only", + "PlatformType": "Accelerator Platforms", "Parameters": { "throughput": { "Precisions": [ { - "int4": "", - "int8": 74.86, + "int4": 69.7, + "int8": "", "fp16": "", - "fp32": 6.39, - "bf16": 48.32 + "fp32": "", + "bf16": "" } ], - "Unit": "FPS", + "Unit": "Tokens per sec", "UnitDesc": "higher is better" }, "latency": { "Precisions": [ { - "int4": "", - "int8": 55.7, + "int4": 14.3, + "int8": "", "fp16": "", "fp32": "", - "bf16": 73.74 + "bf16": "" } ], "Unit": "ms", @@ -16520,31 +15575,31 @@ } }, { - "Platform": "Intel® Xeon® Platinum 8580 CPU-only", - "Model": "mistral-7b-v0.1", + "Platform": "Intel® Arc™ B-Series Graphics B580", + "Model": "llama-3-8b", "featured_SKU": true, - "whats_new_model": false, - "PlatformType": "Intel® Xeon®, CPU-only", + "whats_new_model": "false", + "PlatformType": "Accelerator Platforms", "Parameters": { "throughput": { "Precisions": [ { - "int4": 33.27, - "int8": 22.24, - "fp16": 15.74, + "int4": 72.1, + "int8": "", + "fp16": "", "fp32": "", "bf16": "" } ], - "Unit": "FPS", + "Unit": "Tokens per sec", "UnitDesc": "higher is better" }, "latency": { "Precisions": [ { - "int4": 30.05, - "int8": 44.96, - "fp16": 63.51, + "int4": 13.9, + "int8": "", + "fp16": "", "fp32": "", "bf16": "" } @@ -16555,33 +15610,33 @@ } }, { - "Platform": "Intel® Xeon® Platinum 8580 CPU-only", - "Model": "mobilenet-v2", + "Platform": "Intel® Arc™ B-Series Graphics B580", + "Model": "llama-3.2-3b-instruct", "featured_SKU": true, - "whats_new_model": false, - "PlatformType": "Intel® Xeon®, CPU-only", + "whats_new_model": true, + "PlatformType": "Accelerator Platforms", "Parameters": { "throughput": { "Precisions": [ { - "int4": "", - "int8": 39894.55, + "int4": 121.4, + "int8": "", "fp16": "", - "fp32": 15839.75, - "bf16": 29419.55 + "fp32": "", + "bf16": "" } ], - "Unit": "FPS", + "Unit": "Tokens per sec", "UnitDesc": "higher is better" }, "latency": { "Precisions": [ { - "int4": "", - "int8": 0.84, + "int4": 8.2, + "int8": "", "fp16": "", "fp32": "", - "bf16": 0.72 + "bf16": "" } ], "Unit": "ms", @@ -16590,18 +15645,18 @@ } }, { - "Platform": "Intel® Xeon® Platinum 8580 CPU-only", - "Model": "phi-3-mini-4k-instruct", + "Platform": "Intel® Arc™ B-Series Graphics B580", + "Model": "mask_rcnn_resnet50_atrous_coco", "featured_SKU": true, - "whats_new_model": true, - "PlatformType": "Intel® Xeon®, CPU-only", + "whats_new_model": false, + "PlatformType": "Accelerator Platforms", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 40.45, - "fp16": 26.95, + "int8": 31.77, + "fp16": 17.24, "fp32": "", "bf16": "" } @@ -16613,8 +15668,8 @@ "Precisions": [ { "int4": "", - "int8": 24.72, - "fp16": 37.1, + "int8": 43.68, + "fp16": "", "fp32": "", "bf16": "" } @@ -16625,31 +15680,31 @@ } }, { - "Platform": "Intel® Xeon® Platinum 8580 CPU-only", - "Model": "qwen2-7b", + "Platform": "Intel® Arc™ B-Series Graphics B580", + "Model": "mistral-7b-v0.1", "featured_SKU": true, - "whats_new_model": true, - "PlatformType": "Intel® Xeon®, CPU-only", + "whats_new_model": false, + "PlatformType": "Accelerator 
Platforms", "Parameters": { "throughput": { "Precisions": [ { - "int4": 35.48, - "int8": 25.7, - "fp16": 16.1, + "int4": 70.3, + "int8": "", + "fp16": "", "fp32": "", "bf16": "" } ], - "Unit": "FPS", + "Unit": "Tokens per sec", "UnitDesc": "higher is better" }, "latency": { "Precisions": [ { - "int4": 28.18, - "int8": 38.91, - "fp16": 62.09, + "int4": 14.2, + "int8": "", + "fp16": "", "fp32": "", "bf16": "" } @@ -16660,20 +15715,20 @@ } }, { - "Platform": "Intel® Xeon® Platinum 8580 CPU-only", - "Model": "resnet-50", + "Platform": "Intel® Arc™ B-Series Graphics B580", + "Model": "mobilenet-v2", "featured_SKU": true, "whats_new_model": false, - "PlatformType": "Intel® Xeon®, CPU-only", + "PlatformType": "Accelerator Platforms", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 21612.82, - "fp16": "", - "fp32": 2002.36, - "bf16": 13669.05 + "int8": 3947.72, + "fp16": 4159.12, + "fp32": "", + "bf16": "" } ], "Unit": "FPS", @@ -16683,10 +15738,10 @@ "Precisions": [ { "int4": "", - "int8": 1.0, + "int8": 0.39, "fp16": "", "fp32": "", - "bf16": 1.37 + "bf16": "" } ], "Unit": "ms", @@ -16695,29 +15750,29 @@ } }, { - "Platform": "Intel® Xeon® Platinum 8580 CPU-only", - "Model": "ssd-resnet34-1200", + "Platform": "Intel® Arc™ B-Series Graphics B580", + "Model": "phi-3-mini-4k-instruct", "featured_SKU": true, - "whats_new_model": false, - "PlatformType": "Intel® Xeon®, CPU-only", + "whats_new_model": true, + "PlatformType": "Accelerator Platforms", "Parameters": { "throughput": { "Precisions": [ { - "int4": "", - "int8": 513.09, + "int4": 108.5, + "int8": "", "fp16": "", - "fp32": 35.2, - "bf16": 275.94 + "fp32": "", + "bf16": "" } ], - "Unit": "FPS", + "Unit": "Tokens per sec", "UnitDesc": "higher is better" }, "latency": { "Precisions": [ { - "int4": "", + "int4": 9.2, "int8": "", "fp16": "", "fp32": "", @@ -16730,33 +15785,33 @@ } }, { - "Platform": "Intel® Xeon® Platinum 8580 CPU-only", - "Model": "ssd_mobilenet_v1_coco", + "Platform": "Intel® Arc™ B-Series Graphics B580", + "Model": "qwen2-7b", "featured_SKU": true, - "whats_new_model": false, - "PlatformType": "Intel® Xeon®, CPU-only", + "whats_new_model": true, + "PlatformType": "Accelerator Platforms", "Parameters": { "throughput": { "Precisions": [ { - "int4": "", - "int8": 26748.89, + "int4": 78.4, + "int8": "", "fp16": "", - "fp32": 4718.18, - "bf16": 16684.87 + "fp32": "", + "bf16": "" } ], - "Unit": "FPS", + "Unit": "Tokens per sec", "UnitDesc": "higher is better" }, "latency": { "Precisions": [ { - "int4": "", - "int8": 0.72, + "int4": 12.8, + "int8": "", "fp16": "", "fp32": "", - "bf16": 1.15 + "bf16": "" } ], "Unit": "ms", @@ -16765,18 +15820,18 @@ } }, { - "Platform": "Intel® Xeon® Platinum 8580 CPU-only", - "Model": "stable-diffusion-v1-5", + "Platform": "Intel® Arc™ B-Series Graphics B580", + "Model": "resnet-50", "featured_SKU": true, "whats_new_model": false, - "PlatformType": "Intel® Xeon®, CPU-only", + "PlatformType": "Accelerator Platforms", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": "", - "fp16": "", + "int8": 3375.63, + "fp16": 1964.87, "fp32": "", "bf16": "" } @@ -16788,8 +15843,8 @@ "Precisions": [ { "int4": "", - "int8": 4.09, - "fp16": 3.99, + "int8": 0.42, + "fp16": "", "fp32": "", "bf16": "" } @@ -16800,20 +15855,20 @@ } }, { - "Platform": "Intel® Xeon® Platinum 8580 CPU-only", - "Model": "yolo11", + "Platform": "Intel® Arc™ B-Series Graphics B580", + "Model": "ssd-resnet34-1200", "featured_SKU": true, - "whats_new_model": "false", - "PlatformType": "Intel® Xeon®, 
CPU-only", + "whats_new_model": false, + "PlatformType": "Accelerator Platforms", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": "", - "fp16": "", - "fp32": 1455.5, - "bf16": 2962.49 + "int8": 50.49, + "fp16": 59.89, + "fp32": "", + "bf16": "" } ], "Unit": "FPS", @@ -16823,10 +15878,10 @@ "Precisions": [ { "int4": "", - "int8": "", + "int8": 20.41, "fp16": "", "fp32": "", - "bf16": 3.19 + "bf16": "" } ], "Unit": "ms", @@ -16835,20 +15890,20 @@ } }, { - "Platform": "Intel® Xeon® Platinum 8580 CPU-only", - "Model": "yolo_v8n", + "Platform": "Intel® Arc™ B-Series Graphics B580", + "Model": "ssd_mobilenet_v1_coco", "featured_SKU": true, "whats_new_model": false, - "PlatformType": "Intel® Xeon®, CPU-only", + "PlatformType": "Accelerator Platforms", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 3043.23, - "fp16": "", - "fp32": 1258.2, - "bf16": 3444.22 + "int8": 2304.40, + "fp16": 2174.73, + "fp32": "", + "bf16": "" } ], "Unit": "FPS", @@ -16858,10 +15913,10 @@ "Precisions": [ { "int4": "", - "int8": 3.08, + "int8": 0.61, "fp16": "", "fp32": "", - "bf16": 2.56 + "bf16": "" } ], "Unit": "ms", @@ -16870,20 +15925,20 @@ } }, { - "Platform": "Intel® Xeon® 6979P CPU-only", - "Model": "bert-base-cased", + "Platform": "Intel® Arc™ B-Series Graphics B580", + "Model": "yolo_v8n", "featured_SKU": true, "whats_new_model": false, - "PlatformType": "Intel® Xeon®, CPU-only", + "PlatformType": "Accelerator Platforms", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 8897.30, - "fp16": "", - "fp32": 1217.03, - "bf16": 6414.49 + "int8": 1127.03, + "fp16": 1076.39, + "fp32": "", + "bf16": "" } ], "Unit": "FPS", @@ -16893,9 +15948,9 @@ "Precisions": [ { "int4": "", - "int8": 7.74, + "int8": 1.25, "fp16": "", - "fp32": 14.8, + "fp32": "", "bf16": "" } ], @@ -16905,20 +15960,20 @@ } }, { - "Platform": "Intel® Xeon® 6979P CPU-only", - "Model": "efficientdet-d0", + "Platform": "Intel® Arc™ B-Series Graphics B570", + "Model": "bert-base-cased", "featured_SKU": true, "whats_new_model": false, - "PlatformType": "Intel® Xeon®, CPU-only", + "PlatformType": "Accelerator Platforms", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 3384.23, - "fp16": "", - "fp32": 2295.4, - "bf16": 2872.84 + "int8": 788.23, + "fp16": 689.29, + "fp32": "", + "bf16": "" } ], "Unit": "FPS", @@ -16928,9 +15983,9 @@ "Precisions": [ { "int4": "", - "int8": 9.71, + "int8": 1.43, "fp16": "", - "fp32": 9.43, + "fp32": "", "bf16": "" } ], @@ -16940,20 +15995,20 @@ } }, { - "Platform": "Intel® Xeon® 6979P CPU-only", - "Model": "mask_rcnn_resnet50_atrous_coco", + "Platform": "Intel® Arc™ B-Series Graphics B570", + "Model": "efficientdet-d0", "featured_SKU": true, "whats_new_model": false, - "PlatformType": "Intel® Xeon®, CPU-only", + "PlatformType": "Accelerator Platforms", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": 149.52, - "fp16": "", - "fp32": 11.97, - "bf16": 91.85 + "int8": 299.62, + "fp16": 298.77, + "fp32": "", + "bf16": "" } ], "Unit": "FPS", @@ -16963,9 +16018,9 @@ "Precisions": [ { "int4": "", - "int8": 74.6, + "int8": 5.98, "fp16": "", - "fp32": 248.21, + "fp32": "", "bf16": "" } ], @@ -16975,32 +16030,32 @@ } }, { - "Platform": "Intel® Xeon® 6979P CPU-only", - "Model": "mobilenet-v2", + "Platform": "Intel® Arc™ B-Series Graphics B570", + "Model": "gemma-2-9b", "featured_SKU": true, - "whats_new_model": false, - "PlatformType": "Intel® Xeon®, CPU-only", + "whats_new_model": true, + "PlatformType": "Accelerator 
Platforms", "Parameters": { "throughput": { "Precisions": [ { - "int4": "", - "int8": 32737.09, + "int4": 38.6, + "int8": "", "fp16": "", - "fp32": 25621.92, - "bf16": 26297.21 + "fp32": "", + "bf16": "" } ], - "Unit": "FPS", + "Unit": "Tokens per Sec", "UnitDesc": "higher is better" }, "latency": { "Precisions": [ { - "int4": "", - "int8": 1.65, + "int4": 25.9, + "int8": "", "fp16": "", - "fp32": 1.34, + "fp32": "", "bf16": "" } ], @@ -17010,32 +16065,32 @@ } }, { - "Platform": "Intel® Xeon® 6979P CPU-only", - "Model": "resnet-50", + "Platform": "Intel® Arc™ B-Series Graphics B570", + "Model": "glm-4-9b-chat", "featured_SKU": true, - "whats_new_model": false, - "PlatformType": "Intel® Xeon®, CPU-only", + "whats_new_model": "false", + "PlatformType": "Accelerator Platforms", "Parameters": { "throughput": { "Precisions": [ { - "int4": "", - "int8": 27670.82, + "int4": 55.1, + "int8": "", "fp16": "", - "fp32": 4254.94, - "bf16": 22432.74 + "fp32": "", + "bf16": "" } ], - "Unit": "FPS", + "Unit": "Tokens per Sec", "UnitDesc": "higher is better" }, "latency": { "Precisions": [ { - "int4": "", - "int8": 2.28, + "int4": 18.1, + "int8": "", "fp16": "", - "fp32": 3.69, + "fp32": "", "bf16": "" } ], @@ -17045,32 +16100,32 @@ } }, { - "Platform": "Intel® Xeon® 6979P CPU-only", - "Model": "ssd-resnet34-1200", + "Platform": "Intel® Arc™ B-Series Graphics B570", + "Model": "llama-2-7b-chat", "featured_SKU": true, "whats_new_model": false, - "PlatformType": "Intel® Xeon®, CPU-only", + "PlatformType": "Accelerator Platforms", "Parameters": { "throughput": { "Precisions": [ { - "int4": "", - "int8": 1009.62, + "int4": 60.9, + "int8": "", "fp16": "", - "fp32": 77.99, - "bf16": 532.90 + "fp32": "", + "bf16": "" } ], - "Unit": "FPS", + "Unit": "Tokens per Sec", "UnitDesc": "higher is better" }, "latency": { "Precisions": [ { - "int4": "", - "int8": 9.73, + "int4": 16.4, + "int8": "", "fp16": "", - "fp32": 34.1, + "fp32": "", "bf16": "" } ], @@ -17080,32 +16135,32 @@ } }, { - "Platform": "Intel® Xeon® 6979P CPU-only", - "Model": "ssd_mobilenet_v1_coco", + "Platform": "Intel® Arc™ B-Series Graphics B570", + "Model": "llama-3-8b", "featured_SKU": true, - "whats_new_model": false, - "PlatformType": "Intel® Xeon®, CPU-only", + "whats_new_model": "false", + "PlatformType": "Accelerator Platforms", "Parameters": { "throughput": { "Precisions": [ { - "int4": "", - "int8": 29674.40, + "int4": 63.4, + "int8": "", "fp16": "", - "fp32": 9800.83, - "bf16": 19479.18 + "fp32": "", + "bf16": "" } ], - "Unit": "FPS", + "Unit": "Tokens per Sec", "UnitDesc": "higher is better" }, "latency": { "Precisions": [ { - "int4": "", - "int8": 2.08, + "int4": 15.8, + "int8": "", "fp16": "", - "fp32": 2.45, + "fp32": "", "bf16": "" } ], @@ -17115,32 +16170,32 @@ } }, { - "Platform": "Intel® Xeon® 6979P CPU-only", - "Model": "yolo_v8n", + "Platform": "Intel® Arc™ B-Series Graphics B570", + "Model": "llama-3.2-3b-instruct", "featured_SKU": true, - "whats_new_model": false, - "PlatformType": "Intel® Xeon®, CPU-only", + "whats_new_model": true, + "PlatformType": "Accelerator Platforms", "Parameters": { "throughput": { "Precisions": [ { - "int4": "", - "int8": 5590.87, + "int4": 110.3, + "int8": "", "fp16": "", - "fp32": 2699.0, - "bf16": 6003.66 + "fp32": "", + "bf16": "" } ], - "Unit": "FPS", + "Unit": "Tokens per Sec", "UnitDesc": "higher is better" }, "latency": { "Precisions": [ { - "int4": "", - "int8": 6.56, + "int4": 9.1, + "int8": "", "fp16": "", - "fp32": 5.59, + "fp32": "", "bf16": "" } ], @@ -17150,31 +16205,31 @@ } }, { - 
"Platform": "Intel® Xeon® 6979P CPU-only", - "Model": "gemma-2-9b", + "Platform": "Intel® Arc™ B-Series Graphics B570", + "Model": "mask_rcnn_resnet50_atrous_coco", "featured_SKU": true, - "whats_new_model": true, - "PlatformType": "Intel® Xeon®, CPU-only", + "whats_new_model": false, + "PlatformType": "Accelerator Platforms", "Parameters": { "throughput": { "Precisions": [ { - "int4": 136.4, - "int8": "", - "fp16": 53.6, + "int4": "", + "int8": 26.85, + "fp16": 15.06, "fp32": "", "bf16": "" } ], - "Unit": "Tokens/sec", + "Unit": "FPS", "UnitDesc": "higher is better" }, "latency": { "Precisions": [ { - "int4": 7.3, - "int8": "", - "fp16": 18.7, + "int4": "", + "int8": 49.87, + "fp16": "", "fp32": "", "bf16": "" } @@ -17185,31 +16240,31 @@ } }, { - "Platform": "Intel® Xeon® 6979P CPU-only", - "Model": "glm-4-9b-chat", + "Platform": "Intel® Arc™ B-Series Graphics B570", + "Model": "mistral-7b-v0.1", "featured_SKU": true, - "whats_new_model": true, - "PlatformType": "Intel® Xeon®, CPU-only", + "whats_new_model": false, + "PlatformType": "Accelerator Platforms", "Parameters": { "throughput": { "Precisions": [ { - "int4": 116.5, + "int4": 62.7, "int8": "", - "fp16": 51.9, + "fp16": "", "fp32": "", "bf16": "" } ], - "Unit": "Tokens/sec", + "Unit": "Tokens per Sec", "UnitDesc": "higher is better" }, "latency": { "Precisions": [ { - "int4": 8.6, + "int4": 16.0, "int8": "", - "fp16": 19, + "fp16": "", "fp32": "", "bf16": "" } @@ -17220,31 +16275,31 @@ } }, { - "Platform": "Intel® Xeon® 6979P CPU-only", - "Model": "llama-2-7b-chat", + "Platform": "Intel® Arc™ B-Series Graphics B570", + "Model": "mobilenet-v2", "featured_SKU": true, "whats_new_model": false, - "PlatformType": "Intel® Xeon®, CPU-only", + "PlatformType": "Accelerator Platforms", "Parameters": { "throughput": { "Precisions": [ { - "int4": 139.5, - "int8": "", - "fp16": 132, + "int4": "", + "int8": 4116.03, + "fp16": 3936.74, "fp32": "", "bf16": "" } ], - "Unit": "Tokens/sec", + "Unit": "FPS", "UnitDesc": "higher is better" }, "latency": { "Precisions": [ { - "int4": 7.2, - "int8": "", - "fp16": 7.6, + "int4": "", + "int8": 0.40, + "fp16": "", "fp32": "", "bf16": "" } @@ -17255,32 +16310,32 @@ } }, { - "Platform": "Intel® Xeon® 6979P CPU-only", - "Model": "llama-3.2-3b-instruct", + "Platform": "Intel® Arc™ B-Series Graphics B570", + "Model": "phi-3-mini-4k-instruct", "featured_SKU": true, "whats_new_model": true, - "PlatformType": "Intel® Xeon®, CPU-only", + "PlatformType": "Accelerator Platforms", "Parameters": { "throughput": { "Precisions": [ { - "int4": 272.7, - "int8": 65, + "int4": 96.3, + "int8": "", "fp16": "", "fp32": "", "bf16": "" } ], - "Unit": "Tokens/sec", + "Unit": "Tokens per Sec", "UnitDesc": "higher is better" }, "latency": { "Precisions": [ { - "int4": 3.7, + "int4": 10.4, "int8": "", "fp16": "", - "fp32": 15.4, + "fp32": "", "bf16": "" } ], @@ -17290,31 +16345,31 @@ } }, { - "Platform": "Intel® Xeon® 6979P CPU-only", - "Model": "llama-3-8b", + "Platform": "Intel® Arc™ B-Series Graphics B570", + "Model": "qwen2-7b", "featured_SKU": true, - "whats_new_model": false, - "PlatformType": "Intel® Xeon®, CPU-only", + "whats_new_model": true, + "PlatformType": "Accelerator Platforms", "Parameters": { "throughput": { "Precisions": [ { - "int4": 148.2, + "int4": 68.2, "int8": "", - "fp16": 57.2, + "fp16": "", "fp32": "", "bf16": "" } ], - "Unit": "Tokens/sec", + "Unit": "Tokens per Sec", "UnitDesc": "higher is better" }, "latency": { "Precisions": [ { - "int4": 6.7, + "int4": 14.7, "int8": "", - "fp16": 17.5, + "fp16": "", 
"fp32": "", "bf16": "" } @@ -17325,31 +16380,31 @@ } }, { - "Platform": "Intel® Xeon® 6979P CPU-only", - "Model": "mistral-7b-v0.1", + "Platform": "Intel® Arc™ B-Series Graphics B570", + "Model": "resnet-50", "featured_SKU": true, "whats_new_model": false, - "PlatformType": "Intel® Xeon®, CPU-only", + "PlatformType": "Accelerator Platforms", "Parameters": { "throughput": { "Precisions": [ { - "int4": 126.4, - "int8": "", - "fp16": 61.4, + "int4": "", + "int8": 2830.65, + "fp16": 1851.03, "fp32": "", "bf16": "" } ], - "Unit": "Tokens/sec", + "Unit": "FPS", "UnitDesc": "higher is better" }, "latency": { "Precisions": [ { - "int4": 7.9, - "int8": "", - "fp16": 16.3, + "int4": "", + "int8": 0.45, + "fp16": "", "fp32": "", "bf16": "" } @@ -17360,31 +16415,31 @@ } }, { - "Platform": "Intel® Xeon® 6979P CPU-only", - "Model": "phi-3-mini-4k-instruct", + "Platform": "Intel® Arc™ B-Series Graphics B570", + "Model": "ssd-resnet34-1200", "featured_SKU": true, "whats_new_model": false, - "PlatformType": "Intel® Xeon®, CPU-only", + "PlatformType": "Accelerator Platforms", "Parameters": { "throughput": { "Precisions": [ { - "int4": 176.6, - "int8": "", - "fp16": 111.9, + "int4": "", + "int8": 57.74, + "fp16": 47.88, "fp32": "", "bf16": "" } ], - "Unit": "Tokens/sec", + "Unit": "FPS", "UnitDesc": "higher is better" }, "latency": { "Precisions": [ { - "int4": 5.7, - "int8": "", - "fp16": 8.9, + "int4": "", + "int8": 21.12, + "fp16": "", "fp32": "", "bf16": "" } @@ -17395,31 +16450,31 @@ } }, { - "Platform": "Intel® Xeon® 6979P CPU-only", - "Model": "qwen2-7b", + "Platform": "Intel® Arc™ B-Series Graphics B570", + "Model": "ssd_mobilenet_v1_coco", "featured_SKU": true, - "whats_new_model": true, - "PlatformType": "Intel® Xeon®, CPU-only", + "whats_new_model": false, + "PlatformType": "Accelerator Platforms", "Parameters": { "throughput": { "Precisions": [ { - "int4": 164.4, - "int8": "", - "fp16": 62.2, + "int4": "", + "int8": 2202.46, + "fp16": 2063.27, "fp32": "", "bf16": "" } ], - "Unit": "Tokens/sec", + "Unit": "FPS", "UnitDesc": "higher is better" }, "latency": { "Precisions": [ { - "int4": 6.1, - "int8": "", - "fp16": 16.1, + "int4": "", + "int8": 0.63, + "fp16": "", "fp32": "", "bf16": "" } @@ -17430,36 +16485,36 @@ } }, { - "Platform": "Intel® Xeon® 6979P CPU-only", - "Model": "stable-diffusion-v1-5", + "Platform": "Intel® Arc™ B-Series Graphics B570", + "Model": "yolo_v8n", "featured_SKU": true, "whats_new_model": false, - "PlatformType": "Intel® Xeon®, CPU-only", + "PlatformType": "Accelerator Platforms", "Parameters": { "throughput": { "Precisions": [ { "int4": "", - "int8": "", - "fp16": "", + "int8": 946.02, + "fp16": 943.72, "fp32": "", "bf16": "" } ], - "Unit": "n/a", - "UnitDesc": "n/a" + "Unit": "FPS", + "UnitDesc": "higher is better" }, "latency": { "Precisions": [ { "int4": "", - "int8": 4.0, - "fp16": 4.1, + "int8": 1.47, + "fp16": "", "fp32": "", "bf16": "" } ], - "Unit": "Image gen. 
time in sec.", + "Unit": "ms", "UnitDesc": "lower is better" } } diff --git a/docs/sphinx_setup/_static/benchmarks_files/data/graph-data-ovms-genai.json b/docs/sphinx_setup/_static/benchmarks_files/data/graph-data-ovms-genai.json index f96fb11e6b029d..0de8f188e7de34 100644 --- a/docs/sphinx_setup/_static/benchmarks_files/data/graph-data-ovms-genai.json +++ b/docs/sphinx_setup/_static/benchmarks_files/data/graph-data-ovms-genai.json @@ -1,45 +1,330 @@ [ + { + "Platform": "Intel® Xeon® Platinum 8380", + "Model": "meta-llama/Llama-2-7b-chat-hf", + "featured_SKU": false, + "whats_new_model": false, + "PlatformType": "Server Platforms (Intel® Xeon®)", + "Parameters": { + "Ovms": { + "Precisions": [ + { + "Throughput": { + "0.2": 94.97, + "0.4": 187.12, + "0.6": 271.85, + "0.8": 290.81, + "1.0": 291.39, + "2.0": 291.45, + "inf": 291.59 + }, + "Latency": { + "0.2": 74.35, + "0.4": 122.25, + "0.6": 467.49, + "0.8": 749.39, + "1.0": 771.39, + "2.0": 773.31, + "inf": 783.63 + } + } + ] + }, + "Vllm": { + "Precisions": [ + { + "Throughput": { + "0.2": 94.83, + "0.4": 187.83, + "0.6": 272.32, + "0.8": 284.07, + "1.0": 291.88, + "2.0": 291.91, + "inf": 288.62 + }, + "Latency": { + "0.2": 82.31, + "0.4": 134.38, + "0.6": 495.99, + "0.8": 794.41, + "1.0": 798.39, + "2.0": 800.33, + "inf": 809.56 + } + } + ] + } + } + }, + { + "Platform": "Intel® Xeon® Platinum 8480+", + "Model": "meta-llama/Llama-2-7b-chat-hf", + "featured_SKU": true, + "whats_new_model": false, + "PlatformType": "Server Platforms (Intel® Xeon®)", + "Parameters": { + "Ovms": { + "Precisions": [ + { + "Throughput": { + "0.2": 95.15, + "0.4": 188.31, + "0.6": 279.3, + "0.8": 366.78, + "1.0": 454.27, + "2.0": 788.9, + "inf": 825.97 + }, + "Latency": { + "0.2": 60.88, + "0.4": 71.96, + "0.6": 83.45, + "0.8": 103.77, + "1.0": 128.12, + "2.0": 237.62, + "inf": 253.59 + } + } + ] + }, + "Vllm": { + "Precisions": [ + { + "Throughput": { + "0.2": 95.06, + "0.4": 188.47, + "0.6": 280.54, + "0.8": 367.47, + "1.0": 450.81, + "2.0": 774.57, + "inf": 793.78 + }, + "Latency": { + "0.2": 63.84, + "0.4": 76.22, + "0.6": 87.21, + "0.8": 104.75, + "1.0": 136.77, + "2.0": 259.2, + "inf": 273.58 + } + } + ] + } + } + }, { "Platform": "Intel® Xeon® Platinum 8580", - "Model": "mistralai/Mistral-7B-v0.1", - "PlatformType": "None", + "Model": "meta-llama/Llama-2-7b-chat-hf", + "featured_SKU": true, + "whats_new_model": false, + "PlatformType": "Server Platforms (Intel® Xeon®)", "Parameters": { + "Ovms": { + "Precisions": [ + { + "Throughput": { + "0.2": 95.29, + "0.4": 188.33, + "0.6": 280.09, + "0.8": 367.29, + "1.0": 453.21, + "2.0": 780.05, + "inf": 751.34 + }, + "Latency": { + "0.2": 52.44, + "0.4": 70.06, + "0.6": 84.54, + "0.8": 108.91, + "1.0": 136.45, + "2.0": 253.55, + "inf": 281.85 + } + } + ] + }, "Vllm": { "Precisions": [ { "Throughput": { - "0.2": "350.06", - "0.6": "486.89", - "0.8": "575.92", - "2.0": "778.07" + "0.2": 95.0, + "0.4": 188.26, + "0.6": 279.78, + "0.8": 366.69, + "1.0": 450.26, + "2.0": 770.74, + "inf": 794.39 + }, + "Latency": { + "0.2": 58.07, + "0.4": 77.65, + "0.6": 91.14, + "0.8": 113.61, + "1.0": 144.21, + "2.0": 269.13, + "inf": 273.27 } - }, + } + ] + } + } + }, + { + "Platform": "Intel® Xeon® Platinum 8380", + "Model": "meta-llama/Meta-Llama-3-8B-Instruct", + "featured_SKU": false, + "whats_new_model": true, + "PlatformType": "Server Platforms (Intel® Xeon®)", + "Parameters": { + "Ovms": { + "Precisions": [ { + "Throughput": { + "0.2": 82.46, + "0.4": 162.73, + "0.6": 240.08, + "0.8": 273.75, + "1.0": 275.85, + "2.0": 276.3, 
+ "inf": 275.15 + }, "Latency": { - "0.2": "60.93", - "0.6": "91.63", - "0.8": "113.61", - "2.0": "240.25" + "0.2": 76.49, + "0.4": 122.1, + "0.6": 318.14, + "0.8": 785.8, + "1.0": 805.58, + "2.0": 809.37, + "inf": 816.2 } } ] }, + "Vllm": { + "Precisions": [ + { + "Throughput": { + "0.2": 82.32, + "0.4": 162.98, + "0.6": 239.28, + "2.0": 270.37 + }, + "Latency": { + "0.2": 87.92, + "0.4": 142.3, + "0.6": 343.36, + "2.0": 873.0 + } + } + ] + } + } + }, + { + "Platform": "Intel® Xeon® Platinum 8480+", + "Model": "meta-llama/Meta-Llama-3-8B-Instruct", + "featured_SKU": true, + "whats_new_model": true, + "PlatformType": "Server Platforms (Intel® Xeon®)", + "Parameters": { + "Ovms": { + "Precisions": [ + { + "Throughput": { + "0.2": 82.61, + "0.4": 164.44, + "0.6": 244.92, + "0.8": 323.34, + "1.0": 400.78, + "2.0": 731.9, + "inf": 848.45 + }, + "Latency": { + "0.2": 60.77, + "0.4": 69.1, + "0.6": 74.36, + "0.8": 81.41, + "1.0": 100.17, + "2.0": 206.5, + "inf": 246.56 + } + } + ] + }, + "Vllm": { + "Precisions": [ + { + "Throughput": { + "0.2": 82.54, + "0.4": 163.66, + "0.6": 243.88, + "0.8": 322.75, + "1.0": 400.46, + "2.0": 727.1 + }, + "Latency": { + "0.2": 65.37, + "0.4": 75.87, + "0.6": 81.14, + "0.8": 93.91, + "1.0": 107.13, + "2.0": 229.57 + } + } + ] + } + } + }, + { + "Platform": "Intel® Xeon® Platinum 8580", + "Model": "meta-llama/Meta-Llama-3-8B-Instruct", + "featured_SKU": true, + "whats_new_model": true, + "PlatformType": "Server Platforms (Intel® Xeon®)", + "Parameters": { "Ovms": { "Precisions": [ { "Throughput": { - "0.2": "90.98", - "0.6": "266.24", - "0.8": "351.63", - "2.0": "195.16" + "0.2": 82.55, + "0.4": 164.52, + "0.6": 243.96, + "0.8": 323.07, + "1.0": 399.68, + "2.0": 727.18, + "inf": 856.72 + }, + "Latency": { + "0.2": 54.57, + "0.4": 69.17, + "0.6": 80.32, + "0.8": 92.94, + "1.0": 111.06, + "2.0": 215.46, + "inf": 245.72 } - }, + } + ] + }, + "Vllm": { + "Precisions": [ { + "Throughput": { + "0.2": 82.64, + "0.6": 243.81, + "0.8": 321.8, + "1.0": 398.78, + "2.0": 722.48, + "inf": 792.34 + }, "Latency": { - "0.2": "54.9", - "0.6": "78.78", - "0.8": "95.78", - "2.0": "352.23" + "0.2": 61.49, + "0.6": 90.54, + "0.8": 106.25, + "1.0": 123.6, + "2.0": 245.91, + "inf": 279.21 } } ] @@ -47,46 +332,168 @@ } }, { - "Platform": "Intel® Xeon® Platinum 8530", + "Platform": "Intel® Xeon® Platinum 8380", "Model": "mistralai/Mistral-7B-v0.1", - "PlatformType": "None", + "featured_SKU": false, + "whats_new_model": false, + "PlatformType": "Server Platforms (Intel® Xeon®)", "Parameters": { + "Ovms": { + "Precisions": [ + { + "Throughput": { + "0.2": 91.74, + "0.4": 180.4, + "0.6": 262.97, + "0.8": 287.36, + "1.0": 289.08, + "2.0": 289.06, + "inf": 290.69 + }, + "Latency": { + "0.2": 74.84, + "0.4": 115.4, + "0.6": 345.64, + "0.8": 757.42, + "1.0": 776.6, + "2.0": 778.29, + "inf": 784.42 + } + } + ] + }, "Vllm": { "Precisions": [ { "Throughput": { - "0.2": "350.06", - "0.6": "486.89", - "0.8": "575.92", - "2.0": "778.07" + "0.2": 97.21, + "0.4": 192.46, + "0.6": 265.82, + "0.8": 273.24, + "1.0": 272.65, + "inf": 274.0 + }, + "Latency": { + "0.2": 166.77, + "0.4": 161.76, + "0.6": 666.89, + "0.8": 802.15, + "1.0": 810.26, + "inf": 807.71 } - }, + } + ] + } + } + }, + { + "Platform": "Intel® Xeon® Platinum 8480+", + "Model": "mistralai/Mistral-7B-v0.1", + "featured_SKU": true, + "whats_new_model": false, + "PlatformType": "Server Platforms (Intel® Xeon®)", + "Parameters": { + "Ovms": { + "Precisions": [ { + "Throughput": { + "0.2": 90.95, + "0.4": 181.06, + "0.6": 267.29, + "0.8": 
351.62, + "1.0": 431.45, + "2.0": 751.85, + "inf": 596.0 + }, "Latency": { - "0.2": "60.93", - "0.6": "91.63", - "0.8": "113.61", - "2.0": "240.25" + "0.2": 59.95, + "0.4": 63.41, + "0.6": 73.42, + "0.8": 85.99, + "1.0": 98.67, + "2.0": 205.2, + "inf": 205.97 } } ] }, + "Vllm": { + "Precisions": [ + { + "Throughput": { + "0.2": 98.18, + "0.4": 194.35, + "0.6": 287.28, + "0.8": 376.31, + "1.0": 460.32, + "2.0": 771.81, + "inf": 789.38 + }, + "Latency": { + "0.2": 64.88, + "0.4": 73.3, + "0.6": 84.37, + "0.8": 100.8, + "1.0": 133.98, + "2.0": 240.99, + "inf": 251.55 + } + } + ] + } + } + }, + { + "Platform": "Intel® Xeon® Platinum 8580", + "Model": "mistralai/Mistral-7B-v0.1", + "featured_SKU": true, + "whats_new_model": false, + "PlatformType": "Server Platforms (Intel® Xeon®)", + "Parameters": { "Ovms": { "Precisions": [ { "Throughput": { - "0.2": "90.98", - "0.6": "266.24", - "0.8": "351.63", - "2.0": "195.16" + "0.2": 91.2, + "0.4": 180.14, + "0.6": 267.75, + "0.8": 351.12, + "1.0": 428.31, + "2.0": 744.99, + "inf": 852.05 + }, + "Latency": { + "0.2": 54.31, + "0.4": 67.14, + "0.6": 77.59, + "0.8": 92.17, + "1.0": 112.75, + "2.0": 225.48, + "inf": 241.49 } - }, + } + ] + }, + "Vllm": { + "Precisions": [ { + "Throughput": { + "0.2": 98.1, + "0.4": 194.47, + "0.6": 286.97, + "0.8": 375.84, + "1.0": 460.21, + "2.0": 764.54, + "inf": 787.97 + }, "Latency": { - "0.2": "54.9", - "0.6": "78.78", - "0.8": "95.78", - "2.0": "352.23" + "0.2": 62.26, + "0.4": 78.08, + "0.6": 91.61, + "0.8": 116.71, + "1.0": 141.76, + "2.0": 250.38, + "inf": 254.25 } } ] diff --git a/docs/sphinx_setup/_static/benchmarks_files/llm_models_7-155H.csv b/docs/sphinx_setup/_static/benchmarks_files/llm_models_7-155H.csv index 0d16c5c4998329..9481b5619244e2 100644 --- a/docs/sphinx_setup/_static/benchmarks_files/llm_models_7-155H.csv +++ b/docs/sphinx_setup/_static/benchmarks_files/llm_models_7-155H.csv @@ -1,156 +1,96 @@ Topology,Precision,Input Size,max rss memory,1st latency (ms),2nd latency (ms),2nd tok/sec -opt-125m-gptq,INT4-MIXED,32,965.9,29,7.7,129.87 -opt-125m-gptq,INT4-MIXED,1024,1507.9,113.1,7.8,128.21 -tiny-llama-1.1b-chat,INT4-MIXED,32,1831.8,46.5,16.7,59.88 -tiny-llama-1.1b-chat,INT4-MIXED,1024,1806.3,635,17.8,56.18 -qwen2-0.5b,INT4-MIXED,32,2551.7,61.4,18.3,54.64 -qwen2-0.5b,INT4-MIXED,1024,2976.6,356.1,19.2,52.08 -tiny-llama-1.1b-chat,INT8-CW,32,1987.4,56,21.6,46.30 -tiny-llama-1.1b-chat,INT8-CW,1024,2209.1,772.7,22.6,44.25 -qwen2-0.5b,INT8-CW,32,2484.9,57.3,22.8,43.86 -qwen2-0.5b,INT8-CW,1024,3102.5,407.1,23.9,41.84 -qwen2-1.5b,INT4-MIXED,32,4265.2,71.7,25.5,39.22 -qwen2-1.5b,INT4-MIXED,1024,4884.5,862.4,26.8,37.31 -dolly-v2-3b,INT4-MIXED,32,2401.3,89.6,27.5,36.36 -red-pajama-incite-chat-3b-v1,INT4-MIXED,32,2511.5,78.6,28.2,35.46 -phi-2,INT4-MIXED,32,2279.5,95.7,29.1,34.36 -minicpm-1b-sft,INT4-MIXED,31,2759.9,104.4,30.9,32.36 -phi-2,INT4-MIXED,32,2620.1,100.8,31,32.26 -stable-zephyr-3b-dpo,INT4-MIXED,30,2636.5,86.8,31.7,31.55 -dolly-v2-3b,INT4-MIXED,1024,3137.1,1782.9,32.2,31.06 -red-pajama-incite-chat-3b-v1,INT4-MIXED,1020,3118.5,1831.7,33.3,30.03 -red-pajama-incite-chat-3b-v1,INT4-MIXED,1024,2862.7,1821.1,33.5,29.85 -qwen2-1.5b,INT8-CW,32,4831.2,87,33.8,29.59 -opt-2.7b,INT4-MIXED,31,2898.3,73.2,33.9,29.50 -phi-2,INT4-MIXED,1024,2797.4,1887,34,29.41 -orca-mini-3b,INT4-MIXED,32,2877.8,100.3,35,28.57 -stablelm-3b-4e1t,INT4-MIXED,32,2669.4,94.7,35.3,28.33 -qwen2-1.5b,INT8-CW,1024,5455.8,1047.6,35.3,28.33 -minicpm-1b-sft,INT8-CW,31,3104.1,103.5,35.3,28.33 -phi-2,INT4-MIXED,1024,3039.8,1917.4,35.9,27.86 
-stable-zephyr-3b-dpo,INT4-MIXED,946,3411.4,1695,37,27.03 -gemma-2b-it,INT4-MIXED,32,3991.7,116.1,37.9,26.39 -opt-2.7b,INT4-MIXED,937,3617.5,1764.9,38.2,26.18 -phi-3-mini-4k-instruct,INT4-MIXED,31,2935.3,111.6,38.2,26.18 -phi-3-mini-4k-instruct,INT4-MIXED,38,3102.4,134,38.4,26.04 -phi-3-mini-4k-instruct,INT4-MIXED,31,2986.1,114.1,38.9,25.71 -phi-3-mini-4k-instruct,INT4-MIXED,38,2977.4,131.1,39,25.64 -gemma-2b-it,INT4-MIXED,1024,4973.3,1249.2,39.7,25.19 -stablelm-3b-4e1t,INT4-MIXED,1024,3196.9,2045.4,39.9,25.06 -dolly-v2-3b,INT8-CW,32,3490.2,107.4,41.5,24.10 -red-pajama-incite-chat-3b-v1,INT8-CW,32,3457.9,105,42.5,23.53 -opt-2.7b,INT8-CW,31,3686.8,107.5,44.1,22.68 -phi-2,INT8-CW,32,3554.9,116.6,44.1,22.68 -phi-3-mini-4k-instruct,INT4-MIXED,1023,3390.7,2277.1,44.2,22.62 -phi-3-mini-4k-instruct,INT4-MIXED,1061,3643.6,2485,44.4,22.52 -phi-3-mini-4k-instruct,INT4-MIXED,1023,3516.4,2280.9,44.5,22.47 -phi-3-mini-4k-instruct,INT4-MIXED,1061,3537.2,2522.4,44.7,22.37 -orca-mini-3b,INT4-MIXED,1024,3557.3,1898.9,45,22.22 -minicpm-1b-sft,FP16,31,3814.4,97.9,45.4,22.03 -stablelm-3b-4e1t,INT8-CW,32,3486.9,100.5,46.1,21.69 -stable-zephyr-3b-dpo,INT8-CW,30,3516.7,101.9,46.1,21.69 -dolly-v2-3b,INT8-CW,1024,4265.9,2178.6,46.2,21.65 -red-pajama-incite-chat-3b-v1,INT8-CW,1020,3979.1,2219.7,47.2,21.19 -red-pajama-incite-chat-3b-v1,INT8-CW,1024,3975.5,2199.7,47.3,21.14 -opt-2.7b,INT8-CW,937,4358.6,1981.8,48.4,20.66 -phi-2,INT8-CW,1024,4058.1,2280.1,48.9,20.45 -gemma-2b-it,INT8-CW,32,4786.8,119.8,49.4,20.24 -chatglm3-6b,INT4-MIXED,32,4141.5,166.6,49.7,20.12 -stablelm-3b-4e1t,INT8-CW,1024,4054.8,2243.5,50.7,19.72 -stable-zephyr-3b-dpo,INT8-CW,946,4521.8,1816.4,51.3,19.49 -gemma-2b-it,INT8-CW,1024,5810.7,1580,51.3,19.49 -chatglm3-6b,INT4-MIXED,32,4651.4,164.7,51.6,19.38 -chatglm3-6b,INT4-MIXED,1024,4235.1,2818.7,52.3,19.12 -orca-mini-3b,INT8-CW,32,4162,109.2,53.3,18.76 -chatglm3-6b,INT4-MIXED,1024,4783.8,2869,54.4,18.38 -gpt-j-6b,INT4-MIXED,32,4667.3,176.7,56.3,17.76 -chatglm3-6b-gptq,INT4-MIXED,32,5369.4,173.9,58.9,16.98 -llama-2-7b-chat-hf,INT4-MIXED,32,4280,173.2,60.1,16.64 -phi-3-mini-4k-instruct,INT8-CW,31,4585.1,123,60.5,16.53 -phi-3-mini-4k-instruct,INT8-CW,38,4597,152,60.5,16.53 -chatglm2-6b,INT4-MIXED,32,4847.8,158.7,60.6,16.50 -vicuna-7b-v1.5,INT4-MIXED,32,4476.9,178.2,61.2,16.34 -chatglm3-6b-gptq,INT4-MIXED,1024,5217.6,2863.7,61.3,16.31 -mistral-7b-v0.1,INT4-MIXED,31,4413.6,194,61.7,16.21 -qwen2-7b,INT4-MIXED,32,7044.7,184.4,61.7,16.21 -mistral-7b-v0.1,INT4-MIXED,32,4427.6,193.3,61.8,16.18 -orca-mini-3b,INT8-CW,1024,4821.6,2239.1,62,16.13 -codegen25-7b,INT4-MIXED,32,4687.2,176.2,62.7,15.95 -chatglm2-6b,INT4-MIXED,1024,5165.9,3148,63,15.87 -llama-2-7b-gptq,INT4-MIXED,32,4632.8,175.2,63.4,15.77 -stablelm-7b,INT4-MIXED,32,5219.5,206.3,63.4,15.77 -qwen-7b-chat,INT4-MIXED,32,7805.6,193.8,63.6,15.72 -gpt-j-6b,INT4-MIXED,1024,5314.9,3111.8,63.6,15.72 -qwen2-7b,INT4-MIXED,1024,7716.2,3548.3,64.1,15.60 -llama-3-8b,INT4-MIXED,32,4910.9,204.8,64.7,15.46 -mistral-7b-v0.1,INT4-MIXED,1024,4720.8,3667.1,64.8,15.43 -mistral-7b-v0.1,INT4-MIXED,1007,4704.7,3685.4,64.9,15.41 -llama-3.1-8b,INT4-MIXED,31,4850.3,211.5,64.9,15.41 -phi-3-mini-4k-instruct,INT8-CW,1023,5128.6,2815.2,65.7,15.22 -phi-3-mini-4k-instruct,INT8-CW,1061,5155,3407.9,65.9,15.17 -mistral-7b-v0.1,INT4-MIXED,32,4939.3,192,66.5,15.04 -llama-3-8b,INT4-MIXED,33,4919.4,261.9,67.2,14.88 -llama-2-7b-chat-hf,INT4-MIXED,1024,4948.2,3811,67.3,14.86 -qwen1.5-7b-chat,INT4-MIXED,32,5943.1,180.5,67.7,14.77 -qwen-7b-chat-gptq,INT4-MIXED,32,8057,187,68.1,14.68 
-llama-3-8b,INT4-MIXED,32,5503.5,198.4,68.1,14.68 -qwen-7b-chat,INT4-MIXED,32,8091.6,185.9,68.1,14.68 -llama-3-8b,INT4-MIXED,1024,5569.1,3920.5,68.2,14.66 -llama-3.1-8b,INT4-MIXED,31,5358.6,201,68.2,14.66 -stablelm-7b,INT4-MIXED,1020,5804.4,3726.6,68.8,14.53 -llama-3.1-8b,INT4-MIXED,31,5452.6,202.9,68.8,14.53 -llama-2-7b-chat-hf,INT4-MIXED,32,5023,165.7,69,14.49 -llama-3-8b,INT4-MIXED,32,5413.6,202,69.1,14.47 -llama-3-8b,INT4-MIXED,33,5440.4,262.1,69.2,14.45 -codegen25-7b,INT4-MIXED,1024,5434.6,3513.2,69.9,14.31 -mistral-7b-v0.1,INT4-MIXED,1024,5614.9,3819.1,70,14.29 -mistral-7b-v0.1,INT4-MIXED,31,4927.8,205,70.5,14.18 -llama-3-8b,INT4-MIXED,33,5498.9,270.7,70.6,14.16 -llama-3-8b,INT4-MIXED,1025,5577.4,4271.2,70.6,14.16 -llama-2-7b-gptq,INT4-MIXED,1024,5302.2,3529.4,70.7,14.14 -zephyr-7b-beta,INT4-MIXED,32,5212.4,190.6,71.2,14.04 -llama-3-8b,INT4-MIXED,1024,6161.1,3918,71.5,13.99 -llama-3-8b,INT4-MIXED,1025,6098,4441.8,72.3,13.83 -llama-3-8b,INT4-MIXED,1024,6071.7,3972.2,72.4,13.81 -mistral-7b-v0.1,INT4-MIXED,1007,5224.1,4153.4,73.8,13.55 -llama-3-8b,INT4-MIXED,1025,6156.9,4357,73.9,13.53 -zephyr-7b-beta,INT4-MIXED,1024,5511.6,3978,74.4,13.44 -opt-2.7b,FP16,31,9220.3,107.8,74.7,13.39 -dolly-v2-3b,FP16,32,6058.9,109.9,74.7,13.39 -qwen1.5-7b-chat,INT4-MIXED,1024,7063.2,3791.7,75,13.33 -qwen-7b-chat,INT4-MIXED,1024,8919.5,3763.9,75,13.33 -red-pajama-incite-chat-3b-v1,FP16,32,6036.5,107.5,75.9,13.18 -llama-2-7b-chat-hf,INT4-MIXED,1024,5716.8,4231.7,76.2,13.12 -phi-2,FP16,32,6090.1,115.2,77.1,12.97 -stable-zephyr-3b-dpo,FP16,30,6113.1,112.1,78.6,12.72 -qwen-7b-chat,INT4-MIXED,1024,9212.9,3857.4,78.6,12.72 -stablelm-3b-4e1t,FP16,32,6065.4,110.2,78.7,12.71 -opt-2.7b,FP16,937,9733.8,3750.8,78.8,12.69 -dolly-v2-3b,FP16,1024,6615.2,2230.9,79.1,12.64 -red-pajama-incite-chat-3b-v1,FP16,1020,6588.3,2259.4,80.2,12.47 -glm-4-9b,INT4-MIXED,33,6386.2,328,80.4,12.44 -red-pajama-incite-chat-3b-v1,FP16,1024,6570.3,2268.7,80.4,12.44 -baichuan2-7b-chat,INT4-MIXED,32,5977.9,201.7,81,12.35 -glm-4-9b,INT4-MIXED,32,6389.7,248.1,81,12.35 -phi-2,FP16,1024,6646.2,2406.7,81.4,12.29 -stable-zephyr-3b-dpo,FP16,946,6875.7,1868.2,82.9,12.06 -stablelm-3b-4e1t,FP16,1024,6636.1,2036.9,83,12.05 -chatglm2-6b,INT8-CW,32,6731.8,159.2,84.4,11.85 -glm-4-9b,INT4-MIXED,1025,7061.4,4939.2,85.2,11.74 -qwen-7b-chat-gptq,INT4-MIXED,1024,9175.3,3898,85.3,11.72 -gemma-7b-it,INT4-MIXED,32,7883.9,230.5,86,11.63 -gemma-7b-it,INT4-MIXED,32,8002.6,235,86.1,11.61 -glm-4-9b,INT4-MIXED,1024,7064.9,4411.2,86.2,11.60 -gpt-j-6b,INT8-CW,32,7009.2,176.8,86.4,11.57 -chatglm2-6b,INT8-CW,1024,7050.5,3871.6,86.8,11.52 -chatglm3-6b,INT8-CW,32,6755.9,159,86.8,11.52 -baichuan2-7b-chat,INT4-MIXED,1024,7033.3,4049,88.8,11.26 -chatglm3-6b,INT8-CW,1024,7076.5,3865.9,89.2,11.21 -qwen-7b-chat,INT4-MIXED,32,9245.7,176.3,90,11.11 -gemma-7b-it,INT4-MIXED,1024,9449.4,4305.8,93.2,10.73 -gpt-j-6b,INT8-CW,1024,7672.3,4181.1,93.5,10.70 -gemma-7b-it,INT4-MIXED,1024,9330.5,4222.5,93.7,10.67 -orca-mini-3b,FP16,32,7416.5,122.3,94.7,10.56 -codegen25-7b,INT8-CW,32,7557.6,170.7,98.4,10.16 -qwen-7b-chat,INT4-MIXED,1024,10371.1,4271.7,98.9,10.11 -llama-2-7b-chat-hf,INT8-CW,32,7390.6,171.6,99.9,10.01 +opt-125m-gptq,INT4-MIXED,32,1116,25.8,8.1,123.5 +opt-125m-gptq,INT4-MIXED,1024,1187.1,75.2,8.2,122.0 +qwen2-0.5b,INT4-MIXED,32,1587.4,45.1,15.4,64.9 +qwen2-0.5b,INT4-MIXED,1024,1587.8,228.2,15.6,64.1 +tiny-llama-1.1b-chat,INT4-MIXED,32,1704.2,42.4,17.6,56.8 +tiny-llama-1.1b-chat,INT4-MIXED,1024,1616.3,489.2,18.9,52.9 +qwen2-0.5b,INT8-CW,32,1477.3,51.5,20.2,49.5 
+qwen2-0.5b,INT8-CW,1024,1592,263.7,20.6,48.5 +tiny-llama-1.1b-chat,INT8-CW,32,1855.6,60.2,20.7,48.3 +tiny-llama-1.1b-chat,INT8-CW,1024,1992.6,618.2,21.7,46.1 +qwen2-1.5b,INT4-MIXED,32,2024.2,59.6,23.1,43.3 +bloomz-560m,FP16,1024,2773.1,647.8,23.8,42.0 +qwen2-1.5b,INT4-MIXED,1024,2177.7,577.4,23.8,42.0 +bloomz-560m,FP16,32,2582.7,44.2,25.1,39.8 +dolly-v2-3b,INT4-MIXED,32,2507.9,79.8,29.4,34.0 +phi-2,INT4-MIXED,32,2568.9,74.6,29.7,33.7 +qwen2-1.5b,INT8-CW,32,2577.3,81.6,30.5,32.8 +red-pajama-incite-chat-3b-v1,INT4-MIXED,32,2489.4,69.9,30.5,32.8 +minicpm-1b-sft,INT4-MIXED,31,2442.1,84.7,31,32.3 +qwen2-1.5b,INT8-CW,1024,2739.8,773.3,31.2,32.1 +gemma-2b-it,INT4-MIXED,32,2998.2,103.5,31.4,31.8 +dolly-v2-3b,INT4-MIXED,1024,2508.1,1396.6,32,31.3 +gemma-2b-it,INT4-MIXED,1024,3171.5,822.3,32.2,31.1 +phi-2,INT4-MIXED,1024,2940.5,1395.3,32.2,31.1 +red-pajama-incite-chat-3b-v1,INT4-MIXED,1023,2489.6,1435.5,33.1,30.2 +minicpm-1b-sft,INT8-CW,31,2818.6,86.9,33.4,29.9 +stable-zephyr-3b-dpo,INT4-MIXED,32,2638.2,87.4,33.8,29.6 +stablelm-3b-4e1t,INT4-MIXED,32,2750.5,89.4,35.6,28.1 +stablelm-3b-4e1t,INT4-MIXED,1023,3115.5,1473.1,38.1,26.2 +phi-3-mini-4k-instruct,INT4-MIXED,32,3039.1,109.2,40.4,24.8 +phi-2,INT8-CW,32,3599.7,107.5,42.1,23.8 +gemma-2b-it,INT8-CW,32,3845.4,111.3,42.2,23.7 +dolly-v2-3b,INT8-CW,32,3596.4,110.1,42.5,23.5 +gemma-2b-it,INT8-CW,1024,3844.6,1183,43,23.3 +red-pajama-incite-chat-3b-v1,INT8-CW,32,3590,111,43.3,23.1 +phi-3-mini-4k-instruct,INT4-MIXED,1024,3467.6,1721.6,43.5,23.0 +stablelm-3b-4e1t,INT8-CW,32,3582.8,111,44.3,22.6 +stable-zephyr-3b-dpo,INT8-CW,32,3607.2,110.2,44.5,22.5 +phi-2,INT8-CW,1024,3982,1508,44.6,22.4 +dolly-v2-3b,INT8-CW,1024,3596.5,1529.1,44.9,22.3 +minicpm-1b-sft,FP16,31,3769.9,84,45.4,22.0 +red-pajama-incite-chat-3b-v1,INT8-CW,1023,3952,2064.5,45.7,21.9 +stablelm-3b-4e1t,INT8-CW,1023,3934.5,2286.3,46.8,21.4 +gpt-j-6b,INT4-MIXED,32,4443.5,159.3,56.7,17.6 +phi-3-mini-4k-instruct,INT8-CW,32,4545,117.1,57.6,17.4 +phi-3-mini-4k-instruct,INT8-CW,1024,4810.4,2068.8,60.5,16.5 +gpt-j-6b,INT4-MIXED,1024,4746.4,2397,60.6,16.5 +falcon-7b-instruct,INT4-MIXED,32,5014,203.7,61.3,16.3 +qwen2-7b,INT4-MIXED,32,5269.4,203.8,62.3,16.1 +codegen25-7b,INT4-MIXED,32,4641.1,170.6,63.5,15.7 +llama-2-7b-gptq,INT4-MIXED,32,4597.3,172.1,63.5,15.7 +falcon-7b-instruct,INT4-MIXED,1024,5230.6,2695.3,63.6,15.7 +qwen2-7b,INT4-MIXED,1024,5370.8,2505.9,63.9,15.6 +decilm-7b-instruct,INT4-MIXED,36,4614.2,301.1,65.3,15.3 +codegen25-7b,INT4-MIXED,1024,4641.9,2629.6,67.4,14.8 +llama-2-7b-gptq,INT4-MIXED,1024,4928.1,2584.3,67.6,14.8 +mistral-7b-v0.1,INT4-MIXED,32,4928.5,180.9,69.2,14.5 +llama-2-7b-chat-hf,INT4-MIXED,32,4985.7,160.3,69.5,14.4 +qwen-7b-chat-gptq,INT4-MIXED,32,5426.7,188.3,69.5,14.4 +llama-3-8b,INT4-MIXED,33,5473.4,285.7,70,14.3 +flan-t5-xxl,INT4-MIXED,33,19293.8,211.7,70.1,14.3 +llama-3-8b,INT4-MIXED,33,5389.2,281,70.8,14.1 +mistral-7b-v0.1,INT4-MIXED,1024,5225.4,2713.3,71.8,13.9 +zephyr-7b-beta,INT4-MIXED,32,5306.1,177.9,72.1,13.9 +llama-3-8b,INT4-MIXED,1025,5615.2,2937.8,72.4,13.8 +llama-3-8b,INT4-MIXED,1025,5531.7,2815.4,73.2,13.7 +llama-2-7b-chat-hf,INT4-MIXED,1024,5319.5,2736.2,73.6,13.6 +phi-2,FP16,32,6197,104.6,74.7,13.4 +zephyr-7b-beta,INT4-MIXED,1024,5306.4,2802.3,74.7,13.4 +qwen-7b-chat-gptq,INT4-MIXED,1024,5934.9,2606.9,75,13.3 +dolly-v2-3b,FP16,32,6195.1,105.3,75.3,13.3 +baichuan2-7b-chat,INT4-MIXED,32,5837.9,188.5,76.8,13.0 +red-pajama-incite-chat-3b-v1,FP16,32,6178.6,118,76.8,13.0 +gemma-7b-it,INT4-MIXED,32,6495.9,230.6,77,13.0 +stablelm-3b-4e1t,FP16,32,6174.2,105.9,77.1,13.0 
+stable-zephyr-3b-dpo,FP16,32,6217.8,107.9,77.2,13.0 +glm-4-9b-chat,INT4-MIXED,32,6333.4,225,77.3,12.9 +phi-2,FP16,1024,6411.5,2065.2,77.3,12.9 +dolly-v2-3b,FP16,1024,6410.1,2075,77.7,12.9 +llama-3.1-8b,INT4-MIXED,32,6324.6,182.2,78.8,12.7 +red-pajama-incite-chat-3b-v1,FP16,1023,6394.2,2752.4,79.2,12.6 +stablelm-3b-4e1t,FP16,1023,6386.9,2953.3,79.5,12.6 +glm-4-9b-chat,INT4-MIXED,1024,6439.5,3282.2,80,12.5 +baichuan2-7b-chat,INT4-MIXED,1024,6174.1,2752.6,80.6,12.4 +gemma-7b-it,INT4-MIXED,1024,6795.4,3118.3,80.6,12.4 +llama-3.1-8b,INT4-MIXED,1024,6324.8,2865.7,81.3,12.3 +gpt-j-6b,INT8-CW,32,6793.2,167.6,85,11.8 +qwen-7b-chat,INT4-MIXED,32,7274.8,168.8,85.2,11.7 +gpt-j-6b,INT8-CW,1024,6793.3,2668.4,88.8,11.3 +qwen-7b-chat,INT4-MIXED,1024,7610.3,2991.9,90.6,11.0 +flan-t5-xxl,INT4-MIXED,1139,23514,540.8,94.9,10.5 +falcon-7b-instruct,INT8-CW,32,7764.1,181.3,95.5,10.5 +llama-2-7b-chat-hf,INT8-CW,32,7330.9,172,96.1,10.4 +falcon-7b-instruct,INT8-CW,1024,7987.4,3072.8,98.1,10.2 +qwen2-7b,INT8-CW,32,8175.3,211.3,99.6,10.0 diff --git a/docs/sphinx_setup/_static/benchmarks_files/llm_models_7-258V.csv b/docs/sphinx_setup/_static/benchmarks_files/llm_models_7-258V.csv index 09799a2de31fe6..625ff1d6fe5ed5 100644 --- a/docs/sphinx_setup/_static/benchmarks_files/llm_models_7-258V.csv +++ b/docs/sphinx_setup/_static/benchmarks_files/llm_models_7-258V.csv @@ -1,182 +1,117 @@ Topology,Precision,Input Size,max rss memory,1st latency (ms),2nd latency (ms),2nd tok/sec -opt-125m-gptq,INT4-MIXED,1024,1513.6,81.9,7.8,128.21 -opt-125m-gptq,INT4-MIXED,32,979.9,50.4,7.9,126.58 -tiny-llama-1.1b-chat,INT4-MIXED,1024,1943.3,176.3,16.8,59.52 -tiny-llama-1.1b-chat,INT4-MIXED,32,1982.2,59.5,17.1,58.48 -qwen2-0.5b,INT4-MIXED,32,2678,117.3,18.7,53.48 -tiny-llama-1.1b-chat,INT8-CW,32,2080.9,59.4,19,52.63 -qwen2-0.5b,INT4-MIXED,1024,3036.1,165.5,19.2,52.08 -tiny-llama-1.1b-chat,INT8-CW,1024,2287,241.4,19.6,51.02 -qwen2-0.5b,INT8-CW,1024,3084.9,172.1,20,50.00 -qwen2-0.5b,INT8-CW,32,2518,105.5,21.4,46.73 -red-pajama-incite-chat-3b-v1,INT4-MIXED,32,2793.6,141.8,23.9,41.84 -qwen2-1.5b,INT4-MIXED,32,4515.4,118.7,24,41.67 -qwen2-1.5b,INT4-MIXED,1024,4930.1,229.6,24.3,41.15 -dolly-v2-3b,INT4-MIXED,32,2486.1,174,25.4,39.37 -phi-2,INT4-MIXED,32,2552.9,210.6,26.9,37.17 -red-pajama-incite-chat-3b-v1,INT4-MIXED,1020,2934.1,464.5,27.5,36.36 -qwen2-1.5b,INT8-CW,32,4813.4,119.1,27.8,35.97 -opt-2.7b,INT4-MIXED,31,3172.5,131.9,28.5,35.09 -red-pajama-incite-chat-3b-v1,INT4-MIXED,1024,3038.2,447.1,28.6,34.97 -dolly-v2-3b,INT4-MIXED,1024,2947.4,409,28.8,34.72 -qwen2-1.5b,INT8-CW,1024,5394.8,327.9,29.3,34.13 -stable-zephyr-3b-dpo,INT4-MIXED,30,2728.1,131.2,29.8,33.56 -phi-2,INT4-MIXED,32,2805.1,208.3,30.2,33.11 -minicpm-1b-sft,INT8-CW,31,3104.2,147.8,30.9,32.36 -phi-2,INT4-MIXED,1024,3058.9,602.9,31.1,32.15 -minicpm-1b-sft,INT4-MIXED,31,2970.1,183.7,31.1,32.15 -stablelm-3b-4e1t,INT4-MIXED,32,3077.1,183.2,31.6,31.65 -opt-2.7b,INT4-MIXED,937,3416.7,429.4,31.6,31.65 -stable-zephyr-3b-dpo,INT4-MIXED,946,3211.8,428.8,32.3,30.96 -phi-3-mini-4k-instruct,INT4-MIXED,31,3014.5,116,32.5,30.77 -phi-3-mini-4k-instruct,INT4-MIXED,38,2957.4,153.9,32.5,30.77 -phi-2,INT4-MIXED,1024,3278.9,613.3,33.4,29.94 -phi-3-mini-4k-instruct,INT4-MIXED,38,3288.5,152.9,33.4,29.94 -phi-3-mini-4k-instruct,INT4-MIXED,31,3265.1,123.6,34.1,29.33 -gemma-2b-it,INT4-MIXED,32,4162.1,208.8,34.2,29.24 -stablelm-3b-4e1t,INT4-MIXED,1024,3525.8,524.5,35,28.57 -phi-3-mini-4k-instruct,INT4-MIXED,1061,3427.8,777.5,36.5,27.40 -phi-3-mini-4k-instruct,INT4-MIXED,1023,3405.4,554.1,36.7,27.25 
-gemma-2b-it,INT4-MIXED,1024,5053.1,354.8,36.9,27.10 -minicpm-1b-sft,FP16,31,3595.5,124.9,36.9,27.10 -phi-3-mini-4k-instruct,INT4-MIXED,1061,3547.2,755.8,37.1,26.95 -phi-3-mini-4k-instruct,INT4-MIXED,1023,3528.4,536.4,37.4,26.74 -red-pajama-incite-chat-3b-v1,INT8-CW,32,3747.7,189.9,38.1,26.25 -opt-2.7b,INT8-CW,31,3810.7,145.7,38.5,25.97 -chatglm3-6b,INT4-MIXED,32,4120.7,67.3,38.7,25.84 -dolly-v2-3b,INT8-CW,32,3747,188.4,39.2,25.51 -chatglm3-6b,INT4-MIXED,32,4482.9,69.9,40.7,24.57 -chatglm3-6b,INT4-MIXED,1024,4146,606.8,41,24.39 -opt-2.7b,INT8-CW,937,4458.9,587.8,41.8,23.92 -red-pajama-incite-chat-3b-v1,INT8-CW,1024,4088.4,634.1,41.9,23.87 -red-pajama-incite-chat-3b-v1,INT8-CW,1020,4086.8,653.4,42,23.81 -phi-2,INT8-CW,32,3794.6,202.7,42.1,23.75 -chatglm3-6b,INT4-MIXED,1024,4446.7,598.6,42.3,23.64 -stablelm-3b-4e1t,INT8-CW,32,3652.5,146,42.6,23.47 -stable-zephyr-3b-dpo,INT8-CW,30,3768.6,151.9,42.6,23.47 -dolly-v2-3b,INT8-CW,1024,4092,603.1,42.9,23.31 -stablelm-3b-4e1t,INT8-CW,1024,4143.2,671.7,45.2,22.12 -gemma-2b-it,INT8-CW,32,4878.4,221.6,45.6,21.93 -phi-2,INT8-CW,1024,4153.6,810.3,46,21.74 -llama-2-7b-chat-hf,INT4-MIXED,32,4394.6,109.7,46.2,21.65 -chatglm3-6b-gptq,INT4-MIXED,32,5218.9,79.7,46.7,21.41 -stable-zephyr-3b-dpo,INT8-CW,946,4360.1,627.8,46.8,21.37 -vicuna-7b-v1.5,INT4-MIXED,32,4482.3,101.2,47.2,21.19 -gemma-2b-it,INT8-CW,1024,5837.1,507.1,48,20.83 -llama-2-7b-gptq,INT4-MIXED,32,4734.3,102.8,48.1,20.79 -orca-mini-3b,INT4-MIXED,32,2720.1,132,48.1,20.79 -qwen-7b-chat,INT4-MIXED,32,7803.7,178.5,48.3,20.70 -mistral-7b-v0.1,INT4-MIXED,31,4537.5,99,48.5,20.62 -codegen25-7b,INT4-MIXED,32,4723.3,108.5,48.5,20.62 -chatglm3-6b-gptq,INT4-MIXED,1024,5150.8,614.2,48.8,20.49 -mistral-7b-v0.1,INT4-MIXED,32,4572,102.9,48.8,20.49 -llama-3-8b,INT4-MIXED,33,4991.2,252.2,50.9,19.65 -qwen-7b-chat-gptq,INT4-MIXED,32,8088.4,212.6,51,19.61 -chatglm2-6b,INT4-MIXED,32,4960.6,105.5,51.2,19.53 -gpt-j-6b,INT4-MIXED,32,4699.5,259.2,51.4,19.46 -llama-3.1-8b,INT4-MIXED,31,4897.8,106.9,51.5,19.42 -llama-3-8b,INT4-MIXED,32,4999.7,105.9,51.6,19.38 -qwen-7b-chat,INT4-MIXED,32,8085.9,193.5,51.7,19.34 -falcon-7b-instruct,INT4-MIXED,32,5416.2,175,52.5,19.05 -mistral-7b-v0.1,INT4-MIXED,1007,4772.6,803,52.6,19.01 -qwen1.5-7b-chat,INT4-MIXED,32,6027.3,174.9,53,18.87 -mistral-7b-v0.1,INT4-MIXED,1024,4775,717.6,53,18.87 -llama-2-7b-chat-hf,INT4-MIXED,1024,4976.5,992.1,53.1,18.83 -qwen2-7b,INT4-MIXED,32,7087.1,138.1,53.3,18.76 -llama-2-7b-gptq,INT4-MIXED,1024,5351.2,711.6,53.7,18.62 -llama-3-8b,INT4-MIXED,32,5472.8,109.4,53.7,18.62 -phi-3-mini-4k-instruct,INT8-CW,38,4575.3,115.9,53.7,18.62 -stablelm-7b,INT4-MIXED,32,5213.7,128.5,53.8,18.59 -phi-3-mini-4k-instruct,INT8-CW,31,4571.8,118.9,53.8,18.59 -llama-3-8b,INT4-MIXED,33,5480.4,246.8,53.9,18.55 -llama-3-8b,INT4-MIXED,32,5528.2,144.9,54.3,18.42 -llama-3.1-8b,INT4-MIXED,31,5377.3,112.8,54.3,18.42 -chatglm2-6b,INT4-MIXED,1024,5232.3,759.6,54.6,18.32 -llama-3.1-8b,INT4-MIXED,31,5440.4,126.4,54.8,18.25 -llama-3-8b,INT4-MIXED,33,5532.8,248.2,54.9,18.21 -codegen25-7b,INT4-MIXED,1024,5412.9,714.8,55,18.18 -mistral-7b-v0.1,INT4-MIXED,32,4998.5,117.3,55.2,18.12 -mistral-7b-v0.1,INT4-MIXED,31,5000.2,122.4,55.6,17.99 -llama-3-8b,INT4-MIXED,1024,5594,953.5,56.6,17.67 -gpt-j-6b,INT4-MIXED,1024,5323.8,1254,56.8,17.61 -llama-3-8b,INT4-MIXED,1025,5596.7,1192.3,56.8,17.61 -qwen2-7b,INT4-MIXED,1024,7722.1,714.2,57,17.54 -phi-3-mini-4k-instruct,INT8-CW,1023,5067.1,818.5,57.4,17.42 -phi-3-mini-4k-instruct,INT8-CW,1061,5086.1,975.1,57.4,17.42 
-llama-2-7b-chat-hf,INT4-MIXED,32,5087.7,126.2,57.9,17.27 -stablelm-7b,INT4-MIXED,1020,5780.5,1248.4,59,16.95 -llama-3-8b,INT4-MIXED,1025,6088.9,1381.5,59,16.95 -llama-3-8b,INT4-MIXED,1024,6084.8,931.2,59.2,16.89 -llama-3-8b,INT4-MIXED,1025,6141.2,1494.3,59.4,16.84 -llama-3-8b,INT4-MIXED,1024,6133.8,1075.2,59.6,16.78 -mistral-7b-v0.1,INT4-MIXED,1024,5472.6,794.3,59.7,16.75 -zephyr-7b-beta,INT4-MIXED,32,5328.5,103.5,59.8,16.72 -falcon-7b-instruct,INT4-MIXED,1024,5677.5,686.2,59.8,16.72 -mistral-7b-v0.1,INT4-MIXED,1007,5243.5,1074,59.9,16.69 -qwen1.5-7b-chat,INT4-MIXED,1024,7096.7,1132.7,60,16.67 -qwen-7b-chat,INT4-MIXED,1024,8872.6,792.8,61,16.39 -qwen-7b-chat,INT4-MIXED,1024,9164.4,822.6,63.3,15.80 -orca-mini-3b,INT8-CW,32,4221.7,170.6,63.5,15.75 -llama-2-7b-chat-hf,INT4-MIXED,1024,5708.1,1397.9,63.6,15.72 -glm-4-9b,INT4-MIXED,33,6402.9,307.1,63.8,15.67 -zephyr-7b-beta,INT4-MIXED,1024,5572.4,1156.4,64.3,15.55 -glm-4-9b,INT4-MIXED,32,6383.1,256.2,64.5,15.50 -baichuan2-7b-chat,INT4-MIXED,32,5926.3,191.8,65.8,15.20 -opt-2.7b,FP16,31,5886,112.2,68,14.71 -dolly-v2-3b,FP16,32,6161.5,147.5,69.5,14.39 -red-pajama-incite-chat-3b-v1,FP16,32,6265.4,146.2,69.6,14.37 -glm-4-9b,INT4-MIXED,1024,6994.5,1013.7,69.8,14.33 -opt-2.7b,FP16,937,6345,379.5,71.6,13.97 -glm-4-9b,INT4-MIXED,1025,7014.9,1416.8,72.5,13.79 -phi-2,FP16,32,6204.7,189.2,72.9,13.72 -stable-zephyr-3b-dpo,FP16,30,6221.4,159.7,73,13.70 -dolly-v2-3b,FP16,1024,6669.9,424.3,73.3,13.64 -red-pajama-incite-chat-3b-v1,FP16,1020,6658.8,484.7,73.4,13.62 -stablelm-3b-4e1t,FP16,32,6216.3,145.4,73.5,13.61 -qwen-7b-chat,INT4-MIXED,32,9294.9,144.4,73.8,13.55 -red-pajama-incite-chat-3b-v1,FP16,1024,6755.1,469.1,73.9,13.53 -qwen-7b-chat-gptq,INT4-MIXED,1024,9152.1,827.2,75.1,13.32 -gemma-7b-it,INT4-MIXED,32,7991.4,128.6,75.8,13.19 -chatglm2-6b,INT8-CW,32,6854.4,110.2,76.3,13.11 -chatglm3-6b,INT8-CW,32,6754.8,112.3,76.4,13.09 -stable-zephyr-3b-dpo,FP16,946,6940,428.6,76.7,13.04 -baichuan2-7b-chat,INT4-MIXED,1024,6930.2,1229.5,76.7,13.04 -gemma-7b-it,INT4-MIXED,32,8061.5,125.6,76.7,13.04 -stablelm-3b-4e1t,FP16,1024,6722.9,480.8,77,12.99 -phi-2,FP16,1024,6709.4,624.1,77.2,12.95 -chatglm2-6b,INT8-CW,1024,7132.9,1361.9,78.7,12.71 -chatglm3-6b,INT8-CW,1024,7037.5,1389.2,78.7,12.71 -qwen-7b-chat,INT4-MIXED,1024,10374.1,1357.5,81.1,12.33 -gemma-7b-it,INT4-MIXED,1024,9398,1268.5,82.7,12.09 -gemma-7b-it,INT4-MIXED,1024,9469.5,1268,83.2,12.02 -gpt-j-6b,INT8-CW,32,7126.5,255.2,87.2,11.47 -falcon-7b-instruct,INT8-CW,32,8287.6,131.1,88.4,11.31 -llama-2-7b-chat-hf,INT8-CW,32,7474.9,139.5,89.7,11.15 -codegen25-7b,INT8-CW,32,7559.4,138,90.8,11.01 -vicuna-7b-v1.5,INT8-CW,32,7390.8,136.6,90.8,11.01 -falcon-7b-instruct,INT8-CW,1024,8546.8,1205.9,92.2,10.85 -stablelm-7b,INT8-CW,32,8356.4,143,92.4,10.82 -qwen2-7b,INT8-CW,32,9940.7,132,92.5,10.81 -baichuan2-13b-chat,INT4-MIXED,32,9879.2,184.9,93.3,10.72 -phi-3-mini-4k-instruct,FP16,38,8290,125.2,93.4,10.71 -phi-3-mini-4k-instruct,FP16,31,8290.5,109.5,93.5,10.70 -gpt-j-6b,INT8-CW,1024,7759,1996.8,93.9,10.65 -llama-2-7b-chat-hf,INT8-CW,1024,8097.8,1701.6,94.7,10.56 -phi-3-medium-4k-instruct,INT4-MIXED,38,8210.4,527,95.1,10.52 -mistral-7b-v0.1,INT8-CW,31,7882.4,128.6,95.1,10.52 -vicuna-7b-v1.5,INT8-CW,1024,8013.2,1558.1,95.1,10.52 -mistral-7b-v0.1,INT8-CW,32,7886.9,140.6,95.2,10.50 -qwen2-7b,INT8-CW,1024,10573.1,1564.5,95.3,10.49 -codegen25-7b,INT8-CW,1024,8253.1,1526.3,95.7,10.45 -zephyr-7b-beta,INT8-CW,32,7785.3,144.4,95.8,10.44 -stablelm-7b,INT8-CW,1020,8921.9,1845,96.9,10.32 -mistral-7b-v0.1,INT8-CW,1007,8127.4,1648.4,97.4,10.27 
-qwen-7b-chat,INT8-CW,32,11083.2,140.6,97.7,10.24 -qwen1.5-7b-chat,INT8-CW,32,8870,156.4,98.1,10.19 -llama-3.1-8b,INT8-CW,31,8600.3,189.2,98.4,10.16 -mistral-7b-v0.1,INT8-CW,1024,8134.7,1554.1,98.4,10.16 -qwen-14b-chat,INT4-MIXED,32,9876.2,192.3,98.6,10.14 -zephyr-7b-beta,INT8-CW,1024,8035.2,1580.4,98.8,10.12 -llama-3-8b,INT8-CW,32,8694.2,150.7,99.5,10.05 -llama-3-8b,INT8-CW,33,8700.4,175.4,99.8,10.02 -phi-3-mini-4k-instruct,FP16,1023,8795.2,601.3,99.9,10.01 +opt-125m-gptq,INT4-MIXED,32,1150.2,35.1,8.2,122.0 +opt-125m-gptq,INT4-MIXED,1024,1228,67,8.2,122.0 +qwen2-0.5b,INT4-MIXED,1024,1596.2,83.6,14.4,69.4 +qwen2-0.5b,INT4-MIXED,32,1675.6,63.6,14.9,67.1 +qwen2-0.5b,INT8-CW,32,1857.5,56.9,15,66.7 +qwen2-0.5b,INT8-CW,1024,1663.5,87,15,66.7 +bloomz-560m,INT8-CW,32,1761.1,62.4,15.1,66.2 +tiny-llama-1.1b-chat,INT4-MIXED,1024,1687.9,158.7,15.3,65.4 +bloomz-560m,INT4-MIXED,32,1894.2,40.1,15.4,64.9 +tiny-llama-1.1b-chat,INT4-MIXED,32,1833,74.5,15.7,63.7 +bloomz-560m,INT8-CW,1024,1689.2,146.2,15.8,63.3 +bloomz-560m,INT4-MIXED,1024,1791,150.1,16.4,61.0 +tiny-llama-1.1b-chat,INT8-CW,32,2132.3,35.6,18.1,55.2 +bloomz-560m,FP16,32,2395,36,18.4,54.3 +tiny-llama-1.1b-chat,INT8-CW,1024,1986.4,149.3,19.2,52.1 +bloomz-560m,FP16,1024,2344.4,157.4,19.3,51.8 +qwen2-1.5b,INT4-MIXED,1024,2175.1,184.9,20.4,49.0 +qwen2-1.5b,INT4-MIXED,32,2066.2,94.9,20.6,48.5 +red-pajama-incite-chat-3b-v1,INT4-MIXED,32,2599.8,118.1,25,40.0 +qwen2-1.5b,INT8-CW,32,2377.4,83.3,25.1,39.8 +qwen2-1.5b,INT8-CW,1024,2483.3,189.6,25.3,39.5 +gemma-2b-it,INT4-MIXED,32,2594.3,181.4,26.1,38.3 +phi-2,INT4-MIXED,32,2912.4,77.7,26.8,37.3 +gemma-2b-it,INT4-MIXED,1024,2594.4,248.2,26.9,37.2 +dolly-v2-3b,INT4-MIXED,32,2610.3,141.3,27,37.0 +stable-zephyr-3b-dpo,INT4-MIXED,32,2956.2,149.2,27.4,36.5 +minicpm-1b-sft,INT4-MIXED,31,2625.8,159.2,28.1,35.6 +red-pajama-incite-chat-3b-v1,INT4-MIXED,1023,3069.7,413.5,28.2,35.5 +minicpm-1b-sft,INT8-CW,31,2868.2,74.1,28.9,34.6 +dolly-v2-3b,INT4-MIXED,1024,3081.5,386,29.4,34.0 +phi-2,INT4-MIXED,1024,3136.2,340,29.6,33.8 +stablelm-3b-4e1t,INT4-MIXED,32,3035.9,150.5,30.6,32.7 +phi-3-mini-4k-instruct,INT4-MIXED,32,3373.2,57.9,32.6,30.7 +stablelm-3b-4e1t,INT4-MIXED,1023,3296.5,456.2,34.4,29.1 +phi-3-mini-4k-instruct,INT4-MIXED,1024,3707.1,432,36.1,27.7 +gemma-2b-it,INT8-CW,32,3370.5,203.8,36.6,27.3 +minicpm-1b-sft,FP16,31,3679.6,80.6,36.9,27.1 +gemma-2b-it,INT8-CW,1024,3503.2,258.5,37.9,26.4 +dolly-v2-3b,INT8-CW,32,3893.3,142.9,39.4,25.4 +red-pajama-incite-chat-3b-v1,INT8-CW,32,3760.7,117.2,39.4,25.4 +phi-2,INT8-CW,32,3765.6,121,39.7,25.2 +stablelm-3b-4e1t,INT8-CW,32,3641.2,123,39.9,25.1 +stable-zephyr-3b-dpo,INT8-CW,32,3743.3,120.1,39.9,25.1 +red-pajama-incite-chat-3b-v1,INT8-CW,1023,4083.1,422.9,41.9,23.9 +dolly-v2-3b,INT8-CW,1024,4211.5,384.1,42.2,23.7 +phi-2,INT8-CW,1024,4096.8,367.2,42.5,23.5 +stablelm-3b-4e1t,INT8-CW,1023,4086.6,459.9,43.5,23.0 +llama-2-7b-gptq,INT4-MIXED,32,4754.8,75.1,46.2,21.6 +codegen25-7b,INT4-MIXED,32,4738.5,74.9,46.9,21.3 +gpt-j-6b,INT4-MIXED,32,4506.5,221.4,47.3,21.1 +decilm-7b-instruct,INT4-MIXED,36,4794.9,199.3,48.5,20.6 +qwen-7b-chat-gptq,INT4-MIXED,32,5615.8,100.5,49.8,20.1 +falcon-7b-instruct,INT4-MIXED,32,4738,79.9,50.7,19.7 +phi-3-mini-4k-instruct,INT8-CW,32,4589.9,83,50.8,19.7 +llama-2-7b-gptq,INT4-MIXED,1024,5246,640,52.1,19.2 +llama-3-8b,INT4-MIXED,33,5475.8,114.7,52.2,19.2 +codegen25-7b,INT4-MIXED,1024,5241.9,643.7,52.5,19.0 +mistral-7b-v0.1,INT4-MIXED,32,5015.3,94.6,52.6,19.0 +qwen2-7b,INT4-MIXED,32,5330.7,86.3,52.7,19.0 +gpt-j-6b,INT4-MIXED,1024,4926.5,867.2,53.2,18.8 
+llama-2-7b-chat-hf,INT4-MIXED,32,5100.7,78.7,54.2,18.5 +llama-3-8b,INT4-MIXED,33,5527.1,114.9,54.3,18.4 +phi-3-mini-4k-instruct,INT8-CW,1024,4959.2,450.6,54.6,18.3 +falcon-7b-instruct,INT4-MIXED,1024,4863.4,660.5,54.9,18.2 +qwen2-7b,INT4-MIXED,1024,5375.4,659.8,55.4,18.1 +mistral-7b-v0.1,INT4-MIXED,1024,5286.8,662.8,55.6,18.0 +llama-3-8b,INT4-MIXED,1025,5601,992.5,56.1,17.8 +llama-3-8b,INT4-MIXED,1025,5646.8,1047.1,56.7,17.6 +baichuan2-7b-chat,INT4-MIXED,32,5913.7,86.5,57.2,17.5 +zephyr-7b-beta,INT4-MIXED,32,5339.7,88.5,58.2,17.2 +qwen-7b-chat-gptq,INT4-MIXED,1024,6315.8,664.2,60.1,16.6 +glm-4-9b-chat,INT4-MIXED,32,6349.7,86.5,60.5,16.5 +llama-2-7b-chat-hf,INT4-MIXED,1024,5592.7,856.8,60.9,16.4 +zephyr-7b-beta,INT4-MIXED,1024,5459.1,898.6,61.6,16.2 +baichuan2-7b-chat,INT4-MIXED,1024,6410.3,942.2,63.5,15.7 +gemma-7b-it,INT4-MIXED,32,5816.3,104.5,63.5,15.7 +glm-4-9b-chat,INT4-MIXED,1024,6368.8,1128.2,63.8,15.7 +llama-3.1-8b,INT4-MIXED,32,6315.3,97.4,65,15.4 +llama-3.1-8b,INT4-MIXED,1024,6421.8,902.9,68.2,14.7 +gemma-7b-it,INT4-MIXED,1024,6233.2,1052.7,68.7,14.6 +qwen-7b-chat,INT4-MIXED,32,7320.5,132.3,68.8,14.5 +red-pajama-incite-chat-3b-v1,FP16,32,6318.9,79.2,70.7,14.1 +phi-2,FP16,32,6330.2,83.2,70.8,14.1 +dolly-v2-3b,FP16,32,6327.2,92.7,71.9,13.9 +stable-zephyr-3b-dpo,FP16,32,6356.4,79.8,72.2,13.9 +stablelm-3b-4e1t,FP16,32,6261.9,74.6,72.6,13.8 +phi-2,FP16,1024,6654.4,379.3,73.9,13.5 +red-pajama-incite-chat-3b-v1,FP16,1023,6640.3,442.6,74.4,13.4 +dolly-v2-3b,FP16,1024,6653.9,441.9,74.9,13.4 +qwen-7b-chat,INT4-MIXED,1024,7814.1,909.4,75.5,13.2 +stablelm-3b-4e1t,FP16,1023,6575.3,449.5,75.8,13.2 +falcon-7b-instruct,INT8-CW,32,7487.6,109.4,84.3,11.9 +gpt-j-6b,INT8-CW,32,6918.7,185.3,85.3,11.7 +llama-2-7b-chat-hf,INT8-CW,32,7494.7,110.6,87.9,11.4 +qwen2-7b,INT8-CW,32,8177.7,117.8,88.2,11.3 +falcon-7b-instruct,INT8-CW,1024,7621.2,675.4,88.3,11.3 +codegen25-7b,INT8-CW,32,7582.1,114.6,89,11.2 +qwen2-7b,INT8-CW,1024,8226.2,842,90.4,11.1 +gpt-j-6b,INT8-CW,1024,7353.1,1093.9,90.8,11.0 +phi-3-medium-4k-instruct,INT4-MIXED,38,8184.1,270.2,90.8,11.0 +qwen-7b-chat,INT8-CW,32,9223.8,138.4,91.3,11.0 +baichuan2-7b-chat,INT8-CW,32,8188.4,122.9,91.8,10.9 +phi-3-mini-4k-instruct,FP16,32,8311.5,98.2,92,10.9 +llama-2-7b-chat-hf,INT8-CW,1024,7984.3,874.9,92.8,10.8 +mistral-7b-v0.1,INT8-CW,32,7908.6,116.3,93.1,10.7 +baichuan2-13b-chat,INT4-MIXED,32,10016.5,165.7,93.2,10.7 +zephyr-7b-beta,INT8-CW,32,7812.6,117,93.4,10.7 +codegen25-7b,INT8-CW,1024,8074.3,870.2,94,10.6 +decilm-7b-instruct,INT8-CW,36,7885.2,181.4,94.9,10.5 +mistral-7b-v0.1,INT8-CW,1024,8023.7,906.4,95.7,10.4 +zephyr-7b-beta,INT8-CW,1024,7930.8,915.2,96.3,10.4 +phi-3-medium-4k-instruct,INT4-MIXED,1061,8384.5,2225.7,96.7,10.3 +baichuan2-7b-chat,INT8-CW,1024,8678.3,956.7,96.8,10.3 +llama-3.1-8b,INT8-CW,32,8615.4,121.6,97.7,10.2 +llama-3-8b,INT8-CW,33,8615.1,131.3,97.7,10.2 +phi-3-mini-4k-instruct,FP16,1024,8695.2,509,99.9,10.0 diff --git a/docs/sphinx_setup/_static/benchmarks_files/llm_models_9-288V.csv b/docs/sphinx_setup/_static/benchmarks_files/llm_models_9-288V.csv index b16312fa09457c..c1932e678505ff 100644 --- a/docs/sphinx_setup/_static/benchmarks_files/llm_models_9-288V.csv +++ b/docs/sphinx_setup/_static/benchmarks_files/llm_models_9-288V.csv @@ -1,146 +1,83 @@ Topology,Precision,Input Size,max rss memory,1st latency (ms),2nd latency (ms),2nd tok/sec -opt-125m-gptq,INT4-MIXED,1024,1610.2,146,9.4,106.38 -opt-125m-gptq,INT4-MIXED,32,1087.6,60.8,9.5,105.26 -tiny-llama-1.1b-chat,INT4-MIXED,32,1977,85.7,20.2,49.50 
-tiny-llama-1.1b-chat,INT4-MIXED,1024,1940.8,367.7,20.3,49.26 -tiny-llama-1.1b-chat,INT8-CW,32,1855.2,70.2,21.8,45.87 -qwen2-0.5b,INT4-MIXED,1024,3029.3,226.4,22.3,44.84 -qwen2-0.5b,INT8-CW,1024,3093,222,22.3,44.84 -qwen2-0.5b,FP16,1024,2509.5,234.3,22.4,44.64 -qwen2-0.5b,FP16,32,1933.8,146.4,22.4,44.64 -tiny-llama-1.1b-chat,INT8-CW,1024,2288.3,368.6,22.9,43.67 -qwen2-0.5b,INT4-MIXED,32,2670.9,115.1,23,43.48 -qwen2-0.5b,INT8-CW,32,2530,157.9,24.3,41.15 -red-pajama-incite-chat-3b-v1,INT4-MIXED,32,2677.3,186.1,27.9,35.84 -qwen2-1.5b,INT4-MIXED,32,4515.1,179.8,28.7,34.84 -qwen2-1.5b,INT4-MIXED,1024,4927.5,254.3,29.1,34.36 -dolly-v2-3b,INT4-MIXED,32,2420.9,245.6,30.8,32.47 -qwen2-1.5b,INT8-CW,32,4824.9,165.1,31.2,32.05 -phi-2,INT4-MIXED,32,2523.5,233.9,31.5,31.75 -qwen2-1.5b,INT8-CW,1024,5401.8,331.1,32,31.25 -stable-zephyr-3b-dpo,INT4-MIXED,30,2816.2,151.3,32.9,30.40 -red-pajama-incite-chat-3b-v1,INT4-MIXED,1020,2646.7,860.6,33,30.30 -opt-2.7b,INT4-MIXED,31,2814.5,174.7,33.1,30.21 -phi-2,INT4-MIXED,32,2363.6,236.6,34,29.41 -stablelm-3b-4e1t,INT4-MIXED,32,3079.1,220,34,29.41 -minicpm-1b-sft,INT4-MIXED,31,2971,185.1,34.1,29.33 -minicpm-1b-sft,INT8-CW,31,3103.6,233.5,34.3,29.15 -dolly-v2-3b,INT4-MIXED,1024,2152.3,876.6,34.7,28.82 -phi-3-mini-4k-instruct,INT4-MIXED,38,2951,155.4,35.9,27.86 -phi-2,INT4-MIXED,1024,2689.9,971.7,36.5,27.40 -stablelm-3b-4e1t,INT4-MIXED,1024,3335.9,519.3,37.3,26.81 -opt-2.7b,INT4-MIXED,937,3227.5,639.5,37.7,26.53 -phi-3-mini-4k-instruct,INT4-MIXED,38,3289.7,161,37.9,26.39 -gemma-2b-it,INT4-MIXED,32,4099.6,258.6,38,26.32 -tiny-llama-1.1b-chat,FP16,32,3098.7,143.9,38.2,26.18 -stable-zephyr-3b-dpo,INT4-MIXED,946,3548.5,453.9,38.8,25.77 -tiny-llama-1.1b-chat,FP16,1024,3388.6,523,39,25.64 -phi-2,INT4-MIXED,1024,2594.7,964.2,39.1,25.58 -minicpm-1b-sft,FP16,31,3597.7,164.8,39.8,25.13 -gemma-2b-it,INT4-MIXED,1024,5059.1,669.1,40.5,24.69 -phi-3-mini-4k-instruct,INT4-MIXED,1061,3431.8,840.1,40.6,24.63 -phi-3-mini-4k-instruct,INT4-MIXED,1061,3555.6,836.3,41.8,23.92 -qwen2-1.5b,FP16,32,3979.4,111.8,42.5,23.53 -red-pajama-incite-chat-3b-v1,INT8-CW,32,3639.9,199.1,43.6,22.94 -qwen2-1.5b,FP16,1024,4569.8,250.5,44.1,22.68 -dolly-v2-3b,INT8-CW,32,3727,248.2,44.5,22.47 -opt-2.7b,INT8-CW,31,3746.3,175.6,44.6,22.42 -stablelm-3b-4e1t,INT8-CW,32,3651.3,178,45.4,22.03 -chatglm3-6b,INT4-MIXED,32,4050.3,88.1,47.4,21.10 -phi-2,INT8-CW,32,3608.7,232,48.3,20.70 -red-pajama-incite-chat-3b-v1,INT8-CW,1020,2951,816.6,48.4,20.66 -stablelm-3b-4e1t,INT8-CW,1024,4142.8,658.7,48.5,20.62 -opt-2.7b,INT8-CW,937,4019,640.7,48.8,20.49 -stable-zephyr-3b-dpo,INT8-CW,30,3264.5,150.7,48.8,20.49 -gemma-2b-it,INT8-CW,32,4874.7,249.4,48.9,20.45 -chatglm3-6b,INT4-MIXED,32,3902.1,84.9,49.5,20.20 -dolly-v2-3b,INT8-CW,1024,2931.4,865.2,49.7,20.12 -gemma-2b-it,INT8-CW,1024,5834,545.4,50.7,19.72 -vicuna-7b-v1.5,INT4-MIXED,32,4560.3,119.4,50.7,19.72 -chatglm3-6b,INT4-MIXED,1024,4070.1,895.9,50.9,19.65 -chatglm3-6b,INT4-MIXED,1024,3832.1,854.4,52,19.23 -orca-mini-3b,INT4-MIXED,32,2345.5,132.8,52.2,19.16 -phi-2,INT8-CW,1024,3511.6,989.7,53.1,18.83 -chatglm2-6b,INT4-MIXED,32,4960.2,91.5,54.2,18.45 -qwen1.5-7b-chat,INT4-MIXED,32,5936.5,195.7,54.8,18.25 -stable-zephyr-3b-dpo,INT8-CW,946,3700.5,677.9,54.8,18.25 -llama-2-7b-chat-hf,INT4-MIXED,32,4010.5,113.7,55.6,17.99 -qwen-7b-chat,INT4-MIXED,32,7393,132.7,56.1,17.83 -chatglm2-6b,INT4-MIXED,1024,5234.5,747.3,56.2,17.79 -qwen2-7b,INT4-MIXED,32,7086.2,183,56.3,17.76 -phi-3-mini-4k-instruct,INT8-CW,38,4574.4,132.9,56.9,17.57 -llama-2-7b-gptq,INT4-MIXED,32,4134.1,120,58,17.24 
-chatglm3-6b-gptq,INT4-MIXED,32,4288.1,99.4,58.1,17.21 -qwen2-7b,INT4-MIXED,1024,7716.4,734.9,58.3,17.15 -mistral-7b-v0.1,INT4-MIXED,31,4509.3,115,58.6,17.06 -codegen25-7b,INT4-MIXED,32,4211.8,136.5,59,16.95 -qwen1.5-7b-chat,INT4-MIXED,1024,7007.2,792.7,60.6,16.50 -chatglm3-6b-gptq,INT4-MIXED,1024,4545.4,860.3,60.9,16.42 -phi-3-mini-4k-instruct,INT8-CW,1061,5087.2,1029.5,60.9,16.42 -gpt-j-6b,INT4-MIXED,32,4013.5,316.1,61.1,16.37 -mistral-7b-v0.1,INT4-MIXED,1007,876.5,984.4,61.7,16.21 -llama-3-8b,INT4-MIXED,32,4357.1,132.8,62,16.13 -llama-2-7b-chat-hf,INT4-MIXED,1024,3564.8,1163.7,62.5,16.00 -qwen-7b-chat-gptq,INT4-MIXED,32,7384.1,217.8,62.9,15.90 -zephyr-7b-beta,INT4-MIXED,32,5331.6,125,62.9,15.90 -qwen-7b-chat,INT4-MIXED,32,6545.8,218.7,63,15.87 -llama-3.1-8b,INT4-MIXED,31,5076.3,110.4,63.4,15.77 -llama-3.1-8b,INT4-MIXED,31,4419,145.6,63.5,15.75 -llama-2-7b-gptq,INT4-MIXED,1024,3434.2,921.6,64.4,15.53 -llama-3-8b,INT4-MIXED,32,4886.7,132.3,65.4,15.29 -stablelm-7b,INT4-MIXED,32,4768.4,132.1,65.5,15.27 -codegen25-7b,INT4-MIXED,1024,1429.7,967.5,65.7,15.22 -zephyr-7b-beta,INT4-MIXED,1024,5575.6,837.2,65.7,15.22 -llama-3-8b,INT4-MIXED,32,4888.3,161.8,66.2,15.11 -mistral-7b-v0.1,INT4-MIXED,31,4401.4,142.7,66.2,15.11 -llama-3-8b,INT4-MIXED,1024,3782.4,1091.5,66.8,14.97 -llama-3.1-8b,INT4-MIXED,31,4781.4,159.4,67,14.93 -glm-4-9b,INT4-MIXED,33,6392.6,298.7,67.2,14.88 -qwen-7b-chat,INT4-MIXED,1024,8472.8,1331.2,67.4,14.84 -gpt-j-6b,INT4-MIXED,1024,1237.8,1638.8,68.1,14.68 -llama-2-7b-chat-hf,INT4-MIXED,32,4497.4,153.2,68.7,14.56 -llama-3-8b,INT4-MIXED,1024,4526.9,1060.3,69.8,14.33 -mistral-7b-v0.1,INT4-MIXED,1007,3968.7,1033.1,69.9,14.31 -llama-3-8b,INT4-MIXED,1024,4297.9,1041.7,70,14.29 -orca-mini-3b,INT8-CW,32,3744.3,174,70.5,14.18 -stablelm-7b,INT4-MIXED,1020,4402.1,1186.4,70.5,14.18 -gemma-2b-it,FP16,32,5806.3,117.6,71.8,13.93 -glm-4-9b,INT4-MIXED,1025,7003.5,1354.2,72.5,13.79 -gemma-2b-it,FP16,1024,6804.7,490.6,73.4,13.62 -stablelm-3b-4e1t,FP16,32,6217,207.5,75.2,13.30 -llama-2-7b-chat-hf,INT4-MIXED,1024,4320.9,1247.7,75.8,13.19 -gemma-7b-it,INT4-MIXED,32,8050.6,134.6,76.1,13.14 -gemma-7b-it,INT4-MIXED,32,7992.6,146.4,76.1,13.14 -qwen-7b-chat,INT4-MIXED,1024,5712.7,1144.4,77.1,12.97 -stablelm-3b-4e1t,FP16,1024,6722.9,491.4,77.7,12.87 -chatglm2-6b,INT8-CW,32,6856.2,111.6,78.9,12.67 -opt-2.7b,FP16,31,5377.5,138,79.6,12.56 -chatglm2-6b,INT8-CW,1024,7133.8,1012.1,81,12.35 -red-pajama-incite-chat-3b-v1,FP16,32,5672.5,211,81.2,12.32 -gemma-7b-it,INT4-MIXED,1024,9399.5,1726.7,82.2,12.17 -dolly-v2-3b,FP16,32,5573,230.6,82.5,12.12 -gemma-7b-it,INT4-MIXED,1024,9460,1241.2,82.7,12.09 -opt-2.7b,FP16,937,4727.8,618.8,84.6,11.82 -baichuan2-7b-chat,INT4-MIXED,32,5782.4,274.1,84.8,11.79 -phi-2,FP16,32,5497.3,244.9,85,11.76 -stable-zephyr-3b-dpo,FP16,30,5714.8,173.1,86,11.63 -red-pajama-incite-chat-3b-v1,FP16,1020,5262.2,817.4,86.2,11.60 -dolly-v2-3b,FP16,1024,2376.1,935.5,87,11.49 -qwen-7b-chat,INT4-MIXED,32,8597.4,226.2,87.7,11.40 -phi-2,FP16,1024,4063.9,969.8,89.7,11.15 -chatglm3-6b,INT8-CW,32,6158.8,123.4,89.8,11.14 -stable-zephyr-3b-dpo,FP16,946,5337.1,781.4,90.5,11.05 -baichuan2-7b-chat,INT4-MIXED,1024,807.4,1725.7,91.8,10.89 -vicuna-7b-v1.5,INT8-CW,32,7391,171.3,92.5,10.81 -chatglm3-6b,INT8-CW,1024,550.7,1210.9,93.3,10.72 -phi-3-mini-4k-instruct,FP16,38,8299.3,142,94.1,10.63 -qwen2-7b,INT8-CW,32,9941.1,139.1,94.9,10.54 -qwen-7b-chat-gptq,INT4-MIXED,1024,6545,1103.9,95.8,10.44 -qwen2-7b,INT8-CW,1024,10575.1,1183,96.7,10.34 -qwen-7b-chat,INT4-MIXED,1024,6777.4,1309.6,96.9,10.32 
-vicuna-7b-v1.5,INT8-CW,1024,8013.7,1154.6,96.9,10.32 -phi-3-medium-4k-instruct,INT4-MIXED,38,8212.8,448.3,97,10.31 -zephyr-7b-beta,INT8-CW,32,7888,144.8,97.4,10.27 -phi-3-mini-4k-instruct,FP16,1061,8814.8,1195.7,98.7,10.13 -zephyr-7b-beta,INT8-CW,1024,8136.7,1191.6,99.4,10.06 -llama-2-13b-chat-hf,INT4-MIXED,32,6927.5,165.3,99.9,10.01 +opt-125m-gptq,INT4-MIXED,32,833.1,15.6,3.9,256.4 +opt-125m-gptq,INT4-MIXED,1024,955.9,553.8,4.8,208.3 +bloomz-560m,INT4-MIXED,32,1457.5,48.5,11.1,90.1 +qwen2-0.5b,INT4-MIXED,32,1167.8,95.7,11.5,87.0 +qwen2-0.5b,INT4-MIXED,1024,1266,2330.3,12.7,78.7 +qwen2-0.5b,INT8-CW,32,1496.3,90.5,12.8,78.1 +bloomz-560m,INT8-CW,32,1724.2,84,13.9,71.9 +qwen2-0.5b,INT8-CW,1024,1593,2370.7,14,71.4 +bloomz-560m,INT4-MIXED,1024,1691,2005.3,15.2,65.8 +qwen2-0.5b,FP16,32,2989.8,94.6,15.9,62.9 +bloomz-560m,INT8-CW,1024,1941,2343.4,16.1,62.1 +qwen2-0.5b,FP16,1024,3088.1,2376.8,17.4,57.5 +bloomz-560m,FP16,32,3857,86.7,17.5,57.1 +bloomz-560m,FP16,1024,4085.6,2373.4,19.8,50.5 +tiny-llama-1.1b-chat,INT4-MIXED,32,1738.9,237.4,20,50.0 +tiny-llama-1.1b-chat,INT8-CW,32,2471.2,224.6,22.6,44.2 +tiny-llama-1.1b-chat,INT4-MIXED,1024,1929.3,5993,22.7,44.1 +tiny-llama-1.1b-chat,INT8-CW,1024,2661.8,6238.8,25.2,39.7 +qwen2-1.5b,INT4-MIXED,32,2429,312.8,28.4,35.2 +tiny-llama-1.1b-chat,FP16,32,4834.9,231.7,28.9,34.6 +tiny-llama-1.1b-chat,FP16,1024,5023.2,6191.5,31.7,31.5 +qwen2-1.5b,INT4-MIXED,1024,2600.3,7597.3,31.8,31.4 +stablelm-3b-4e1t,INT4-MIXED,32,3982.1,348.4,32.1,31.2 +qwen2-1.5b,INT8-CW,32,3619,301,32.7,30.6 +qwen2-1.5b,INT8-CW,1024,3790.3,7990.5,34.6,28.9 +stablelm-3b-4e1t,INT4-MIXED,1023,4455.4,11963.2,39.2,25.5 +minicpm-1b-sft,INT4-MIXED,31,5815.4,214.3,40.1,24.9 +qwen2-1.5b,FP16,32,7582.3,304.4,42.2,23.7 +minicpm-1b-sft,INT8-CW,31,6609.6,210.6,43.3,23.1 +qwen2-1.5b,FP16,1024,7753.4,7915.3,44.2,22.6 +gemma-2b-it,INT4-MIXED,32,3728.2,523,46.2,21.6 +stable-zephyr-3b-dpo,INT4-MIXED,32,3689.3,656.5,47.4,21.1 +gemma-2b-it,INT4-MIXED,1024,4207.3,11867.9,47.5,21.1 +minicpm-1b-sft,FP16,31,8999.8,222.2,49.1,20.4 +red-pajama-incite-chat-3b-v1,INT4-MIXED,32,3448.1,1028.9,49.6,20.2 +dolly-v2-3b,INT4-MIXED,32,3448.4,714.8,49.9,20.0 +gemma-2b-it,INT8-CW,32,5423.2,488.8,51,19.6 +gemma-2b-it,INT8-CW,1024,5902.7,12434.4,52.3,19.1 +stable-zephyr-3b-dpo,INT8-CW,32,5630.3,694.5,54.4,18.4 +phi-2,INT4-MIXED,32,3732.9,723.2,54.5,18.3 +phi-2,INT8-CW,32,5600.4,747,55.7,18.0 +dolly-v2-3b,INT8-CW,32,5589.7,1009.8,55.9,17.9 +red-pajama-incite-chat-3b-v1,INT8-CW,32,5590.1,698.9,55.9,17.9 +stablelm-3b-4e1t,INT8-CW,32,5630.1,660.7,56.1,17.8 +dolly-v2-3b,INT4-MIXED,1024,3984.5,15502.8,56.5,17.7 +red-pajama-incite-chat-3b-v1,INT4-MIXED,1023,3915.6,15363.9,56.6,17.7 +llama-2-7b-gptq,INT4-MIXED,32,8618.5,782.9,56.9,17.6 +phi-2,INT4-MIXED,1024,4251.3,15317,61,16.4 +phi-2,INT8-CW,1024,6119.4,15886.6,62,16.1 +red-pajama-incite-chat-3b-v1,INT8-CW,1023,6056.9,15984.9,62.2,16.1 +dolly-v2-3b,INT8-CW,1024,6124.9,16099.7,62.5,16.0 +stablelm-3b-4e1t,INT8-CW,1023,6097.1,16206.9,62.5,16.0 +gemma-2b-it,FP16,32,12208.2,501.4,65.5,15.3 +llama-3-8b,INT4-MIXED,33,8741.2,869,65.7,15.2 +llama-2-7b-gptq,INT4-MIXED,1024,9468.1,26350.7,66.1,15.1 +qwen-7b-chat-gptq,INT4-MIXED,32,8561,773.7,67,14.9 +gemma-2b-it,FP16,1024,12687.8,12168.7,67.1,14.9 +mistral-7b-v0.1,INT4-MIXED,32,8588.7,1020.6,67.4,14.8 +llama-2-7b-chat-hf,INT4-MIXED,32,8626.8,1100,69.4,14.4 +phi-2,FP16,32,11385.9,693.8,70.2,14.2 +dolly-v2-3b,FP16,32,11359,688.5,70.5,14.2 +stable-zephyr-3b-dpo,FP16,32,11432.9,648.5,70.6,14.2 
+red-pajama-incite-chat-3b-v1,FP16,32,11364,692.4,70.7,14.1
+stablelm-3b-4e1t,FP16,32,11432.6,649,71.1,14.1
+llama-3-8b,INT4-MIXED,1025,9254.8,29700.3,71.9,13.9
+mistral-7b-v0.1,INT4-MIXED,1024,9121.9,29492.9,73.3,13.6
+phi-3-mini-4k-instruct,INT8-CW,32,7646.1,952.6,75.7,13.2
+qwen-7b-chat-gptq,INT4-MIXED,1024,10458.7,29022.2,75.9,13.2
+zephyr-7b-beta,INT4-MIXED,32,9217.5,1196.6,76.2,13.1
+phi-2,FP16,1024,11902.2,15868,77,13.0
+dolly-v2-3b,FP16,1024,11892.5,15987.1,77.1,13.0
+baichuan2-7b-chat,INT4-MIXED,32,9440.3,1118.1,77.3,12.9
+red-pajama-incite-chat-3b-v1,FP16,1023,11829.1,16008.7,77.3,12.9
+stablelm-3b-4e1t,FP16,1023,11897.5,16030,77.7,12.9
+phi-3-mini-4k-instruct,INT4-MIXED,32,4961.9,968.8,78.2,12.8
+llama-2-7b-chat-hf,INT4-MIXED,1024,9478.1,28958.6,78.6,12.7
+zephyr-7b-beta,INT4-MIXED,1024,9764.2,30982,82.3,12.2
+phi-3-mini-4k-instruct,INT8-CW,1024,8255.7,23200.5,83.1,12.0
+phi-3-mini-4k-instruct,INT4-MIXED,1024,5570.2,22277.1,85.7,11.7
+baichuan2-7b-chat,INT4-MIXED,1024,10305.2,29010,86.4,11.6
+phi-3-mini-4k-instruct,FP16,32,15292.6,934.7,96.4,10.4
+qwen-7b-chat,INT4-MIXED,32,10964.7,1413,97.8,10.2
\ No newline at end of file
diff --git a/docs/sphinx_setup/_static/benchmarks_files/llm_models_platform_list_.pdf b/docs/sphinx_setup/_static/benchmarks_files/llm_models_platform_list_.pdf
index bedd9c28286476..53198c7ddb7089 100644
Binary files a/docs/sphinx_setup/_static/benchmarks_files/llm_models_platform_list_.pdf and b/docs/sphinx_setup/_static/benchmarks_files/llm_models_platform_list_.pdf differ
diff --git a/docs/sphinx_setup/_static/css/custom.css b/docs/sphinx_setup/_static/css/custom.css
index f922069c45e354..de8a05732a4d06 100644
--- a/docs/sphinx_setup/_static/css/custom.css
+++ b/docs/sphinx_setup/_static/css/custom.css
@@ -923,6 +923,8 @@ h5 {
   position: relative;
   bottom: -16px;
   left: 0;
+  margin-left: auto;
+  padding-right: 30px;
 }
 
 .modal-footer-content {
diff --git a/docs/sphinx_setup/_static/download/GenAI_Quick_Start_Guide.pdf b/docs/sphinx_setup/_static/download/GenAI_Quick_Start_Guide.pdf
new file mode 100644
index 00000000000000..13edfc8f0b7bc2
Binary files /dev/null and b/docs/sphinx_setup/_static/download/GenAI_Quick_Start_Guide.pdf differ
diff --git a/docs/sphinx_setup/_static/download/supported_models.csv b/docs/sphinx_setup/_static/download/supported_models.csv
index 87ea37b0f207c3..39053fa6d3e0a7 100644
--- a/docs/sphinx_setup/_static/download/supported_models.csv
+++ b/docs/sphinx_setup/_static/download/supported_models.csv
@@ -715,7 +715,6 @@ tiny-random-BeitForImageClassification,Image Classification,pytorch,intel-optimum default,+,,
 tiny-random-bert,Natural Language Processing,pytorch,intel-optimum default,+,,
 tiny-random-BlenderbotModel,Large Language Model,pytorch,INT4,+,,
 tiny-random-BloomModel,Large Language Model,pytorch,INT4,+,,
-tiny-random-chatglm2,Large Language Model,pytorch,INT4,+,,
 tiny-random-codegen2,Large Language Model,pytorch,INT4,+,,
 tiny-random-CodeGenForCausalLM,Large Language Model,pytorch,INT4,+,,
 tiny-random-CohereForCausalLM,Large Language Model,pytorch,INT4,+,,
diff --git a/docs/sphinx_setup/_static/html/modal.html b/docs/sphinx_setup/_static/html/modal.html
index 38eb673824f97e..e7bcc1c1c16c58 100644
--- a/docs/sphinx_setup/_static/html/modal.html
+++ b/docs/sphinx_setup/_static/html/modal.html
@@ -87,6 +87,6 @@

Graph Results

- + +
\ No newline at end of file
diff --git a/docs/sphinx_setup/_static/html/modalLLM.html b/docs/sphinx_setup/_static/html/modalLLM.html
index 37b569d0bd4078..e8535c87f16090 100644
--- a/docs/sphinx_setup/_static/html/modalLLM.html
+++ b/docs/sphinx_setup/_static/html/modalLLM.html
@@ -87,6 +87,6 @@

Graph Results

- + + \ No newline at end of file diff --git a/docs/sphinx_setup/_static/js/custom.js b/docs/sphinx_setup/_static/js/custom.js index 241f8895ee1c61..ba43b64a24d89f 100644 --- a/docs/sphinx_setup/_static/js/custom.js +++ b/docs/sphinx_setup/_static/js/custom.js @@ -189,7 +189,7 @@ function getCurrentVersion() { if (wordAfterDomain === 'cn') { wordAfterDomain = link[2]; } - if (["index.html", "404.html", "", "latest"].indexOf(wordAfterDomain) >= 0) { + if (["index.html", "404.html", ""].indexOf(wordAfterDomain) >= 0) { /* * If this landing page, 404 or domain.com we should get first version * */ @@ -416,7 +416,7 @@ document.addEventListener('DOMContentLoaded', function () { } await element.initialize({ - accessToken: "xx1f2aebd3-4307-4632-aeea-17c13378b237", + accessToken: "xx2b580d60-addf-451d-94fd-06effafb7686", organizationId: "intelcorporationproductione78n25s6" }); @@ -426,7 +426,7 @@ document.addEventListener('DOMContentLoaded', function () { const searchInterfaceSa = document.querySelector("#sa-search"); const searchInterface = document.querySelector("#search"); const currentVersion = getCurrentVersion(); - + await initializeSearchInterface(searchInterfaceSa, currentVersion); await initializeSearchInterface(searchInterface); diff --git a/docs/sphinx_setup/_static/js/graphs.js b/docs/sphinx_setup/_static/js/graphs.js index 697911bad9402c..04e34d6c2fefe5 100644 --- a/docs/sphinx_setup/_static/js/graphs.js +++ b/docs/sphinx_setup/_static/js/graphs.js @@ -60,8 +60,8 @@ class Filter { // param: GraphData[], clientPlatforms[] static BySortPlatforms(graphDataArr, platformsArr) { return graphDataArr - .filter((data) => platformsArr.includes(data.Platform)) - .sort((a, b) => a.Platform.localeCompare(b.Platform)); + .filter((data) => platformsArr.includes(data.Platform)) + .sort((a, b) => a.Platform.localeCompare(b.Platform)); //sort is necessary } } @@ -145,8 +145,8 @@ class Graph { array.push([obj]) } }) - return array; + return array; } // this returns an object that is used to ender the chart @@ -283,13 +283,13 @@ $(document).ready(function () { const models = networkModels.map((networkModel) => createCheckMark(networkModel, 'networkmodel')); modal.find('.models-column').append(models); - const selectAllModelsButton = createCheckMark('', 'networkmodel', false , false); + const selectAllModelsButton = createCheckMark('', 'networkmodel', false, false); modal.find('.models-selectall').append(selectAllModelsButton); - const selectAllPlatformsButton = createCheckMark('', 'platform', false , false); + const selectAllPlatformsButton = createCheckMark('', 'platform', false, false); modal.find('.platforms-selectall').append(selectAllPlatformsButton); - const precisions = Modal.getPrecisionsLabels(graph).map((precision) => createCheckMark(precision, 'precision', false , false)); + const precisions = Modal.getPrecisionsLabels(graph).map((precision) => createCheckMark(precision, 'precision', false, false)); modal.find('.precisions-column').append(precisions); selectAllCheckboxes(precisions); @@ -304,7 +304,7 @@ $(document).ready(function () { modal.find('#modal-display-graphs').hide(); modal.find('.ietype-column input').first().prop('checked', true); - const kpiLabels = Filter.getParameters(graph).map((parameter) => createCheckMark(parameter, 'kpi', false , true)); + const kpiLabels = Filter.getParameters(graph).map((parameter) => createCheckMark(parameter, 'kpi', false, true)); modal.find('.kpi-column').append(kpiLabels); $('body').prepend(modal); @@ -511,6 +511,7 @@ $(document).ready(function () { 
listContainer.style.margin = 0; listContainer.style.padding = 0; listContainer.style.paddingLeft = '0px'; + listContainer.style.float = "right"; legendContainer.appendChild(listContainer); } @@ -521,57 +522,55 @@ $(document).ready(function () { const htmlLegendPlugin = { id: 'htmlLegend', afterUpdate(chart, args, options) { - + charts = [...new Set([...charts, ...[chart]])]; const ul = getOrCreateLegendList(chart, chart.options.plugins.htmlLegend.containerID); - // Remove old legend items while (ul.firstChild) { ul.firstChild.remove(); } - const items = chart.legend.legendItems; + const items = chart.options.plugins.legend.labels.generateLabels(chart); items.forEach(item => { const li = document.createElement('li'); li.style.alignItems = 'center'; li.style.display = 'block'; li.style.flexDirection = 'column'; - li.style.marginLeft = '4px'; - + li.style.marginLeft = '6px'; + li.style.cursor = "pointer"; + li.style.fontSize = '0.6rem'; + li.style.textDecoration = item.hidden ? 'line-through' : ''; li.onclick = () => { - chart.toggleDataVisibility(item.index); - chart.update(); + charts.forEach((chartItem) => { + chartItem.setDatasetVisibility(item.datasetIndex, !chartItem.isDatasetVisible(item.datasetIndex)); + chartItem.update(); + }) }; - - // Color box + const boxSpan = document.createElement('span'); boxSpan.style.background = item.fillStyle; boxSpan.style.borderColor = item.strokeStyle; - boxSpan.style.borderWidth = item.lineWidth + 'px'; boxSpan.style.display = 'inline-block'; boxSpan.style.height = '10px'; boxSpan.style.marginRight = '4px'; boxSpan.style.width = '30px'; - // Text - const textContainer = document.createElement('p'); - textContainer.style.color = '#666'; - textContainer.style.margin = 0; - textContainer.style.padding = 0; - textContainer.style.fontSize = '0.6rem'; - textContainer.style.marginLeft = '3px'; - textContainer.style.textDecoration = item.hidden ? 'line-through' : ''; + const textSpan = document.createElement('span'); + textSpan.style.bottom = '1px' + textSpan.style.position = 'relative' + textSpan.style.fontSize = '0.6rem'; + textSpan.style.textDecoration = item.hidden ? 
'line-through' : ''; const text = document.createTextNode(item.text); - textContainer.appendChild(text); + textSpan.appendChild(text); li.appendChild(boxSpan); - li.appendChild(textContainer); + li.appendChild(textSpan); ul.appendChild(li); }); } }; - function getChartOptionsByEngines(containerId, allowedAxisIDs) { + function getChartOptionsByEngines(allowedAxisIDs) { const axisConfigs = { x: { title: { display: true, text: 'Request Rate' } @@ -602,11 +601,11 @@ $(document).ready(function () { }, {}), plugins: { legend: { display: false }, - htmlLegend: { containerID: containerId } + htmlLegend: { containerID: 'modal-footer' } } }; } - function getChartOptions(title, containerId) { + function getChartOptions(title) { return { responsive: true, indexAxis: 'y', @@ -633,7 +632,7 @@ $(document).ready(function () { display: false }, htmlLegend: { - containerID: containerId, + containerID: 'modal-footer', } } } @@ -838,7 +837,7 @@ $(document).ready(function () { new Chart(context, { type: 'bar', data: getChartData(labels, datasets), - options: getChartOptions(chartTitle, containerId), + options: getChartOptions(chartTitle), plugins: [htmlLegendPlugin] }); }); @@ -858,9 +857,9 @@ $(document).ready(function () { }) } } - + var charts = []; function processMetricByEngines(labels, datasets, container, widthClass, id) { - var heightRatio = (80 + (labels.length * 55)); + var heightRatio = (30 + (labels.length * 55)); var chart = $('
'); const containerId = `legend-container-${id}`; const legend = $(`
`); @@ -894,8 +893,7 @@ $(document).ready(function () { backgroundColor: precision.color, yAxisID: precision.label === "Throughput" ? 'y' : 'y1', fill: false - } - ) + }) }) }) @@ -914,9 +912,10 @@ $(document).ready(function () { labels: labels, datasets: graphDatas }, - options: getChartOptionsByEngines(containerId, allowedAxisIDs), + options: getChartOptionsByEngines(allowedAxisIDs), plugins: [htmlLegendPlugin] }); + }); } diff --git a/docs/sphinx_setup/_static/js/openVinoDataTables.js b/docs/sphinx_setup/_static/js/openVinoDataTables.js index c65748065b2ad2..bd56a71533786c 100644 --- a/docs/sphinx_setup/_static/js/openVinoDataTables.js +++ b/docs/sphinx_setup/_static/js/openVinoDataTables.js @@ -1,48 +1,41 @@ $(document).ready(function () { - var pageTitle = document.title; - var columnDefs; - if(pageTitle.includes('Most Efficient Large Language Models for AI PC')) - { - columnDefs= [ - { "visible": false, "targets": [3,4,6] } - ] - } - else - { - columnDefs=[] - } + var columnDefs = []; - var table = $('table.modeldata').DataTable({ - responsive: true, - "autoWidth": false, - stateSave: true, - language: { - buttons: { - colvisRestore: "Show all columns" - } - }, - lengthMenu: [ - [10, 25, 50, -1], - ['10 rows', '25 rows', '50 rows', 'Show all rows'] - ], - "columnDefs": columnDefs, - layout: { - topStart: { - buttons: [ - 'pageLength', - { - extend: 'colvis', - postfixButtons: ['colvisRestore'], - }, - { - extend: 'print', - text: 'Print pdf', - exportOptions: { - columns: ':visible' + var tables = $('table.modeldata'); + for (let table of tables) { + var hidden = table.getAttribute('data-columns-hidden'); + columnDefs = [{ "visible": false, "targets": JSON.parse(hidden) }] + $(table).DataTable({ + responsive: true, + "autoWidth": false, + language: { + buttons: { + colvisRestore: "Restore default" + } + }, + lengthMenu: [ + [10, 25, 50, -1], + ['10 rows', '25 rows', '50 rows', 'Show all records'] + ], + "columnDefs": columnDefs, + layout: { + topStart: { + buttons: [ + 'pageLength', + { + extend: 'colvis', + postfixButtons: ['colvisRestore'], + }, + { + extend: 'print', + text: 'Print pdf', + exportOptions: { + columns: ':visible' + } } - } - ] + ] + } } - } - }); + }); + } }); \ No newline at end of file diff --git a/docs/sphinx_setup/_static/selector-tool/assets/selector-DiE3WrtX.js b/docs/sphinx_setup/_static/selector-tool/assets/selector-DiE3WrtX.js new file mode 100644 index 00000000000000..264f23f1dd17e3 --- /dev/null +++ b/docs/sphinx_setup/_static/selector-tool/assets/selector-DiE3WrtX.js @@ -0,0 +1,59 @@ +var Vd=Object.defineProperty;var bd=(e,t,n)=>t in e?Vd(e,t,{enumerable:!0,configurable:!0,writable:!0,value:n}):e[t]=n;var ze=(e,t,n)=>bd(e,typeof t!="symbol"?t+"":t,n);function qu(e){return e&&e.__esModule&&Object.prototype.hasOwnProperty.call(e,"default")?e.default:e}var ec={exports:{}},lo={},tc={exports:{}},D={};/** + * @license React + * react.production.min.js + * + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. 
+ */var Hr=Symbol.for("react.element"),$d=Symbol.for("react.portal"),Md=Symbol.for("react.fragment"),Bd=Symbol.for("react.strict_mode"),Kd=Symbol.for("react.profiler"),Hd=Symbol.for("react.provider"),Gd=Symbol.for("react.context"),Wd=Symbol.for("react.forward_ref"),Yd=Symbol.for("react.suspense"),Qd=Symbol.for("react.memo"),Jd=Symbol.for("react.lazy"),da=Symbol.iterator;function Xd(e){return e===null||typeof e!="object"?null:(e=da&&e[da]||e["@@iterator"],typeof e=="function"?e:null)}var nc={isMounted:function(){return!1},enqueueForceUpdate:function(){},enqueueReplaceState:function(){},enqueueSetState:function(){}},rc=Object.assign,ic={};function qn(e,t,n){this.props=e,this.context=t,this.refs=ic,this.updater=n||nc}qn.prototype.isReactComponent={};qn.prototype.setState=function(e,t){if(typeof e!="object"&&typeof e!="function"&&e!=null)throw Error("setState(...): takes an object of state variables to update or a function which returns an object of state variables.");this.updater.enqueueSetState(this,e,t,"setState")};qn.prototype.forceUpdate=function(e){this.updater.enqueueForceUpdate(this,e,"forceUpdate")};function oc(){}oc.prototype=qn.prototype;function ll(e,t,n){this.props=e,this.context=t,this.refs=ic,this.updater=n||nc}var al=ll.prototype=new oc;al.constructor=ll;rc(al,qn.prototype);al.isPureReactComponent=!0;var pa=Array.isArray,sc=Object.prototype.hasOwnProperty,ul={current:null},lc={key:!0,ref:!0,__self:!0,__source:!0};function ac(e,t,n){var r,i={},o=null,s=null;if(t!=null)for(r in t.ref!==void 0&&(s=t.ref),t.key!==void 0&&(o=""+t.key),t)sc.call(t,r)&&!lc.hasOwnProperty(r)&&(i[r]=t[r]);var l=arguments.length-2;if(l===1)i.children=n;else if(1{const e={type:"size",height:document.body.offsetHeight};window.parent.postMessage(e)};new ResizeObserver(up).observe(document.body);function ue(e){"@babel/helpers - typeof";return ue=typeof Symbol=="function"&&typeof Symbol.iterator=="symbol"?function(t){return typeof t}:function(t){return t&&typeof Symbol=="function"&&t.constructor===Symbol&&t!==Symbol.prototype?"symbol":typeof t},ue(e)}function ct(e,t){if(!(e instanceof t))throw new TypeError("Cannot call a class as a function")}function cp(e,t){if(ue(e)!=="object"||e===null)return e;var n=e[Symbol.toPrimitive];if(n!==void 0){var r=n.call(e,t||"default");if(ue(r)!=="object")return r;throw new TypeError("@@toPrimitive must return a primitive value.")}return(t==="string"?String:Number)(e)}function cc(e){var t=cp(e,"string");return ue(t)==="symbol"?t:String(t)}function ga(e,t){for(var n=0;ne.length)&&(t=e.length);for(var n=0,r=new Array(t);n1&&arguments[1]!==void 0?arguments[1]:{};ct(this,e),this.init(t,n)}return ft(e,[{key:"init",value:function(n){var r=arguments.length>1&&arguments[1]!==void 0?arguments[1]:{};this.prefix=r.prefix||"i18next:",this.logger=n||pp,this.options=r,this.debug=r.debug}},{key:"setDebug",value:function(n){this.debug=n}},{key:"log",value:function(){for(var n=arguments.length,r=new Array(n),i=0;i1?r-1:0),o=1;o-1?l.replace(/###/g,"."):l}function i(){return!e||typeof e=="string"}for(var o=typeof t!="string"?[].concat(t):t.split(".");o.length>1;){if(i())return{};var s=r(o.shift());!e[s]&&n&&(e[s]=new n),Object.prototype.hasOwnProperty.call(e,s)?e=e[s]:e={}}return i()?{}:{obj:e,k:r(o.shift())}}function ka(e,t,n){var r=fl(e,t,Object),i=r.obj,o=r.k;i[o]=n}function mp(e,t,n,r){var i=fl(e,t,Object),o=i.obj,s=i.k;o[s]=o[s]||[],o[s].push(n)}function Ai(e,t){var n=fl(e,t),r=n.obj,i=n.k;if(r)return r[i]}function Sa(e,t,n){var r=Ai(e,n);return r!==void 0?r:Ai(t,n)}function 
hc(e,t,n){for(var r in t)r!=="__proto__"&&r!=="constructor"&&(r in e?typeof e[r]=="string"||e[r]instanceof String||typeof t[r]=="string"||t[r]instanceof String?n&&(e[r]=t[r]):hc(e[r],t[r],n):e[r]=t[r]);return e}function On(e){return e.replace(/[\-\[\]\/\{\}\(\)\*\+\?\.\\\^\$\|]/g,"\\$&")}var vp={"&":"&","<":"<",">":">",'"':""","'":"'","/":"/"};function yp(e){return typeof e=="string"?e.replace(/[&<>"'\/]/g,function(t){return vp[t]}):e}var uo=typeof window<"u"&&window.navigator&&typeof window.navigator.userAgentData>"u"&&window.navigator.userAgent&&window.navigator.userAgent.indexOf("MSIE")>-1,wp=[" ",",","?","!",";"];function kp(e,t,n){t=t||"",n=n||"";var r=wp.filter(function(l){return t.indexOf(l)<0&&n.indexOf(l)<0});if(r.length===0)return!0;var i=new RegExp("(".concat(r.map(function(l){return l==="?"?"\\?":l}).join("|"),")")),o=!i.test(e);if(!o){var s=e.indexOf(n);s>0&&!i.test(e.substring(0,s))&&(o=!0)}return o}function xa(e,t){var n=Object.keys(e);if(Object.getOwnPropertySymbols){var r=Object.getOwnPropertySymbols(e);t&&(r=r.filter(function(i){return Object.getOwnPropertyDescriptor(e,i).enumerable})),n.push.apply(n,r)}return n}function ni(e){for(var t=1;t"u"||!Reflect.construct||Reflect.construct.sham)return!1;if(typeof Proxy=="function")return!0;try{return Boolean.prototype.valueOf.call(Reflect.construct(Boolean,[],function(){})),!0}catch{return!1}}function gc(e,t){var n=arguments.length>2&&arguments[2]!==void 0?arguments[2]:".";if(e){if(e[t])return e[t];for(var r=t.split(n),i=e,o=0;oo+s;)s++,l=r.slice(o,o+s).join(n),a=i[l];if(a===void 0)return;if(a===null)return null;if(t.endsWith(l)){if(typeof a=="string")return a;if(l&&typeof a[l]=="string")return a[l]}var c=r.slice(o+s).join(n);return c?gc(a,c,n):void 0}i=i[r[o]]}return i}}var Op=function(e){ao(n,e);var t=Sp(n);function n(r){var i,o=arguments.length>1&&arguments[1]!==void 0?arguments[1]:{ns:["translation"],defaultNS:"translation"};return ct(this,n),i=t.call(this),uo&&Xt.call(Mt(i)),i.data=r||{},i.options=o,i.options.keySeparator===void 0&&(i.options.keySeparator="."),i.options.ignoreJSONStructure===void 0&&(i.options.ignoreJSONStructure=!0),i}return ft(n,[{key:"addNamespaces",value:function(i){this.options.ns.indexOf(i)<0&&this.options.ns.push(i)}},{key:"removeNamespaces",value:function(i){var o=this.options.ns.indexOf(i);o>-1&&this.options.ns.splice(o,1)}},{key:"getResource",value:function(i,o,s){var l=arguments.length>3&&arguments[3]!==void 0?arguments[3]:{},a=l.keySeparator!==void 0?l.keySeparator:this.options.keySeparator,c=l.ignoreJSONStructure!==void 0?l.ignoreJSONStructure:this.options.ignoreJSONStructure,p=[i,o];s&&typeof s!="string"&&(p=p.concat(s)),s&&typeof s=="string"&&(p=p.concat(a?s.split(a):s)),i.indexOf(".")>-1&&(p=i.split("."));var d=Ai(this.data,p);return d||!c||typeof s!="string"?d:gc(this.data&&this.data[i]&&this.data[i][o],s,a)}},{key:"addResource",value:function(i,o,s,l){var a=arguments.length>4&&arguments[4]!==void 0?arguments[4]:{silent:!1},c=this.options.keySeparator;c===void 0&&(c=".");var p=[i,o];s&&(p=p.concat(c?s.split(c):s)),i.indexOf(".")>-1&&(p=i.split("."),l=o,o=p[1]),this.addNamespaces(o),ka(this.data,p,l),a.silent||this.emit("added",i,o,s,l)}},{key:"addResources",value:function(i,o,s){var l=arguments.length>3&&arguments[3]!==void 0?arguments[3]:{silent:!1};for(var a in s)(typeof s[a]=="string"||Object.prototype.toString.apply(s[a])==="[object Array]")&&this.addResource(i,o,a,s[a],{silent:!0});l.silent||this.emit("added",i,o,s)}},{key:"addResourceBundle",value:function(i,o,s,l,a){var 
c=arguments.length>5&&arguments[5]!==void 0?arguments[5]:{silent:!1},p=[i,o];i.indexOf(".")>-1&&(p=i.split("."),l=s,s=o,o=p[1]),this.addNamespaces(o);var d=Ai(this.data,p)||{};l?hc(d,s,a):d=ni(ni({},d),s),ka(this.data,p,d),c.silent||this.emit("added",i,o,s)}},{key:"removeResourceBundle",value:function(i,o){this.hasResourceBundle(i,o)&&delete this.data[i][o],this.removeNamespaces(o),this.emit("removed",i,o)}},{key:"hasResourceBundle",value:function(i,o){return this.getResource(i,o)!==void 0}},{key:"getResourceBundle",value:function(i,o){return o||(o=this.options.defaultNS),this.options.compatibilityAPI==="v1"?ni(ni({},{}),this.getResource(i,o)):this.getResource(i,o)}},{key:"getDataByLanguage",value:function(i){return this.data[i]}},{key:"hasLanguageSomeTranslations",value:function(i){var o=this.getDataByLanguage(i),s=o&&Object.keys(o)||[];return!!s.find(function(l){return o[l]&&Object.keys(o[l]).length>0})}},{key:"toJSON",value:function(){return this.data}}]),n}(Xt),mc={processors:{},addPostProcessor:function(t){this.processors[t.name]=t},handle:function(t,n,r,i,o){var s=this;return t.forEach(function(l){s.processors[l]&&(n=s.processors[l].process(n,r,i,o))}),n}};function Oa(e,t){var n=Object.keys(e);if(Object.getOwnPropertySymbols){var r=Object.getOwnPropertySymbols(e);t&&(r=r.filter(function(i){return Object.getOwnPropertyDescriptor(e,i).enumerable})),n.push.apply(n,r)}return n}function ve(e){for(var t=1;t"u"||!Reflect.construct||Reflect.construct.sham)return!1;if(typeof Proxy=="function")return!0;try{return Boolean.prototype.valueOf.call(Reflect.construct(Boolean,[],function(){})),!0}catch{return!1}}var Pa={},Na=function(e){ao(n,e);var t=Pp(n);function n(r){var i,o=arguments.length>1&&arguments[1]!==void 0?arguments[1]:{};return ct(this,n),i=t.call(this),uo&&Xt.call(Mt(i)),gp(["resourceStore","languageUtils","pluralResolver","interpolator","backendConnector","i18nFormat","utils"],r,Mt(i)),i.options=o,i.options.keySeparator===void 0&&(i.options.keySeparator="."),i.logger=vt.create("translator"),i}return ft(n,[{key:"changeLanguage",value:function(i){i&&(this.language=i)}},{key:"exists",value:function(i){var o=arguments.length>1&&arguments[1]!==void 0?arguments[1]:{interpolation:{}};if(i==null)return!1;var s=this.resolve(i,o);return s&&s.res!==void 0}},{key:"extractFromKey",value:function(i,o){var s=o.nsSeparator!==void 0?o.nsSeparator:this.options.nsSeparator;s===void 0&&(s=":");var l=o.keySeparator!==void 0?o.keySeparator:this.options.keySeparator,a=o.ns||this.options.defaultNS||[],c=s&&i.indexOf(s)>-1,p=!this.options.userDefinedKeySeparator&&!o.keySeparator&&!this.options.userDefinedNsSeparator&&!o.nsSeparator&&!kp(i,s,l);if(c&&!p){var d=i.match(this.interpolator.nestingRegexp);if(d&&d.length>0)return{key:i,namespaces:a};var h=i.split(s);(s!==l||s===l&&this.options.ns.indexOf(h[0])>-1)&&(a=h.shift()),i=h.join(l)}return typeof a=="string"&&(a=[a]),{key:i,namespaces:a}}},{key:"translate",value:function(i,o,s){var l=this;if(ue(o)!=="object"&&this.options.overloadTranslationOptionHandler&&(o=this.options.overloadTranslationOptionHandler(arguments)),o||(o={}),i==null)return"";Array.isArray(i)||(i=[String(i)]);var a=o.returnDetails!==void 0?o.returnDetails:this.options.returnDetails,c=o.keySeparator!==void 0?o.keySeparator:this.options.keySeparator,p=this.extractFromKey(i[i.length-1],o),d=p.key,h=p.namespaces,v=h[h.length-1],y=o.lng||this.language,k=o.appendNamespaceToCIMode||this.options.appendNamespaceToCIMode;if(y&&y.toLowerCase()==="cimode"){if(k){var 
O=o.nsSeparator||this.options.nsSeparator;return a?{res:"".concat(v).concat(O).concat(d),usedKey:d,exactUsedKey:d,usedLng:y,usedNS:v}:"".concat(v).concat(O).concat(d)}return a?{res:d,usedKey:d,exactUsedKey:d,usedLng:y,usedNS:v}:d}var f=this.resolve(i,o),u=f&&f.res,g=f&&f.usedKey||d,w=f&&f.exactUsedKey||d,x=Object.prototype.toString.apply(u),S=["[object Number]","[object Function]","[object RegExp]"],N=o.joinArrays!==void 0?o.joinArrays:this.options.joinArrays,_=!this.i18nFormat||this.i18nFormat.handleAsObject,L=typeof u!="string"&&typeof u!="boolean"&&typeof u!="number";if(_&&u&&L&&S.indexOf(x)<0&&!(typeof N=="string"&&x==="[object Array]")){if(!o.returnObjects&&!this.options.returnObjects){this.options.returnedObjectHandler||this.logger.warn("accessing an object - but returnObjects options is not enabled!");var E=this.options.returnedObjectHandler?this.options.returnedObjectHandler(g,u,ve(ve({},o),{},{ns:h})):"key '".concat(d," (").concat(this.language,")' returned an object instead of string.");return a?(f.res=E,f):E}if(c){var K=x==="[object Array]",Ce=K?[]:{},St=K?w:g;for(var et in u)if(Object.prototype.hasOwnProperty.call(u,et)){var kn="".concat(St).concat(c).concat(et);Ce[et]=this.translate(kn,ve(ve({},o),{joinArrays:!1,ns:h})),Ce[et]===kn&&(Ce[et]=u[et])}u=Ce}}else if(_&&typeof N=="string"&&x==="[object Array]")u=u.join(N),u&&(u=this.extendTranslation(u,i,o,s));else{var dt=!1,tt=!1,C=o.count!==void 0&&typeof o.count!="string",I=n.hasDefaultValue(o),R=C?this.pluralResolver.getSuffix(y,o.count,o):"",V=o["defaultValue".concat(R)]||o.defaultValue;!this.isValidLookup(u)&&I&&(dt=!0,u=V),this.isValidLookup(u)||(tt=!0,u=d);var Y=o.missingKeyNoValueFallbackToKey||this.options.missingKeyNoValueFallbackToKey,xt=Y&&tt?void 0:u,Ue=I&&V!==u&&this.options.updateMissing;if(tt||dt||Ue){if(this.logger.log(Ue?"updateKey":"missingKey",y,v,d,Ue?V:u),c){var Sn=this.resolve(d,ve(ve({},o),{},{keySeparator:!1}));Sn&&Sn.res&&this.logger.warn("Seems the loaded translations were in flat JSON format instead of nested. Either set keySeparator: false on init or make sure your translations are published in nested format.")}var Fe=[],Ot=this.languageUtils.getFallbackCodes(this.options.fallbackLng,o.lng||this.language);if(this.options.saveMissingTo==="fallback"&&Ot&&Ot[0])for(var _o=0;_o1&&arguments[1]!==void 0?arguments[1]:{},l,a,c,p,d;return typeof i=="string"&&(i=[i]),i.forEach(function(h){if(!o.isValidLookup(l)){var v=o.extractFromKey(h,s),y=v.key;a=y;var k=v.namespaces;o.options.fallbackNS&&(k=k.concat(o.options.fallbackNS));var O=s.count!==void 0&&typeof s.count!="string",f=O&&!s.ordinal&&s.count===0&&o.pluralResolver.shouldUseIntlApi(),u=s.context!==void 0&&(typeof s.context=="string"||typeof s.context=="number")&&s.context!=="",g=s.lngs?s.lngs:o.languageUtils.toResolveHierarchy(s.lng||o.language,s.fallbackLng);k.forEach(function(w){o.isValidLookup(l)||(d=w,!Pa["".concat(g[0],"-").concat(w)]&&o.utils&&o.utils.hasLoadedNamespace&&!o.utils.hasLoadedNamespace(d)&&(Pa["".concat(g[0],"-").concat(w)]=!0,o.logger.warn('key "'.concat(a,'" for languages "').concat(g.join(", "),`" won't get resolved as namespace "`).concat(d,'" was not yet loaded'),"This means something IS WRONG in your setup. You access the t function before i18next.init / i18next.loadNamespace / i18next.changeLanguage was done. 
Wait for the callback or Promise to resolve before accessing it!!!")),g.forEach(function(x){if(!o.isValidLookup(l)){p=x;var S=[y];if(o.i18nFormat&&o.i18nFormat.addLookupKeys)o.i18nFormat.addLookupKeys(S,y,x,w,s);else{var N;O&&(N=o.pluralResolver.getSuffix(x,s.count,s));var _="".concat(o.options.pluralSeparator,"zero");if(O&&(S.push(y+N),f&&S.push(y+_)),u){var L="".concat(y).concat(o.options.contextSeparator).concat(s.context);S.push(L),O&&(S.push(L+N),f&&S.push(L+_))}}for(var E;E=S.pop();)o.isValidLookup(l)||(c=E,l=o.getResource(x,w,E,s))}}))})}}),{res:l,usedKey:a,exactUsedKey:c,usedLng:p,usedNS:d}}},{key:"isValidLookup",value:function(i){return i!==void 0&&!(!this.options.returnNull&&i===null)&&!(!this.options.returnEmptyString&&i==="")}},{key:"getResource",value:function(i,o,s){var l=arguments.length>3&&arguments[3]!==void 0?arguments[3]:{};return this.i18nFormat&&this.i18nFormat.getResource?this.i18nFormat.getResource(i,o,s,l):this.resourceStore.getResource(i,o,s,l)}}],[{key:"hasDefaultValue",value:function(i){var o="defaultValue";for(var s in i)if(Object.prototype.hasOwnProperty.call(i,s)&&o===s.substring(0,o.length)&&i[s]!==void 0)return!0;return!1}}]),n}(Xt);function Io(e){return e.charAt(0).toUpperCase()+e.slice(1)}var _a=function(){function e(t){ct(this,e),this.options=t,this.supportedLngs=this.options.supportedLngs||!1,this.logger=vt.create("languageUtils")}return ft(e,[{key:"getScriptPartFromCode",value:function(n){if(!n||n.indexOf("-")<0)return null;var r=n.split("-");return r.length===2||(r.pop(),r[r.length-1].toLowerCase()==="x")?null:this.formatLanguageCode(r.join("-"))}},{key:"getLanguagePartFromCode",value:function(n){if(!n||n.indexOf("-")<0)return n;var r=n.split("-");return this.formatLanguageCode(r[0])}},{key:"formatLanguageCode",value:function(n){if(typeof n=="string"&&n.indexOf("-")>-1){var r=["hans","hant","latn","cyrl","cans","mong","arab"],i=n.split("-");return this.options.lowerCaseLng?i=i.map(function(o){return o.toLowerCase()}):i.length===2?(i[0]=i[0].toLowerCase(),i[1]=i[1].toUpperCase(),r.indexOf(i[1].toLowerCase())>-1&&(i[1]=Io(i[1].toLowerCase()))):i.length===3&&(i[0]=i[0].toLowerCase(),i[1].length===2&&(i[1]=i[1].toUpperCase()),i[0]!=="sgn"&&i[2].length===2&&(i[2]=i[2].toUpperCase()),r.indexOf(i[1].toLowerCase())>-1&&(i[1]=Io(i[1].toLowerCase())),r.indexOf(i[2].toLowerCase())>-1&&(i[2]=Io(i[2].toLowerCase()))),i.join("-")}return this.options.cleanCode||this.options.lowerCaseLng?n.toLowerCase():n}},{key:"isSupportedCode",value:function(n){return(this.options.load==="languageOnly"||this.options.nonExplicitSupportedLngs)&&(n=this.getLanguagePartFromCode(n)),!this.supportedLngs||!this.supportedLngs.length||this.supportedLngs.indexOf(n)>-1}},{key:"getBestMatchFromCodes",value:function(n){var r=this;if(!n)return null;var i;return n.forEach(function(o){if(!i){var s=r.formatLanguageCode(o);(!r.options.supportedLngs||r.isSupportedCode(s))&&(i=s)}}),!i&&this.options.supportedLngs&&n.forEach(function(o){if(!i){var s=r.getLanguagePartFromCode(o);if(r.isSupportedCode(s))return i=s;i=r.options.supportedLngs.find(function(l){if(l.indexOf(s)===0)return l})}}),i||(i=this.getFallbackCodes(this.options.fallbackLng)[0]),i}},{key:"getFallbackCodes",value:function(n,r){if(!n)return[];if(typeof n=="function"&&(n=n(r)),typeof n=="string"&&(n=[n]),Object.prototype.toString.apply(n)==="[object Array]")return n;if(!r)return n.default||[];var i=n[r];return 
i||(i=n[this.getScriptPartFromCode(r)]),i||(i=n[this.formatLanguageCode(r)]),i||(i=n[this.getLanguagePartFromCode(r)]),i||(i=n.default),i||[]}},{key:"toResolveHierarchy",value:function(n,r){var i=this,o=this.getFallbackCodes(r||this.options.fallbackLng||[],n),s=[],l=function(c){c&&(i.isSupportedCode(c)?s.push(c):i.logger.warn("rejecting language code not found in supportedLngs: ".concat(c)))};return typeof n=="string"&&n.indexOf("-")>-1?(this.options.load!=="languageOnly"&&l(this.formatLanguageCode(n)),this.options.load!=="languageOnly"&&this.options.load!=="currentOnly"&&l(this.getScriptPartFromCode(n)),this.options.load!=="currentOnly"&&l(this.getLanguagePartFromCode(n))):typeof n=="string"&&l(this.formatLanguageCode(n)),o.forEach(function(a){s.indexOf(a)<0&&l(i.formatLanguageCode(a))}),s}}]),e}(),_p=[{lngs:["ach","ak","am","arn","br","fil","gun","ln","mfe","mg","mi","oc","pt","pt-BR","tg","tl","ti","tr","uz","wa"],nr:[1,2],fc:1},{lngs:["af","an","ast","az","bg","bn","ca","da","de","dev","el","en","eo","es","et","eu","fi","fo","fur","fy","gl","gu","ha","hi","hu","hy","ia","it","kk","kn","ku","lb","mai","ml","mn","mr","nah","nap","nb","ne","nl","nn","no","nso","pa","pap","pms","ps","pt-PT","rm","sco","se","si","so","son","sq","sv","sw","ta","te","tk","ur","yo"],nr:[1,2],fc:2},{lngs:["ay","bo","cgg","fa","ht","id","ja","jbo","ka","km","ko","ky","lo","ms","sah","su","th","tt","ug","vi","wo","zh"],nr:[1],fc:3},{lngs:["be","bs","cnr","dz","hr","ru","sr","uk"],nr:[1,2,5],fc:4},{lngs:["ar"],nr:[0,1,2,3,11,100],fc:5},{lngs:["cs","sk"],nr:[1,2,5],fc:6},{lngs:["csb","pl"],nr:[1,2,5],fc:7},{lngs:["cy"],nr:[1,2,3,8],fc:8},{lngs:["fr"],nr:[1,2],fc:9},{lngs:["ga"],nr:[1,2,3,7,11],fc:10},{lngs:["gd"],nr:[1,2,3,20],fc:11},{lngs:["is"],nr:[1,2],fc:12},{lngs:["jv"],nr:[0,1],fc:13},{lngs:["kw"],nr:[1,2,3,4],fc:14},{lngs:["lt"],nr:[1,2,10],fc:15},{lngs:["lv"],nr:[1,2,0],fc:16},{lngs:["mk"],nr:[1,2],fc:17},{lngs:["mnk"],nr:[0,1,2],fc:18},{lngs:["mt"],nr:[1,2,11,20],fc:19},{lngs:["or"],nr:[2,1],fc:2},{lngs:["ro"],nr:[1,2,20],fc:20},{lngs:["sl"],nr:[5,1,2,3],fc:21},{lngs:["he","iw"],nr:[1,2,20,21],fc:22}],Ep={1:function(t){return+(t>1)},2:function(t){return+(t!=1)},3:function(t){return 0},4:function(t){return t%10==1&&t%100!=11?0:t%10>=2&&t%10<=4&&(t%100<10||t%100>=20)?1:2},5:function(t){return t==0?0:t==1?1:t==2?2:t%100>=3&&t%100<=10?3:t%100>=11?4:5},6:function(t){return t==1?0:t>=2&&t<=4?1:2},7:function(t){return t==1?0:t%10>=2&&t%10<=4&&(t%100<10||t%100>=20)?1:2},8:function(t){return t==1?0:t==2?1:t!=8&&t!=11?2:3},9:function(t){return+(t>=2)},10:function(t){return t==1?0:t==2?1:t<7?2:t<11?3:4},11:function(t){return t==1||t==11?0:t==2||t==12?1:t>2&&t<20?2:3},12:function(t){return+(t%10!=1||t%100==11)},13:function(t){return+(t!==0)},14:function(t){return t==1?0:t==2?1:t==3?2:3},15:function(t){return t%10==1&&t%100!=11?0:t%10>=2&&(t%100<10||t%100>=20)?1:2},16:function(t){return t%10==1&&t%100!=11?0:t!==0?1:2},17:function(t){return t==1||t%10==1&&t%100!=11?0:1},18:function(t){return t==0?0:t==1?1:2},19:function(t){return t==1?0:t==0||t%100>1&&t%100<11?1:t%100>10&&t%100<20?2:3},20:function(t){return t==1?0:t==0||t%100>0&&t%100<20?1:2},21:function(t){return t%100==1?1:t%100==2?2:t%100==3||t%100==4?3:0},22:function(t){return t==1?0:t==2?1:(t<0||t>10)&&t%10==0?2:3}},Cp=["v1","v2","v3"],Ea={zero:0,one:1,two:2,few:3,many:4,other:5};function jp(){var e={};return _p.forEach(function(t){t.lngs.forEach(function(n){e[n]={numbers:t.nr,plurals:Ep[t.fc]}})}),e}var Ip=function(){function e(t){var 
n=arguments.length>1&&arguments[1]!==void 0?arguments[1]:{};ct(this,e),this.languageUtils=t,this.options=n,this.logger=vt.create("pluralResolver"),(!this.options.compatibilityJSON||this.options.compatibilityJSON==="v4")&&(typeof Intl>"u"||!Intl.PluralRules)&&(this.options.compatibilityJSON="v3",this.logger.error("Your environment seems not to be Intl API compatible, use an Intl.PluralRules polyfill. Will fallback to the compatibilityJSON v3 format handling.")),this.rules=jp()}return ft(e,[{key:"addRule",value:function(n,r){this.rules[n]=r}},{key:"getRule",value:function(n){var r=arguments.length>1&&arguments[1]!==void 0?arguments[1]:{};if(this.shouldUseIntlApi())try{return new Intl.PluralRules(n,{type:r.ordinal?"ordinal":"cardinal"})}catch{return}return this.rules[n]||this.rules[this.languageUtils.getLanguagePartFromCode(n)]}},{key:"needsPlural",value:function(n){var r=arguments.length>1&&arguments[1]!==void 0?arguments[1]:{},i=this.getRule(n,r);return this.shouldUseIntlApi()?i&&i.resolvedOptions().pluralCategories.length>1:i&&i.numbers.length>1}},{key:"getPluralFormsOfKey",value:function(n,r){var i=arguments.length>2&&arguments[2]!==void 0?arguments[2]:{};return this.getSuffixes(n,i).map(function(o){return"".concat(r).concat(o)})}},{key:"getSuffixes",value:function(n){var r=this,i=arguments.length>1&&arguments[1]!==void 0?arguments[1]:{},o=this.getRule(n,i);return o?this.shouldUseIntlApi()?o.resolvedOptions().pluralCategories.sort(function(s,l){return Ea[s]-Ea[l]}).map(function(s){return"".concat(r.options.prepend).concat(s)}):o.numbers.map(function(s){return r.getSuffix(n,s,i)}):[]}},{key:"getSuffix",value:function(n,r){var i=arguments.length>2&&arguments[2]!==void 0?arguments[2]:{},o=this.getRule(n,i);return o?this.shouldUseIntlApi()?"".concat(this.options.prepend).concat(o.select(r)):this.getSuffixRetroCompatible(o,r):(this.logger.warn("no plural rule found for: ".concat(n)),"")}},{key:"getSuffixRetroCompatible",value:function(n,r){var i=this,o=n.noAbs?n.plurals(r):n.plurals(Math.abs(r)),s=n.numbers[o];this.options.simplifyPluralSuffix&&n.numbers.length===2&&n.numbers[0]===1&&(s===2?s="plural":s===1&&(s=""));var l=function(){return i.options.prepend&&s.toString()?i.options.prepend+s.toString():s.toString()};return this.options.compatibilityJSON==="v1"?s===1?"":typeof s=="number"?"_plural_".concat(s.toString()):l():this.options.compatibilityJSON==="v2"||this.options.simplifyPluralSuffix&&n.numbers.length===2&&n.numbers[0]===1?l():this.options.prepend&&o.toString()?this.options.prepend+o.toString():o.toString()}},{key:"shouldUseIntlApi",value:function(){return!Cp.includes(this.options.compatibilityJSON)}}]),e}();function Ca(e,t){var n=Object.keys(e);if(Object.getOwnPropertySymbols){var r=Object.getOwnPropertySymbols(e);t&&(r=r.filter(function(i){return Object.getOwnPropertyDescriptor(e,i).enumerable})),n.push.apply(n,r)}return n}function nt(e){for(var t=1;t0&&arguments[0]!==void 0?arguments[0]:{};ct(this,e),this.logger=vt.create("interpolator"),this.options=t,this.format=t.interpolation&&t.interpolation.format||function(n){return n},this.init(t)}return ft(e,[{key:"init",value:function(){var n=arguments.length>0&&arguments[0]!==void 0?arguments[0]:{};n.interpolation||(n.interpolation={escapeValue:!0});var r=n.interpolation;this.escape=r.escape!==void 0?r.escape:yp,this.escapeValue=r.escapeValue!==void 0?r.escapeValue:!0,this.useRawValueToEscape=r.useRawValueToEscape!==void 
0?r.useRawValueToEscape:!1,this.prefix=r.prefix?On(r.prefix):r.prefixEscaped||"{{",this.suffix=r.suffix?On(r.suffix):r.suffixEscaped||"}}",this.formatSeparator=r.formatSeparator?r.formatSeparator:r.formatSeparator||",",this.unescapePrefix=r.unescapeSuffix?"":r.unescapePrefix||"-",this.unescapeSuffix=this.unescapePrefix?"":r.unescapeSuffix||"",this.nestingPrefix=r.nestingPrefix?On(r.nestingPrefix):r.nestingPrefixEscaped||On("$t("),this.nestingSuffix=r.nestingSuffix?On(r.nestingSuffix):r.nestingSuffixEscaped||On(")"),this.nestingOptionsSeparator=r.nestingOptionsSeparator?r.nestingOptionsSeparator:r.nestingOptionsSeparator||",",this.maxReplaces=r.maxReplaces?r.maxReplaces:1e3,this.alwaysFormat=r.alwaysFormat!==void 0?r.alwaysFormat:!1,this.resetRegExp()}},{key:"reset",value:function(){this.options&&this.init(this.options)}},{key:"resetRegExp",value:function(){var n="".concat(this.prefix,"(.+?)").concat(this.suffix);this.regexp=new RegExp(n,"g");var r="".concat(this.prefix).concat(this.unescapePrefix,"(.+?)").concat(this.unescapeSuffix).concat(this.suffix);this.regexpUnescape=new RegExp(r,"g");var i="".concat(this.nestingPrefix,"(.+?)").concat(this.nestingSuffix);this.nestingRegexp=new RegExp(i,"g")}},{key:"interpolate",value:function(n,r,i,o){var s=this,l,a,c,p=this.options&&this.options.interpolation&&this.options.interpolation.defaultVariables||{};function d(O){return O.replace(/\$/g,"$$$$")}var h=function(f){if(f.indexOf(s.formatSeparator)<0){var u=Sa(r,p,f);return s.alwaysFormat?s.format(u,void 0,i,nt(nt(nt({},o),r),{},{interpolationkey:f})):u}var g=f.split(s.formatSeparator),w=g.shift().trim(),x=g.join(s.formatSeparator).trim();return s.format(Sa(r,p,w),x,i,nt(nt(nt({},o),r),{},{interpolationkey:w}))};this.resetRegExp();var v=o&&o.missingInterpolationHandler||this.options.missingInterpolationHandler,y=o&&o.interpolation&&o.interpolation.skipOnVariables!==void 0?o.interpolation.skipOnVariables:this.options.interpolation.skipOnVariables,k=[{regex:this.regexpUnescape,safeValue:function(f){return d(f)}},{regex:this.regexp,safeValue:function(f){return s.escapeValue?d(s.escape(f)):d(f)}}];return k.forEach(function(O){for(c=0;l=O.regex.exec(n);){var f=l[1].trim();if(a=h(f),a===void 0)if(typeof v=="function"){var u=v(n,l,o);a=typeof u=="string"?u:""}else if(o&&Object.prototype.hasOwnProperty.call(o,f))a="";else if(y){a=l[0];continue}else s.logger.warn("missed to pass in variable ".concat(f," for interpolating ").concat(n)),a="";else typeof a!="string"&&!s.useRawValueToEscape&&(a=wa(a));var g=O.safeValue(a);if(n=n.replace(l[0],g),y?(O.regex.lastIndex+=a.length,O.regex.lastIndex-=l[0].length):O.regex.lastIndex=0,c++,c>=s.maxReplaces)break}}),n}},{key:"nest",value:function(n,r){var i=this,o=arguments.length>2&&arguments[2]!==void 0?arguments[2]:{},s,l,a;function c(v,y){var k=this.nestingOptionsSeparator;if(v.indexOf(k)<0)return v;var O=v.split(new RegExp("".concat(k,"[ ]*{"))),f="{".concat(O[1]);v=O[0],f=this.interpolate(f,a);var u=f.match(/'/g),g=f.match(/"/g);(u&&u.length%2===0&&!g||g.length%2!==0)&&(f=f.replace(/'/g,'"'));try{a=JSON.parse(f),y&&(a=nt(nt({},y),a))}catch(w){return this.logger.warn("failed parsing options string in nesting for key ".concat(v),w),"".concat(v).concat(k).concat(f)}return delete a.defaultValue,v}for(;s=this.nestingRegexp.exec(n);){var p=[];a=nt({},o),a=a.replace&&typeof a.replace!="string"?a.replace:a,a.applyPostProcessor=!1,delete a.defaultValue;var d=!1;if(s[0].indexOf(this.formatSeparator)!==-1&&!/{.*}/.test(s[1])){var 
[diff hunk for a minified, bundled JavaScript asset (i18next / html-parse-stringify code) omitted: the hunk is garbled beyond recovery]
diff --git a/src/core/include/openvino/op/result.hpp b/src/core/include/openvino/op/result.hpp
+/// | Node |--------------------------->| Result |  -> Model output names: N1
+/// |----------------|                             |-----------------|
+///
+///
+/// Example 2: Result has got specific names
+///
+///  set output names:                              set output names:
+///       [N1]                                          [R1, R2]
+///        ↓                                               ↓
+/// |----------------|   [names: N1, R1, R2]    |-----------------|
+/// | Node |--------------------------->| Result |  -> Model output names: R1, R2
+/// |----------------|                             |-----------------|
+///
+///
+/// Example 3: Result from example 2 connected to a new node
+///
+///  set output names:                              set output names:
+///       [N2]                                          [R1, R2]
+///        ↓                                               ↓
+/// |----------------|   [names: N2, R1, R2]    |-----------------|
+/// | Node |--------------------------->| Result |  -> Model output names: R1, R2
+/// |----------------|                             |-----------------|
+///
+///  set output names:
+///       [N1]
+///        ↓
+/// |----------------|   [names: N1]
+/// | Node |----------------->
+/// |----------------|
+///
 class OPENVINO_API Result : public Op {
 public:
     OPENVINO_OP("Result", "opset1");
diff --git a/src/core/include/openvino/pass/matcher_pass.hpp b/src/core/include/openvino/pass/matcher_pass.hpp
index b17237fdf08340..e98f5ff89008fd 100644
--- a/src/core/include/openvino/pass/matcher_pass.hpp
+++ b/src/core/include/openvino/pass/matcher_pass.hpp
@@ -6,10 +6,22 @@
 #include
 #include
-#include
+#include
+#include
+#include "openvino/core/rtti.hpp"
 #include "openvino/pass/node_registry.hpp"
+#define _OPENVINO_MATCHER_PASS_RTTI_WITH_TYPE(TYPE_NAME) _OPENVINO_MATCHER_PASS_RTTI_WITH_TYPE_VERSION(TYPE_NAME, "0")
+
+#define _OPENVINO_MATCHER_PASS_RTTI_WITH_TYPE_VERSION(TYPE_NAME, VERSION_NAME) \
+    _OPENVINO_RTTI_WITH_TYPE_VERSION_PARENT(TYPE_NAME, VERSION_NAME, ::ov::pass::MatcherPass)
+
+#define OPENVINO_MATCHER_PASS_RTTI(...) \
+    _OPENVINO_RTTI_EXPAND(_OPENVINO_RTTI_DEFINITION_SELECTOR_2(__VA_ARGS__, \
+                                                               _OPENVINO_MATCHER_PASS_RTTI_WITH_TYPE_VERSION, \
+                                                               _OPENVINO_MATCHER_PASS_RTTI_WITH_TYPE)(__VA_ARGS__))
+
 namespace ov {
 using matcher_pass_callback = std::function;
 using graph_rewrite_callback = std::function;
diff --git a/src/core/include/openvino/pass/pattern/matcher.hpp b/src/core/include/openvino/pass/pattern/matcher.hpp
index bbd7e32b0a1802..7112ac9ff85e64 100644
--- a/src/core/include/openvino/pass/pattern/matcher.hpp
+++ b/src/core/include/openvino/pass/pattern/matcher.hpp
@@ -62,10 +62,31 @@ class OPENVINO_API Matcher {
     // Avoid implicit string construction from nullptr.
Matcher(const std::shared_ptr pattern_node, std::nullptr_t name) = delete; - Matcher() = default; - Matcher(Output& pattern_node) : m_pattern_node{pattern_node} {} - - Matcher(Output& pattern_node, const std::string& name) : m_pattern_node(pattern_node), m_name{name} {} + Matcher() + : m_match_root{}, + m_pattern_node{}, + m_pattern_map{}, + m_pattern_value_maps{}, + m_matched_list{}, + m_name{""}, + m_strict_mode{false} {} + Matcher(Output& pattern_node) + : m_match_root{}, + m_pattern_node{pattern_node}, + m_pattern_map{}, + m_pattern_value_maps{}, + m_matched_list{}, + m_name{""}, + m_strict_mode{false} {} + + Matcher(Output& pattern_node, const std::string& name) + : m_match_root{}, + m_pattern_node{pattern_node}, + m_pattern_map{}, + m_pattern_value_maps{}, + m_matched_list{}, + m_name{name}, + m_strict_mode{false} {} /// \brief Constructs a Matcher object /// @@ -73,9 +94,13 @@ class OPENVINO_API Matcher { /// \param name is a string which is used for logging and disabling a matcher /// \param strict_mode forces a matcher to consider shapes and ET of nodes Matcher(const Output& pattern_node, const std::string& name, bool strict_mode) - : m_pattern_node(pattern_node), - m_name(name), - m_strict_mode(strict_mode) {} + : m_match_root{}, + m_pattern_node{pattern_node}, + m_pattern_map{}, + m_pattern_value_maps{}, + m_matched_list{}, + m_name{name}, + m_strict_mode{strict_mode} {} // Some matches should start on a node rather than an output. These three constructors // are transition until we work out the right way to do that. diff --git a/src/core/include/openvino/pass/serialize.hpp b/src/core/include/openvino/pass/serialize.hpp index fc3e743d4005dc..d0eaadde346bf6 100644 --- a/src/core/include/openvino/pass/serialize.hpp +++ b/src/core/include/openvino/pass/serialize.hpp @@ -11,6 +11,10 @@ #include "openvino/opsets/opset.hpp" #include "openvino/pass/pass.hpp" +#ifdef OPENVINO_CPP_VER_17 +# include +#endif + namespace ov { namespace pass { @@ -35,6 +39,13 @@ class OPENVINO_API Serialize : public ov::pass::ModelPass { Serialize(const std::string& xmlPath, const std::string& binPath, Version version = Version::UNSPECIFIED); +#ifdef OPENVINO_CPP_VER_17 + Serialize(const std::filesystem::path& xmlPath, + const std::filesystem::path& binPath, + Version version = Version::UNSPECIFIED) + : Serialize(xmlPath.string(), binPath.string(), version) {} +#endif + private: std::ostream* m_xmlFile; std::ostream* m_binFile; diff --git a/src/core/reference/include/openvino/reference/reduce_mean.hpp b/src/core/reference/include/openvino/reference/reduce_mean.hpp index 4c46d4ca786d09..f046f4f96197bb 100644 --- a/src/core/reference/include/openvino/reference/reduce_mean.hpp +++ b/src/core/reference/include/openvino/reference/reduce_mean.hpp @@ -26,6 +26,10 @@ void reduce_mean(const T* in, T* out, const Shape& in_shape, const AxisSet& redu reduce_sum(in, out, in_shape, reduction_axes); const auto out_shape = util::reduce(in_shape, reduction_axes); + if (shape_size(in_shape) == 0) { + return; + } + const auto out_size = shape_size(out_shape); const auto count = static_cast(shape_size(in_shape) / out_size); std::transform(out, std::next(out, out_size), out, [count](const T value) { diff --git a/src/core/reference/include/openvino/reference/utils/registers_pool.hpp b/src/core/reference/include/openvino/reference/utils/registers_pool.hpp index 62dfe01ec4ef1d..4861ef4f7d999d 100644 --- a/src/core/reference/include/openvino/reference/utils/registers_pool.hpp +++ 
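// A minimal, illustrative sketch of how a transformation can adopt the new
// OPENVINO_MATCHER_PASS_RTTI macro introduced above (instead of plain OPENVINO_RTTI).
// The pass name "MyReluPass" and the matched pattern are assumptions made for this
// sketch, not part of the change itself.

#include <memory>

#include "openvino/op/relu.hpp"
#include "openvino/pass/matcher_pass.hpp"
#include "openvino/pass/pattern/matcher.hpp"
#include "openvino/pass/pattern/op/wrap_type.hpp"

class MyReluPass : public ov::pass::MatcherPass {
public:
    OPENVINO_MATCHER_PASS_RTTI("MyReluPass");  // or ("MyReluPass", "0") to pin a version explicitly
    MyReluPass() {
        auto relu = ov::pass::pattern::wrap_type<ov::op::v0::Relu>();
        ov::matcher_pass_callback callback = [](ov::pass::pattern::Matcher&) {
            return false;  // match-only sketch: no graph rewrite is performed
        };
        register_matcher(std::make_shared<ov::pass::pattern::Matcher>(relu, "MyReluPass"), callback);
    }
};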
b/src/core/reference/include/openvino/reference/utils/registers_pool.hpp @@ -64,7 +64,13 @@ class RegistersPool { } void release() { if (auto pool = regPool.lock()) { - pool->return_to_pool(reg); + try { + pool->return_to_pool(reg); + } catch (...) { + // This function is called by destructor and should not throw. Well formed Reg object won't cause + // any exception throw from return_to_pool, while on badly formed object the destructor is most + // likely called during exception stack unwind. + } regPool.reset(); } } @@ -90,8 +96,10 @@ class RegistersPool { RegistersPool::WeakPtr regPool; }; + static thread_local bool is_created; + virtual ~RegistersPool() { - check_unique_and_update(false); + is_created = false; } template @@ -178,7 +186,7 @@ class RegistersPool { } } - void check_unique_and_update(bool isCtor = true); + void check_unique_and_update(); PhysicalSet m_general_set; PhysicalSet m_simd_set; diff --git a/src/core/reference/src/utils/registers_pool.cpp b/src/core/reference/src/utils/registers_pool.cpp index 413fdcc3ed83cf..a1e6462aa51a36 100644 --- a/src/core/reference/src/utils/registers_pool.cpp +++ b/src/core/reference/src/utils/registers_pool.cpp @@ -34,16 +34,12 @@ RegistersPool::RegistersPool(std::initializer_list regsToExclude, in m_general_set.exclude(Xbyak::Reg64(Xbyak::Operand::RSP)); } -void RegistersPool::check_unique_and_update(bool is_ctor) { - static thread_local bool is_created = false; - if (is_ctor) { - if (is_created) { - OPENVINO_THROW("There should be only one instance of RegistersPool per thread"); - } - is_created = true; - } else { - is_created = false; - } +thread_local bool RegistersPool::is_created = false; + +void RegistersPool::check_unique_and_update() { + OPENVINO_ASSERT(!is_created, "There should be only one instance of RegistersPool per thread"); + + is_created = true; } void RegistersPool::PhysicalSet::set_as_used(size_t reg_idx) { diff --git a/src/core/shape_inference/include/glu_shape_inference.hpp b/src/core/shape_inference/include/glu_shape_inference.hpp new file mode 100644 index 00000000000000..365b57244036a2 --- /dev/null +++ b/src/core/shape_inference/include/glu_shape_inference.hpp @@ -0,0 +1,34 @@ +// Copyright (C) 2018-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "ov_ops/glu.hpp" +#include "utils.hpp" +#include "variadic_split_shape_inference.hpp" + +namespace ov { +namespace op { +namespace internal { +template > +std::vector shape_infer(const GLU* op, const std::vector& input_shapes) { + const auto inputs_count = input_shapes.size(); + NODE_SHAPE_INFER_CHECK(op, input_shapes, inputs_count == 1); + + int64_t axis = op->get_axis(); + std::vector split_lengths = {op->get_split_lengths(), -1}; + std::unordered_map const_data; + const_data.emplace(1, ov::Tensor(ov::element::i64, ov::Shape{}, &axis)); + const_data.emplace(2, ov::Tensor(ov::element::i64, ov::Shape{split_lengths.size()}, split_lengths.data())); + + const ov::Shape split_len_size{split_lengths.size()}; + const ov::Shape scalar{}; + std::vector variadic_split_input_shapes{input_shapes[0], scalar, split_len_size}; + + return {std::move( + ov::op::variadic_split::shape_infer(op, variadic_split_input_shapes, ov::make_tensor_accessor(const_data))[0])}; +} +} // namespace internal +} // namespace op +} // namespace ov diff --git a/src/core/shape_inference/include/ov_optional.hpp b/src/core/shape_inference/include/ov_optional.hpp index f7f8b474f9a5a6..15973ae0c8a5f8 100644 --- a/src/core/shape_inference/include/ov_optional.hpp +++ 
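// The RegistersPool change above replaces the bool parameter of check_unique_and_update()
// with a class-level thread_local flag: the constructor asserts the flag is clear and sets
// it, the destructor only clears it, and release() no longer lets exceptions escape.
// A simplified, stand-alone sketch of that per-thread uniqueness guard (not the real pool):

#include <stdexcept>

class PerThreadUnique {
public:
    PerThreadUnique() {
        if (is_created)
            throw std::runtime_error("There should be only one instance per thread");
        is_created = true;
    }
    ~PerThreadUnique() noexcept {
        is_created = false;  // a destructor must not throw, so it only clears the flag
    }

private:
    static thread_local bool is_created;
};

thread_local bool PerThreadUnique::is_created = false;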
b/src/core/shape_inference/include/ov_optional.hpp @@ -7,6 +7,9 @@ #include namespace ov { +#ifdef OPENVINO_CPP_17_VER +using optional = std::optional; +#else /** * @brief Store optional object of type T (basic version of std::optional). @@ -132,4 +135,5 @@ class optional { bool m_has_value = false; Storage m_opt{}; }; +#endif } // namespace ov diff --git a/src/core/shape_inference/include/variadic_split_shape_inference.hpp b/src/core/shape_inference/include/variadic_split_shape_inference.hpp index a0eff51f238e61..e0cd837003a331 100644 --- a/src/core/shape_inference/include/variadic_split_shape_inference.hpp +++ b/src/core/shape_inference/include/variadic_split_shape_inference.hpp @@ -10,10 +10,9 @@ namespace ov { namespace op { -namespace v1 { - +namespace variadic_split { template > -std::vector shape_infer(const VariadicSplit* op, +std::vector shape_infer(const Node* op, const std::vector& input_shapes, const ITensorAccessor& ta = make_tensor_accessor()) { constexpr bool is_dynamic_shape = std::is_base_of::value; @@ -120,6 +119,15 @@ std::vector shape_infer(const VariadicSplit* op, } return output_shapes; } +} // namespace variadic_split + +namespace v1 { +template > +std::vector shape_infer(const VariadicSplit* op, + const std::vector& input_shapes, + const ITensorAccessor& ta = make_tensor_accessor()) { + return op::variadic_split::shape_infer(op, input_shapes, ta); +} } // namespace v1 } // namespace op diff --git a/src/core/src/any.cpp b/src/core/src/any.cpp index 82dc01c99377fd..346819eced93e5 100644 --- a/src/core/src/any.cpp +++ b/src/core/src/any.cpp @@ -6,6 +6,17 @@ #include #include +namespace { +template +bool contains_type_index(Container&& types, const std::type_info& user_type) { + for (auto&& type : types) { + if (ov::util::equal(type, user_type)) { + return true; + } + } + return false; +} +} // namespace namespace ov { @@ -68,6 +79,48 @@ void Any::Base::read_to(Base& other) const { } } +bool Any::Base::is_base_type_info(const std::type_info& user_type) const { + return contains_type_index(base_type_info(), user_type); +} + +bool Any::Base::is_signed_integral() const { + return std::is_signed::value ? contains_type_index(std::initializer_list{typeid(char), + typeid(signed char), + typeid(short), + typeid(int), + typeid(long), + typeid(long long)}, + type_info()) + : contains_type_index(std::initializer_list{typeid(signed char), + typeid(short), + typeid(int), + typeid(long), + typeid(long long)}, + type_info()); +} + +bool Any::Base::is_unsigned_integral() const { + return std::is_signed::value + ? contains_type_index(std::initializer_list{typeid(unsigned char), + typeid(unsigned short), + typeid(unsigned int), + typeid(unsigned long), + typeid(unsigned long long)}, + type_info()) + : contains_type_index(std::initializer_list{typeid(char), + typeid(unsigned char), + typeid(unsigned short), + typeid(unsigned int), + typeid(unsigned long), + typeid(unsigned long long)}, + type_info()); +} +bool Any::Base::is_floating_point() const { + return contains_type_index( + std::initializer_list{typeid(float), typeid(double), typeid(long double)}, + type_info()); +} + Any::~Any() { _temp = {}; _impl = {}; @@ -293,4 +346,42 @@ void Write::operator()(std::ostream& os, const Any& any) const { } } // namespace util + +template +[[noreturn]] U Any::Base::convert_impl() const { + OPENVINO_THROW("Bad cast from: ", type_info().name(), " to: ", typeid(U).name()); +} + +template +U Any::Base::convert_impl() const { + return is() ? 
static_cast(as()) : convert_impl(); +} + +template <> +long long Any::Base::convert() const { + return std::is_signed::value ? convert_impl() + : convert_impl(); +} + +template <> +unsigned long long Any::Base::convert() const { + return std::is_signed::value ? convert_impl() + : convert_impl(); +} + +template <> +double Any::Base::convert() const { + return convert_impl(); +} } // namespace ov diff --git a/src/core/src/descriptor/input.cpp b/src/core/src/descriptor/input.cpp index 544abd10945806..28288002780739 100644 --- a/src/core/src/descriptor/input.cpp +++ b/src/core/src/descriptor/input.cpp @@ -71,14 +71,6 @@ ov::descriptor::Tensor& ov::descriptor::Input::get_tensor() { return m_output->get_tensor(); } -std::shared_ptr ov::descriptor::Input::get_tensor_ptr() const { - return m_output->get_tensor_ptr(); -} - -std::shared_ptr ov::descriptor::Input::get_tensor_ptr() { - return m_output->get_tensor_ptr(); -} - const ov::Shape& ov::descriptor::Input::get_shape() const { return m_output->get_shape(); } diff --git a/src/core/src/descriptor/shared_tensor.cpp b/src/core/src/descriptor/shared_tensor.cpp new file mode 100644 index 00000000000000..314aa524bcacec --- /dev/null +++ b/src/core/src/descriptor/shared_tensor.cpp @@ -0,0 +1,125 @@ +// Copyright (C) 2018-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "openvino/core/descriptor/output.hpp" +#include "openvino/core/descriptor_tensor.hpp" +#include "openvino/util/common_util.hpp" + +namespace ov { +namespace descriptor { +/** + * @brief Dedicated tensor descriptor implementation to share input descriptor. + * + * Shared tensor share input tensor but have specific properties: + * - tensor names - if set these are used as descriptor names and appended to input tensor because is same tensor + */ +class SharedTensor : public ITensorDescriptor { +public: + SharedTensor(std::shared_ptr tensor) + : m_shared_tensor{std::move(tensor)}, + m_output_names{}, + m_name_it{} { + OPENVINO_ASSERT(m_shared_tensor, "Cannot set NULL tensor descriptor"); + } + + // --- ITensorDescriptor API + virtual const element::Type& get_element_type() const override { + return m_shared_tensor->get_element_type(); + } + + virtual const PartialShape& get_partial_shape() const override { + return m_shared_tensor->get_partial_shape(); + } + + virtual const Shape& get_shape() const override { + return m_shared_tensor->get_shape(); + } + + virtual void set_type_shape(const element::Type& et, const PartialShape& shape) override { + m_shared_tensor->set_type_shape(et, shape); + } + + void set_names(const std::unordered_set& names) override { + rm_tensor_output_names(); + m_output_names = names; + m_name_it = std::min_element(m_output_names.begin(), m_output_names.end()); + m_shared_tensor->add_names(m_output_names); + } + + void add_names(const std::unordered_set& names) override { + m_output_names.insert(names.begin(), names.end()); + m_name_it = std::min_element(m_output_names.begin(), m_output_names.end()); + m_shared_tensor->add_names(names); + } + + const std::unordered_set& get_names() const override { + return m_output_names.empty() ? m_shared_tensor->get_names() : m_output_names; + } + + const std::unordered_set& get_all_names() const override { + return m_shared_tensor->get_names(); + } + + const std::string& get_any_name() const override { + return m_output_names.empty() ? 
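// With the convert/is_signed_integral/is_unsigned_integral/is_floating_point helpers added
// above, a numeric ov::Any payload can be read back through as<T>() for other arithmetic
// types. A small usage sketch (values are illustrative; this mirrors the typed tests below):

#include <cassert>
#include <string>

#include "openvino/core/any.hpp"

inline void any_numeric_conversion_demo() {
    ov::Any a = 10;                       // stored as int
    assert(a.as<long long>() == 10);      // widened via the signed-integral path
    assert(a.as<double>() == 10.0);       // converted via the floating-point path
    assert(a.as<std::string>() == "10");  // string conversion was already available
}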
m_shared_tensor->get_any_name() : *m_name_it; + } + + const RTMap& rt_map() const override { + return m_shared_tensor->rt_map(); + } + + RTMap& rt_map() override { + return m_shared_tensor->rt_map(); + } + + size_t pointer_hash() const noexcept override { + return m_shared_tensor->pointer_hash(); + } + + // --- SharedTensor specific interface + void set_tensor(std::shared_ptr tensor) { + if (tensor != m_shared_tensor) { + OPENVINO_ASSERT(tensor, "Cannot set NULL tensor descriptor"); + rm_tensor_output_names(); + auto prev_rt_map = rt_map(); + + m_shared_tensor = std::move(tensor); + m_shared_tensor->add_names(m_output_names); + rt_map().insert(std::make_move_iterator(prev_rt_map.begin()), std::make_move_iterator(prev_rt_map.end())); + } + } + +private: + void rm_tensor_output_names() { + auto names = m_shared_tensor->get_names(); + for (const auto& output_name : m_output_names) { + names.erase(output_name); + } + + m_shared_tensor->set_names(names); + } + + std::shared_ptr m_shared_tensor; + std::unordered_set m_output_names; + std::unordered_set::const_iterator m_name_it; +}; + +/** + * @brief Set output tensor descriptor with shared tensor from new input. + * + * @param output Output descriptor to be updated. + * @param input Input descriptor to set as shared tensor. + */ +void set_shared_tensor(Output& output, const Input& input) { + auto& output_descriptor = TensorExtension::get_descriptor_ptr(output.get_tensor()); + const auto& input_descriptor = TensorExtension::get_descriptor_ptr(input.get_output().get_tensor()); + if (auto* result_ptr = dynamic_cast(output_descriptor.get())) { + result_ptr->set_tensor(input_descriptor); + } else { + output_descriptor = std::make_shared(input_descriptor); + } +} + +} // namespace descriptor +} // namespace ov diff --git a/src/core/src/descriptor/tensor.cpp b/src/core/src/descriptor/tensor.cpp index ae3f7c6e77cd4f..6e85b25b2b9f8a 100644 --- a/src/core/src/descriptor/tensor.cpp +++ b/src/core/src/descriptor/tensor.cpp @@ -8,26 +8,153 @@ #include "openvino/core/descriptor_tensor.hpp" #include "openvino/core/except.hpp" #include "openvino/core/node.hpp" +#include "openvino/core/type/element_iterator.hpp" #include "openvino/op/util/symbolic_info.hpp" +#include "openvino/util/common_util.hpp" -ov::descriptor::Tensor::Tensor(const element::Type& element_type, - const PartialShape& pshape, - const std::unordered_set& names) - : m_element_type(element_type), - m_partial_shape(pshape) { - set_names(names); +namespace ov { +namespace descriptor { + +/** @brief Helper class to store Tensor shape information.*/ +class ShapeInfo { +public: + ShapeInfo() = default; + ShapeInfo(const PartialShape& shape) : m_partial_shape{shape} {} + + void set_partial_shape(PartialShape shape) { + AtomicGuard lock(m_shape_changing); + m_partial_shape = std::move(shape); + m_shape_changed = true; + } + + const PartialShape& get_partial_shape() const { + return m_partial_shape; + } + + const Shape& get_shape() const { + AtomicGuard lock(m_shape_changing); + if (m_shape_changed) { + m_shape = m_partial_shape.to_shape(); + m_shape_changed = false; + } + return m_shape; + } + +private: + PartialShape m_partial_shape{}; + mutable Shape m_shape{}; + mutable std::atomic m_shape_changing{false}; + mutable bool m_shape_changed{true}; +}; + +// --- Tensor descriptor interface +ITensorDescriptor::~ITensorDescriptor() = default; + +/** @brief Basic tensor descriptor. 
*/ +class BasicTensor : public ITensorDescriptor { +public: + BasicTensor() = default; + + BasicTensor(const element::Type& et, const PartialShape& shape, const std::unordered_set& names) + : m_element_type{et}, + m_shape_info{shape}, + m_names{names}, + m_name_it{find_new_any_name(m_names)}, + m_rt_map{}, + m_legacy_name{} {} + + virtual const element::Type& get_element_type() const override { + return m_element_type; + } + + virtual const PartialShape& get_partial_shape() const override { + return m_shape_info.get_partial_shape(); + } + + virtual const Shape& get_shape() const override { + return m_shape_info.get_shape(); + } + + virtual void set_type_shape(const element::Type& et, const PartialShape& shape) override { + m_element_type = et; + m_shape_info.set_partial_shape(shape); + } + + void set_names(const std::unordered_set& names) override { + m_names = names; + m_name_it = find_new_any_name(m_names); + }; + + void add_names(const std::unordered_set& names) override { + m_names.insert(names.begin(), names.end()); + m_name_it = find_new_any_name(m_names); + } + + const std::unordered_set& get_names() const override { + return m_names; + } + + const std::unordered_set& get_all_names() const override { + return get_names(); + } + + const std::string& get_any_name() const override { + OPENVINO_ASSERT(!get_names().empty(), "Attempt to get a name for a Tensor without names"); + return *m_name_it; + } + + const RTMap& rt_map() const override { + return m_rt_map; + } + + RTMap& rt_map() override { + return m_rt_map; + }; + + size_t pointer_hash() const noexcept override { + return std::hash()(this); + } + +private: + element::Type m_element_type; + ShapeInfo m_shape_info; + std::unordered_set m_names; + std::unordered_set::const_iterator m_name_it; + RTMap m_rt_map; + std::string m_legacy_name; + + static decltype(m_name_it) find_new_any_name(const decltype(m_names)& names) { + return std::min_element(names.begin(), names.end()); + } +}; + +// --- TensorExtension +const ITensorDescriptor& TensorExtension::get_descriptor(const Tensor& tensor) { + return *tensor.m_impl; } -ov::descriptor::Tensor::Tensor(const element::Type& element_type, - const PartialShape& pshape, - ov::Node* node, - size_t node_output_number) - : m_element_type(element_type), - m_partial_shape(pshape) { - m_name_it = m_names.cend(); +std::shared_ptr& TensorExtension::get_descriptor_ptr(Tensor& tensor) { + return tensor.m_impl; } -void ov::descriptor::Tensor::invalidate_values() { +bool TensorExtension::Equal::operator()(const std::shared_ptr& lhs, const std::shared_ptr& rhs) const { + return TensorExtension::get_descriptor(*lhs).pointer_hash() == TensorExtension::get_descriptor(*rhs).pointer_hash(); +} + +size_t TensorExtension::Hasher::operator()(const std::shared_ptr& tensor) const { + return get_descriptor(*tensor).pointer_hash(); +} + +// --- Tensor +Tensor::Tensor(const element::Type& element_type, + const PartialShape& pshape, + const std::unordered_set& names) + : m_impl(std::make_shared(element_type, pshape, names)) {} + +Tensor::Tensor(const element::Type& element_type, const PartialShape& pshape, ov::Node* node, size_t) + : m_impl(std::make_shared(element_type, pshape, std::unordered_set{})) {} + +void Tensor::invalidate_values() { if (ov::skip_invalidation(*this)) return; m_upper_value = {}; @@ -35,110 +162,110 @@ void ov::descriptor::Tensor::invalidate_values() { m_value_symbol.clear(); } -void ov::descriptor::Tensor::set_lower_value(const ov::Tensor& value) { +void Tensor::set_lower_value(const ov::Tensor& 
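// BasicTensor::find_new_any_name above fixes the "any name" rule: get_any_name() returns
// the lexicographically smallest of the tensor's names (std::min_element over the set).
// A tiny stand-alone illustration with made-up names:

#include <algorithm>
#include <cassert>
#include <string>
#include <unordered_set>

inline void any_name_rule_demo() {
    const std::unordered_set<std::string> names{"relu_out", "output", "Z"};
    const auto any_name = *std::min_element(names.begin(), names.end());
    assert(any_name == "Z");  // 'Z' (0x5A) sorts before any lowercase letter
}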
value) { OPENVINO_ASSERT(static_cast(value)); - OPENVINO_ASSERT(m_partial_shape.same_scheme(value.get_shape())); - OPENVINO_ASSERT(m_element_type == value.get_element_type()); + OPENVINO_ASSERT(get_partial_shape().same_scheme(value.get_shape())); + OPENVINO_ASSERT(get_element_type() == value.get_element_type()); m_lower_value = value; } -void ov::descriptor::Tensor::set_upper_value(const ov::Tensor& value) { +void Tensor::set_upper_value(const ov::Tensor& value) { OPENVINO_ASSERT(static_cast(value)); - OPENVINO_ASSERT(m_partial_shape.same_scheme(value.get_shape())); - OPENVINO_ASSERT(m_element_type == value.get_element_type()); + OPENVINO_ASSERT(get_partial_shape().same_scheme(value.get_shape())); + OPENVINO_ASSERT(get_element_type() == value.get_element_type()); m_upper_value = value; } -void ov::descriptor::Tensor::set_value_symbol(const TensorSymbol& value_symbol) { +void Tensor::set_value_symbol(const TensorSymbol& value_symbol) { const auto& symbols_size = value_symbol.size(); if (symbols_size == 0) { m_value_symbol.clear(); } else { - OPENVINO_ASSERT(m_partial_shape.is_static()); - OPENVINO_ASSERT(shape_size(m_partial_shape.to_shape()) == symbols_size); + OPENVINO_ASSERT(get_partial_shape().is_static()); + OPENVINO_ASSERT(shape_size(get_partial_shape().to_shape()) == symbols_size); m_value_symbol = value_symbol; } } -const ov::Shape& ov::descriptor::Tensor::get_shape() const { - AtomicGuard lock(m_shape_changing); - if (m_shape_changed) { - m_shape = m_partial_shape.to_shape(); - m_shape_changed = false; - } - return m_shape; +const ov::Tensor& Tensor::get_lower_value() const { + return m_lower_value; } -size_t ov::descriptor::Tensor::size() const { - const bool bitwidth_less_than_byte = m_element_type.bitwidth() < 8; - return bitwidth_less_than_byte ? 
(shape_size(get_shape()) * m_element_type.bitwidth() + 7) >> 3 - : (shape_size(get_shape()) * m_element_type.size()); +const ov::Tensor& Tensor::get_upper_value() const { + return m_upper_value; } -const std::unordered_set& ov::descriptor::Tensor::get_names() const { - return m_names; +TensorSymbol Tensor::get_value_symbol() const { + return m_value_symbol; } -const std::string& ov::descriptor::Tensor::get_any_name() const { - if (m_name_it == m_names.cend()) { - OPENVINO_THROW("Attempt to get a name for a Tensor without names"); - } - return *m_name_it; +bool Tensor::has_and_set_bound() const { + return m_upper_value && m_lower_value && m_upper_value.data() == m_lower_value.data(); } -void ov::descriptor::Tensor::set_names(const std::unordered_set& names) { - m_names = names; - m_name_it = m_names.cbegin(); - for (auto it = m_names.cbegin(); it != m_names.cend(); it++) { - if (*it < *m_name_it) - // Update any name - m_name_it = it; - } +const element::Type& Tensor::get_element_type() const { + return m_impl->get_element_type(); } -void ov::descriptor::Tensor::add_names(const std::unordered_set& names) { - for (const auto& name : names) { - auto res = m_names.insert(name); - if (m_name_it == m_names.end() || *res.first < *m_name_it) - // Update any name - m_name_it = res.first; - } +const PartialShape& Tensor::get_partial_shape() const { + return m_impl->get_partial_shape(); +} +const Shape& Tensor::get_shape() const { + return m_impl->get_shape(); } -void ov::descriptor::Tensor::clone_from(const ov::descriptor::Tensor& old) { - { - AtomicGuard lock(m_shape_changing); - m_partial_shape = old.get_partial_shape(); - m_shape_changed = true; - } - set_names(old.get_names()); - m_element_type = old.get_element_type(); - m_lower_value = old.get_lower_value(); - m_upper_value = old.get_upper_value(); - m_value_symbol = old.get_value_symbol(); - m_rt_info = old.get_rt_info(); +size_t Tensor::size() const { + return element::get_memory_size(get_element_type(), shape_size(get_shape())); } -void ov::descriptor::set_tensor_type(ov::descriptor::Tensor& tensor, - const element::Type& element_type, - const PartialShape& pshape) { - tensor.m_element_type = element_type; - AtomicGuard lock(tensor.m_shape_changing); - tensor.m_partial_shape = pshape; - tensor.m_shape_changed = true; +const std::unordered_set& Tensor::get_names() const { + return m_impl->get_names(); } -void ov::descriptor::set_element_type(ov::descriptor::Tensor& tensor, const element::Type& element_type) { - tensor.m_element_type = element_type; +const RTMap& Tensor::get_rt_info() const { + return m_impl->rt_map(); } -std::ostream& ov::descriptor::operator<<(std::ostream& out, const ov::descriptor::Tensor& tensor) { - std::string names; - for (const auto& name : tensor.get_names()) { - if (!names.empty()) - names += ", "; - names += name; - } - out << "Tensor(" << names << ")"; +RTMap& Tensor::get_rt_info() { + return m_impl->rt_map(); +} + +const std::string& Tensor::get_any_name() const { + return m_impl->get_any_name(); +} + +void Tensor::set_names(const std::unordered_set& names) { + m_impl->set_names(names); +} + +void Tensor::add_names(const std::unordered_set& names) { + m_impl->add_names(names); +} + +void Tensor::clone_from(const Tensor& other) { + m_impl->set_type_shape(other.get_element_type(), other.get_partial_shape()); + set_names(other.get_names()); + m_lower_value = other.get_lower_value(); + m_upper_value = other.get_upper_value(); + m_value_symbol = other.get_value_symbol(); + get_rt_info() = other.get_rt_info(); +} + 
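// Tensor::size() now delegates to element::get_memory_size instead of the inline bit
// arithmetic removed above. A simplified sketch of the rule that arithmetic expressed:
// sub-byte element types are rounded up to whole bytes.

#include <cstddef>

constexpr std::size_t memory_size_bytes(std::size_t element_count, std::size_t bitwidth) {
    return bitwidth < 8 ? (element_count * bitwidth + 7) / 8 : element_count * (bitwidth / 8);
}

static_assert(memory_size_bytes(10, 4) == 5, "ten u4 values pack into five bytes");
static_assert(memory_size_bytes(3, 1) == 1, "three u1 values still occupy one byte");
static_assert(memory_size_bytes(4, 32) == 16, "four 32-bit values take sixteen bytes");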
+void set_tensor_type(Tensor& tensor, const element::Type& element_type, const PartialShape& pshape) { + TensorExtension::get_descriptor_ptr(tensor)->set_type_shape(element_type, pshape); +} + +void set_element_type(Tensor& tensor, const element::Type& element_type) { + TensorExtension::get_descriptor_ptr(tensor)->set_type_shape(element_type, tensor.get_partial_shape()); +} + +void copy_tensor_names(Tensor& dst, const Tensor& src) { + dst.set_names(TensorExtension::get_descriptor(src).get_all_names()); +} + +std::ostream& operator<<(std::ostream& out, const Tensor& tensor) { + out << "Tensor(" << util::join(tensor.get_names()) << ")"; return out; } +} // namespace descriptor +} // namespace ov diff --git a/src/core/src/except.cpp b/src/core/src/except.cpp index 6ce0568e04e387..7cddc5b3ec4a52 100644 --- a/src/core/src/except.cpp +++ b/src/core/src/except.cpp @@ -45,8 +45,12 @@ void ov::AssertFailure::create(const char* file, throw ov::AssertFailure(make_what(file, line, check_string, context_info, explanation)); } +ov::AssertFailure::~AssertFailure() = default; + void ov::NotImplemented::create(const char* file, int line, const std::string& explanation) { throw ov::NotImplemented(make_what(file, line, nullptr, default_msg, explanation)); } +ov::NotImplemented::~NotImplemented() = default; + const std::string ov::NotImplemented::default_msg{"Not Implemented"}; diff --git a/src/core/src/node.cpp b/src/core/src/node.cpp index ec9197a5a337cb..689e1c80af12a0 100644 --- a/src/core/src/node.cpp +++ b/src/core/src/node.cpp @@ -155,8 +155,8 @@ std::shared_ptr ov::Node::copy_with_new_inputs( for (auto& cdep : control_dependencies) { clone->add_control_dependency(cdep); } - for (size_t i = 0; i < get_output_size(); i++) { - clone->get_output_tensor(i).set_names(get_output_tensor(i).get_names()); + for (size_t i = 0; i < get_output_size(); ++i) { + descriptor::copy_tensor_names(clone->get_output_tensor(i), get_output_tensor(i)); } return clone; } @@ -218,9 +218,8 @@ ov::descriptor::Input& ov::Node::get_input_descriptor(size_t position) { ov::descriptor::Output& ov::Node::get_output_descriptor(size_t position) { while (m_outputs.size() <= position) { - size_t i = m_outputs.size(); - auto tensor_descriptor = make_shared(element::dynamic, PartialShape::dynamic(), this, i); - m_outputs.emplace_back(this, i, tensor_descriptor); + const auto i = m_outputs.size(); + m_outputs.emplace_back(this, i, make_shared(element::dynamic, PartialShape::dynamic())); } return m_outputs[position]; } @@ -468,8 +467,8 @@ ov::descriptor::Tensor& ov::Node::get_output_tensor(size_t i) const { ov::descriptor::Tensor& ov::Node::get_input_tensor(size_t i) const { OPENVINO_ASSERT(i < m_inputs.size(), idx_txt, i, out_of_range_txt); - descriptor::Input input = m_inputs[i]; - return input.get_tensor(); + auto& input = m_inputs[i]; + return input.get_output().get_tensor(); } size_t ov::Node::get_input_size() const { diff --git a/src/core/src/op/result.cpp b/src/core/src/op/result.cpp index 237d6bd7a2084a..97dc95a0e53f17 100644 --- a/src/core/src/op/result.cpp +++ b/src/core/src/op/result.cpp @@ -9,6 +9,7 @@ #include #include "itt.hpp" +#include "openvino/core/descriptor_tensor.hpp" namespace ov { namespace op { @@ -22,10 +23,8 @@ void Result::validate_and_infer_types() { OV_OP_SCOPE(v0_Result_validate_and_infer_types); NODE_VALIDATION_CHECK(this, get_input_size() == 1, "Argument has ", get_input_size(), " outputs (1 expected)."); - // Result doesn't change change in/out tensors - auto& output = get_output_descriptor(0); - auto& input = 
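// At the public API level, the shared Result tensor (set_shared_tensor above, used in the
// result.cpp hunk here) is what makes the naming examples from the result.hpp comment
// observable: names set on the Result output become the model output names while the
// producing node keeps its own. A sketch with illustrative names:

#include <memory>

#include "openvino/core/model.hpp"
#include "openvino/op/parameter.hpp"
#include "openvino/op/relu.hpp"
#include "openvino/op/result.hpp"

inline std::shared_ptr<ov::Model> make_named_model() {
    auto param = std::make_shared<ov::op::v0::Parameter>(ov::element::f32, ov::Shape{1, 3});
    auto relu = std::make_shared<ov::op::v0::Relu>(param);
    relu->output(0).set_names({"N1"});  // node-level name
    auto result = std::make_shared<ov::op::v0::Result>(relu);
    result->output(0).set_names({"R1", "R2"});  // model output names
    return std::make_shared<ov::Model>(ov::ResultVector{result}, ov::ParameterVector{param});
}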
get_input_descriptor(0); - output.set_tensor_ptr(input.get_tensor_ptr()); + // Result shares input tensor but can have specific properties which are added/removed to input. + descriptor::set_shared_tensor(get_output_descriptor(0), get_input_descriptor(0)); } std::shared_ptr Result::clone_with_new_inputs(const OutputVector& new_args) const { diff --git a/src/core/src/pass/manager.cpp b/src/core/src/pass/manager.cpp index 9168292f5284c0..a6f1fc287e221c 100644 --- a/src/core/src/pass/manager.cpp +++ b/src/core/src/pass/manager.cpp @@ -104,8 +104,8 @@ class stopwatch { void stop() { if (m_active) { - auto end_time = m_clock.now(); - m_last_time = end_time - m_start_time; + m_end_time = m_clock.now(); + m_last_time = m_end_time - m_start_time; m_active = false; } } @@ -122,9 +122,17 @@ class stopwatch { return std::chrono::duration_cast(get_timer_value()).count(); } + std::chrono::nanoseconds get_start_time() const { + return std::chrono::duration_cast(m_start_time.time_since_epoch()); + } + + std::chrono::nanoseconds get_end_time() const { + return std::chrono::duration_cast(m_end_time.time_since_epoch()); + } + private: std::chrono::high_resolution_clock m_clock; - std::chrono::time_point m_start_time; + std::chrono::time_point m_start_time, m_end_time; bool m_active = false; std::chrono::nanoseconds m_last_time = std::chrono::high_resolution_clock::duration::zero(); }; @@ -221,6 +229,8 @@ class Profiler { if (is_pass_manager) { m_file << "m;" << name << ";" << stopwatch.get_timer_value().count() << ";" << (applied ? "1" : "0") << std::endl; + m_file << "m_start;" << name << ";" << stopwatch.get_start_time().count() << std::endl; + m_file << "m_end;" << name << ";" << stopwatch.get_end_time().count() << std::endl; } else { m_file << "t;" << name << ";" << m_manager_name << ";" << stopwatch.get_timer_value().count() << ";" << (applied ? 
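// The stopwatch in pass/manager.cpp now keeps the end time point as well, so the profiler
// can report absolute "m_start"/"m_end" timestamps next to the duration. A reduced,
// stand-alone sketch of that shape (member names mirror the diff, the class is simplified):

#include <chrono>

class Stopwatch {
    using Clock = std::chrono::high_resolution_clock;

public:
    void start() { m_start = Clock::now(); }
    void stop() { m_end = Clock::now(); }

    std::chrono::nanoseconds elapsed() const {
        return std::chrono::duration_cast<std::chrono::nanoseconds>(m_end - m_start);
    }
    std::chrono::nanoseconds start_ns() const {  // absolute value written as "m_start"
        return std::chrono::duration_cast<std::chrono::nanoseconds>(m_start.time_since_epoch());
    }
    std::chrono::nanoseconds end_ns() const {  // absolute value written as "m_end"
        return std::chrono::duration_cast<std::chrono::nanoseconds>(m_end.time_since_epoch());
    }

private:
    Clock::time_point m_start{}, m_end{};
};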
"1" : "0") << std::endl; diff --git a/src/core/src/preprocess/pre_post_process.cpp b/src/core/src/preprocess/pre_post_process.cpp index d81d48082cde04..b408755a7d85a8 100644 --- a/src/core/src/preprocess/pre_post_process.cpp +++ b/src/core/src/preprocess/pre_post_process.cpp @@ -56,6 +56,10 @@ struct PrePostProcessor::PrePostProcessorImpl { PrePostProcessorImpl() = default; explicit PrePostProcessorImpl(const std::shared_ptr& f) : m_function(f) { OPENVINO_ASSERT(f, "Model can't be nullptr for PrePostProcessor"); + + // if IR version < 11, set compatibility mode + const auto names_mode = m_function->has_rt_info("version") && m_function->get_rt_info("version") < 11; + for (size_t i = 0; i < m_function->inputs().size(); ++i) { auto info = InputInfo(); info.m_impl->m_resolved_param = m_function->get_parameters()[i]; @@ -64,6 +68,7 @@ struct PrePostProcessor::PrePostProcessorImpl { for (size_t i = 0; i < m_function->outputs().size(); ++i) { auto info = OutputInfo(); info.m_impl->m_output_node = m_function->output(i); + info.m_impl->get_tensor_data()->set_names_compatibility_mode(names_mode); m_outputs.push_back(std::move(info)); } } diff --git a/src/core/src/preprocess/preprocess_impls.cpp b/src/core/src/preprocess/preprocess_impls.cpp index cbe18a78beb575..e0cdee2e76a140 100644 --- a/src/core/src/preprocess/preprocess_impls.cpp +++ b/src/core/src/preprocess/preprocess_impls.cpp @@ -6,6 +6,7 @@ #include "layout_utils.hpp" #include "openvino/core/descriptor_tensor.hpp" +#include "openvino/util/common_util.hpp" namespace ov { namespace preprocess { @@ -325,11 +326,9 @@ void InputInfo::InputInfoImpl::dump(std::ostream& str, //----------- OutputInfoImpl ---------- void OutputInfo::OutputInfoImpl::build(ov::ResultVector& results) { - std::shared_ptr result; auto node = m_output_node; - const auto start_out_node_names = node.get_tensor().get_names(); - node.get_tensor().set_names({}); - result = std::dynamic_pointer_cast(node.get_node_shared_ptr()); + const auto result = ov::as_type_ptr(node.get_node_shared_ptr()); + // Set result layout from 'model' information if (get_model_data()->is_layout_set()) { // Overwrite existing model's layout here (fix 74065) @@ -369,49 +368,46 @@ void OutputInfo::OutputInfoImpl::build(ov::ResultVector& results) { node = std::get<0>(action_result); post_processing_applied = true; } - // Restore tensor names - node.get_tensor().set_names(start_out_node_names); + auto orig_parent = result->get_input_source_output(0).get_node_shared_ptr(); - bool reset_orig_friendly_name = false; - if (!post_processing_applied) { - return; - } - if (orig_parent->get_output_size() == 1) { - node.get_node_shared_ptr()->set_friendly_name(orig_parent->get_friendly_name()); - reset_orig_friendly_name = true; + if (get_tensor_data()->get_names_compatibility_mode()) { + // Move result tensor names from previous input to new + const auto result_input_names = result->get_input_tensor(0).get_names(); + result->get_input_tensor(0).set_names({}); + node.get_tensor().set_names(result_input_names); + + if (!post_processing_applied) { + return; + } + + if (orig_parent->get_output_size() == 1) { + node.get_node_shared_ptr()->set_friendly_name(orig_parent->get_friendly_name()); + + // Reset friendly name of input node to avoid names collision + // when there is at a new node inserted by post-processing steps + // If no new nodes are inserted by post-processing, then we need to preserve friendly name of input + // as it's required for old API correct work + 
result->get_input_source_output(0).get_node_shared_ptr()->set_friendly_name(""); + } else if (node.get_node_shared_ptr() != orig_parent) { + // Result node is changed - add "." suffix + node.get_node_shared_ptr()->set_friendly_name( + orig_parent->get_friendly_name() + "." + + std::to_string(result->get_input_source_output(0).get_index())); + } + result->input(0).replace_source_output(node); + result->revalidate_and_infer_types(); } else if (node.get_node_shared_ptr() != orig_parent) { // Result node is changed - add "." suffix - node.get_node_shared_ptr()->set_friendly_name(orig_parent->get_friendly_name() + "." + - std::to_string(result->get_input_source_output(0).get_index())); - } + const auto suffix = std::string(".") + std::to_string(result->get_input_source_output(0).get_index()); + node.get_node_shared_ptr()->set_friendly_name(orig_parent->get_friendly_name() + suffix); - // Reset friendly name of input node to avoid names collision - // when there is at a new node inserted by post-processing steps - // If no new nodes are inserted by post-processing, then we need to preserve friendly name of input - // as it's required for old API correct work - if (reset_orig_friendly_name) { - result->get_input_source_output(0).get_node_shared_ptr()->set_friendly_name(""); + result->input(0).replace_source_output(node); + result->revalidate_and_infer_types(); } - // Create result - auto new_result = std::make_shared(node); - new_result->set_friendly_name(result->get_friendly_name()); - - // Preserve runtime info of original result - new_result->get_rt_info() = result->get_rt_info(); - new_result->input(0).get_rt_info() = result->input(0).get_rt_info(); - new_result->output(0).get_rt_info() = result->output(0).get_rt_info(); - // Update layout if (!context.layout().empty()) { - new_result->set_layout(context.layout()); - } - - for (auto& old_result : results) { - if (result == old_result) { - old_result = new_result; - break; - } + result->set_layout(context.layout()); } } @@ -439,7 +435,7 @@ void OutputInfo::OutputInfoImpl::dump(std::ostream& str) const { str << "Output "; if (!start_out_node_names.empty()) { - str << "\"" << *start_out_node_names.begin() << "\""; + str << "\"" << util::join(start_out_node_names) << "\""; } str << ":" << std::endl; str << " Model's data tensor: "; diff --git a/src/core/src/preprocess/preprocess_impls.hpp b/src/core/src/preprocess/preprocess_impls.hpp index 87d6b5456badc3..ee74c534c361fb 100644 --- a/src/core/src/preprocess/preprocess_impls.hpp +++ b/src/core/src/preprocess/preprocess_impls.hpp @@ -122,12 +122,21 @@ class TensorInfoImplBase { return m_layout; } + void set_names_compatibility_mode(const bool compatiblity_mode) { + m_names_compatiblity_mode = compatiblity_mode; + } + + const bool get_names_compatibility_mode() const { + return m_names_compatiblity_mode; + } + protected: element::Type m_type = element::dynamic; bool m_type_set = false; Layout m_layout = Layout(); bool m_layout_set = false; + bool m_names_compatiblity_mode = false; }; class OutputTensorInfo::OutputTensorInfoImpl : public TensorInfoImplBase {}; diff --git a/src/core/tests/any.cpp b/src/core/tests/any.cpp index 3914a617ff2982..33e928d60b872d 100644 --- a/src/core/tests/any.cpp +++ b/src/core/tests/any.cpp @@ -11,7 +11,8 @@ #include "common_test_utils/test_assertions.hpp" #include "openvino/core/runtime_attribute.hpp" -using namespace ov; +namespace ov { +namespace test { class DestructorTest { public: @@ -735,3 +736,70 @@ TEST_F(AnyTests, EmptyStringAsAny) { ASSERT_EQ(p.as>(), ref_f); 
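// The pre/post-processing changes above are reached through the regular PrePostProcessor
// flow; whether the legacy output-name moving applies is decided internally from the model's
// "version" rt_info (< 11), not by the caller. A plain usage sketch (the postprocessing step
// is illustrative):

#include <memory>

#include "openvino/core/model.hpp"
#include "openvino/core/preprocess/pre_post_process.hpp"

inline std::shared_ptr<ov::Model> add_output_postprocessing(const std::shared_ptr<ov::Model>& model) {
    ov::preprocess::PrePostProcessor ppp(model);
    ppp.output(0).postprocess().convert_element_type(ov::element::f32);
    return ppp.build();
}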
ASSERT_EQ(p.as>(), ref_i); } + +template +class AnyConversionTest : public AnyTests {}; + +TYPED_TEST_SUITE_P(AnyConversionTest); + +using AnyArithmeticTypes = ::testing::Types; + +TYPED_TEST_P(AnyConversionTest, AnyToOtherValue) { + const TypeParam test_value{static_cast(23.15f)}; + const auto a = Any{test_value}; + + EXPECT_EQ(a.as(), static_cast(test_value)); + EXPECT_EQ(a.as(), static_cast(test_value)); + EXPECT_EQ(a.as(), static_cast(test_value)); + EXPECT_EQ(a.as(), static_cast(test_value)); + + EXPECT_EQ(a.as(), static_cast(test_value)); + EXPECT_EQ(a.as(), static_cast(test_value)); + EXPECT_EQ(a.as(), static_cast(test_value)); + EXPECT_EQ(a.as(), static_cast(test_value)); + EXPECT_EQ(a.as(), static_cast(test_value)); + + EXPECT_EQ(a.as(), static_cast(test_value)); + EXPECT_EQ(a.as(), static_cast(test_value)); +} + +REGISTER_TYPED_TEST_SUITE_P(AnyConversionTest, AnyToOtherValue); +INSTANTIATE_TYPED_TEST_SUITE_P(InstantiationName, AnyConversionTest, AnyArithmeticTypes); + +TEST_F(AnyTests, AnyAsOtherTypeIsIncosisoinet) { + // To show member `as` current behaviour. + // Maybe there should be two members `as` which return value + // and `cast` returns reference if casted type is same as Any underlying type + auto a = Any{10}; + + auto& a_int = a.as(); + auto& a_str = a.as(); + + EXPECT_EQ(a_int, 10); + EXPECT_EQ(a_str, "10"); + + a_int = 15; + EXPECT_EQ(a_int, 15); + // as string ref still has old value + EXPECT_EQ(a_str, "10"); + + a_str = "30"; + EXPECT_EQ(a_int, 15); + // as string ref has new value but is not in sync what any contains. + EXPECT_EQ(a_str, "30"); +} + +} // namespace test +} // namespace ov diff --git a/src/core/tests/frontend/frontend_manager.cpp b/src/core/tests/frontend/frontend_manager.cpp index 1e42de563ddbc6..31e643e7209bdb 100644 --- a/src/core/tests/frontend/frontend_manager.cpp +++ b/src/core/tests/frontend/frontend_manager.cpp @@ -479,3 +479,29 @@ TEST(FrontEndManagerTest, Exception_Safety_Input_Model_set_tensor_value) { TEST(FrontEndManagerTest, Exception_Safety_Input_Model_set_tensor_partial_value) { CHECK_EXCEPTION_INPUT_MODEL(input_model->set_tensor_partial_value({}, {}, {})) } + +#ifdef OPENVINO_CPP_VER_17 + +TEST(FrontEndManagerTest, testFEMDestroy_InputModelHolderUsingPath) { + InputModel::Ptr input_model; + { + std::shared_ptr model; + FrontEndManager fem; + fem.register_front_end("mock1", mock_fe_path()); + auto fe = fem.load_by_framework("mock1"); + input_model = fe->load(std::filesystem::path("test")); + model = fe->convert(input_model); + EXPECT_EQ(model->get_friendly_name(), "mock1_model"); + } + ASSERT_TRUE(input_model); +} + +TEST(FrontEndManagerTest, Exception_Safety_FrontEnd_Supported_By_Path) { + EXPECT_ANY_THROW({ + FrontEndManager fem; + fem.register_front_end("mock1", mock_fe_path()); + auto fe = fem.load_by_framework("mock1"); + fe->supported(std::filesystem::path("throw_now")); + }); +} +#endif diff --git a/src/core/tests/graph_rewrite.cpp b/src/core/tests/graph_rewrite.cpp index 3043e851aaf1d9..20955f5a5d6b1f 100644 --- a/src/core/tests/graph_rewrite.cpp +++ b/src/core/tests/graph_rewrite.cpp @@ -23,7 +23,7 @@ using namespace ov::pass; class TestPass : public ov::pass::MatcherPass { public: - OPENVINO_RTTI("TestPass"); + OPENVINO_MATCHER_PASS_RTTI("TestPass"); TestPass() : MatcherPass() { auto divide = std::make_shared(element::f32, Shape{}, @@ -44,7 +44,7 @@ class TestPass : public ov::pass::MatcherPass { class GatherNodesPass : public ov::pass::MatcherPass { public: - OPENVINO_RTTI("GatherNodesPass"); + 
OPENVINO_MATCHER_PASS_RTTI("GatherNodesPass"); GatherNodesPass(NodeVector& order) : MatcherPass() { ov::matcher_pass_callback callback = [&order](pattern::Matcher& m) { order.push_back(m.get_match_root()); @@ -187,6 +187,7 @@ TEST(GraphRewriteTest, MatcherPassCallbackDerived) { class TypeBasedTestPass : public ov::pass::MatcherPass { public: + OPENVINO_MATCHER_PASS_RTTI("TypeBasedTestPass"); TypeBasedTestPass() : MatcherPass() { auto divide = std::make_shared(std::make_shared(), std::make_shared()); @@ -207,6 +208,7 @@ class TypeBasedTestPass : public ov::pass::MatcherPass { class TypeBasedTestPassDerived : public ov::pass::MatcherPass { public: + OPENVINO_MATCHER_PASS_RTTI("TypeBasedTestPassDerived"); TypeBasedTestPassDerived() : MatcherPass() { auto divide = std::make_shared(std::make_shared(), std::make_shared()); @@ -388,7 +390,7 @@ TEST(PassConfigTest, Test1) { class CheckConsumers : public ov::pass::MatcherPass { public: - OPENVINO_RTTI("CheckConsumers"); + OPENVINO_MATCHER_PASS_RTTI("CheckConsumers"); CheckConsumers() { ov::matcher_pass_callback callback = [](pattern::Matcher& m) -> bool { auto node = m.get_match_root(); diff --git a/src/core/tests/matcher_pass.cpp b/src/core/tests/matcher_pass.cpp index b845f496461193..ec9e5efbcdf9e8 100644 --- a/src/core/tests/matcher_pass.cpp +++ b/src/core/tests/matcher_pass.cpp @@ -21,6 +21,7 @@ using namespace std; class TestMatcherPass : public ov::pass::MatcherPass { public: + OPENVINO_MATCHER_PASS_RTTI("TestMatcherPass"); TestMatcherPass() { auto m_relu1 = ov::pass::pattern::wrap_type(pattern::consumers_count(1)); auto m_relu2 = ov::pass::pattern::wrap_type({m_relu1}); diff --git a/src/core/tests/pass/serialization/deterministicity.cpp b/src/core/tests/pass/serialization/deterministicity.cpp index 8441da501eb9bf..a93f092889d2a1 100644 --- a/src/core/tests/pass/serialization/deterministicity.cpp +++ b/src/core/tests/pass/serialization/deterministicity.cpp @@ -296,6 +296,47 @@ TEST_P(SerializationDeterministicityInputOutputTest, FromIrModel) { EXPECT_TRUE(files_equal(xml_2, xml_1)); } +#ifdef OPENVINO_CPP_VER_17 +TEST_P(SerializationDeterministicityInputOutputTest, FromOvModelBybPath) { + auto irVersion = GetParam(); + + std::shared_ptr modelRef; + { + auto parameter0 = std::make_shared(ov::element::f32, ov::Shape{1, 3, 22, 22}); + parameter0->set_friendly_name("input0"); + auto result0 = std::make_shared(parameter0); + result0->set_friendly_name("output0"); + auto parameter1 = std::make_shared(ov::element::f32, ov::Shape{1, 3, 22, 22}); + parameter1->set_friendly_name("input1"); + auto result1 = std::make_shared(parameter1); + result1->set_friendly_name("output1"); + modelRef = + std::make_shared(ov::NodeVector{result0, result1}, ov::ParameterVector{parameter0, parameter1}); + } + + auto& expected1 = modelRef; + const auto out_xml_path = std::filesystem::path(m_out_xml_path_1); + const auto out_bin_path = std::filesystem::path(m_out_bin_path_1); + ov::pass::Serialize(out_xml_path, out_bin_path, irVersion).run_on_model(modelRef); + auto expected2 = ov::test::readModel(m_out_xml_path_1, m_out_bin_path_1); + + ov::pass::Serialize(m_out_xml_path_2, m_out_bin_path_2, irVersion).run_on_model(expected2); + + EXPECT_EQ(input0Name, expected1->input(0).get_node()->get_friendly_name()); + EXPECT_EQ(input1Name, expected1->input(1).get_node()->get_friendly_name()); + EXPECT_EQ(output0Name, expected1->output(0).get_node()->get_friendly_name()); + EXPECT_EQ(output1Name, expected1->output(1).get_node()->get_friendly_name()); + EXPECT_EQ(input0Name, 
expected2->input(0).get_node()->get_friendly_name()); + EXPECT_EQ(input1Name, expected2->input(1).get_node()->get_friendly_name()); + EXPECT_EQ(output0Name, expected2->output(0).get_node()->get_friendly_name()); + EXPECT_EQ(output1Name, expected2->output(1).get_node()->get_friendly_name()); + + std::ifstream xml_1(m_out_xml_path_1, std::ios::in | std::ios::binary); + std::ifstream xml_2(m_out_xml_path_2, std::ios::in | std::ios::binary); + EXPECT_TRUE(files_equal(xml_1, xml_2)); +} +#endif + INSTANTIATE_TEST_SUITE_P(DeterministicityInputOutput, SerializationDeterministicityInputOutputTest, ::testing::Values(ov::pass::Serialize::Version::IR_V10, ov::pass::Serialize::Version::IR_V11)); diff --git a/src/core/tests/pass/serialization/serialize.cpp b/src/core/tests/pass/serialization/serialize.cpp index e45d5d1d1434ff..5cb1965feebdd7 100644 --- a/src/core/tests/pass/serialization/serialize.cpp +++ b/src/core/tests/pass/serialization/serialize.cpp @@ -74,6 +74,23 @@ TEST_P(SerializationTest, SaveModel) { }); } +#ifdef OPENVINO_CPP_VER_17 +TEST_P(SerializationTest, CompareFunctionsByPath) { + const auto out_xml_path = std::filesystem::path(m_out_xml_path); + const auto out_bin_path = std::filesystem::path(m_out_bin_path); + CompareSerialized([&out_xml_path, &out_bin_path](const auto& m) { + ov::pass::Serialize(out_xml_path, out_bin_path).run_on_model(m); + }); +} + +TEST_P(SerializationTest, SaveModelByPath) { + const auto out_xml_path = std::filesystem::path(m_out_xml_path); + CompareSerialized([&out_xml_path](const auto& m) { + ov::save_model(m, out_xml_path, false); + }); +} +#endif + INSTANTIATE_TEST_SUITE_P( IRSerialization, SerializationTest, diff --git a/src/core/tests/pass_config.cpp b/src/core/tests/pass_config.cpp index 566534d4f46ce4..15ebc71eef10a6 100644 --- a/src/core/tests/pass_config.cpp +++ b/src/core/tests/pass_config.cpp @@ -19,7 +19,7 @@ using namespace ov::pass; class RenameReLU : public ov::pass::MatcherPass { public: - OPENVINO_RTTI("RanameReLU"); + OPENVINO_MATCHER_PASS_RTTI("RenameReLU"); RenameReLU() : MatcherPass() { auto relu = ov::pass::pattern::wrap_type(); ov::matcher_pass_callback callback = [](pattern::Matcher& m) { @@ -35,7 +35,7 @@ class RenameReLU : public ov::pass::MatcherPass { class RenameSigmoid : public ov::pass::MatcherPass { public: - OPENVINO_RTTI("RenameSigmoid"); + OPENVINO_MATCHER_PASS_RTTI("RenameSigmoid"); RenameSigmoid() : MatcherPass() { auto sigmoid = pattern::wrap_type(); ov::matcher_pass_callback callback = [](pattern::Matcher& m) { @@ -259,7 +259,7 @@ TEST(PassConfig, EnableDisablePasses9) { class TestNestedMatcher : public ov::pass::MatcherPass { public: - OPENVINO_RTTI("TestNestedMatcher"); + OPENVINO_MATCHER_PASS_RTTI("TestNestedMatcher"); TestNestedMatcher() : MatcherPass() { auto any_op = pattern::any_input(); ov::matcher_pass_callback callback = [this](pattern::Matcher& m) { diff --git a/src/core/tests/pattern.cpp b/src/core/tests/pattern.cpp index 050c36b65baad1..982e59b55f0f97 100644 --- a/src/core/tests/pattern.cpp +++ b/src/core/tests/pattern.cpp @@ -558,8 +558,8 @@ TEST(pattern, multiple_optionals_in_row) { // Pattern: auto in = wrap_type(); - auto pattern_convert = optional(in); - auto pattern_relu = optional(pattern_convert); + auto pattern_convert = pattern::optional(in); + auto pattern_relu = pattern::optional(pattern_convert); auto pattern_sigmoid = wrap_type({pattern_relu}); // Test: @@ -1255,4 +1255,4 @@ TEST(pattern, pattern_optional_root) { // Should perfectly match ASSERT_TRUE(tm.match(pattern_relu, model_relu)); -} \ No 
newline at end of file +} diff --git a/src/core/tests/preprocess.cpp b/src/core/tests/preprocess.cpp index 0cec67c3031288..99f2789b217b6d 100644 --- a/src/core/tests/preprocess.cpp +++ b/src/core/tests/preprocess.cpp @@ -57,6 +57,12 @@ static std::shared_ptr create_n_inputs(element::Type type, const PartialS return std::make_shared(res, params); } +namespace { +void set_model_as_v10(ov::Model& model) { + model.get_rt_info()["version"] = static_cast(10); +} +} // namespace + TEST(pre_post_process, simple_mean_scale) { auto f = create_simple_function(element::f32, Shape{1, 3, 2, 2}); auto p = PrePostProcessor(f); @@ -1531,7 +1537,7 @@ TEST(pre_post_process, postprocess_convert_element_type_explicit) { auto f = create_simple_function(element::f32, Shape{1, 3, 2, 2}); auto name = f->output().get_node_shared_ptr()->get_friendly_name(); auto name_last_op = f->get_results().front()->get_input_source_output(0).get_node_shared_ptr()->get_friendly_name(); - auto old_names = f->output().get_tensor().get_names(); + auto old_names = std::unordered_set{"tensor_output1"}; auto p = PrePostProcessor(f); p.output().postprocess().convert_element_type(element::u8); @@ -1539,7 +1545,6 @@ TEST(pre_post_process, postprocess_convert_element_type_explicit) { EXPECT_EQ(f->get_results().size(), 1); EXPECT_EQ(f->get_results()[0]->get_element_type(), element::u8); EXPECT_EQ(f->output().get_tensor().get_names(), old_names); - EXPECT_EQ(old_names.count("tensor_output1"), 1); auto ops = f->get_ordered_ops(); auto res_count = std::count_if(ops.begin(), ops.end(), [](const std::shared_ptr& n) { return std::dynamic_pointer_cast(n) != nullptr; @@ -1548,9 +1553,37 @@ TEST(pre_post_process, postprocess_convert_element_type_explicit) { auto names_count = std::count_if(ops.begin(), ops.end(), [](std::shared_ptr n) { return n->output(0).get_tensor().get_names().count("tensor_output1") > 0; }); - EXPECT_EQ(names_count, 2); // last node + result referencing to it + EXPECT_EQ(names_count, 2); // result + node connected to it has same name referencing to it EXPECT_EQ(name, f->output().get_node_shared_ptr()->get_friendly_name()); - EXPECT_EQ(name_last_op, + EXPECT_EQ(name_last_op + ".0", + f->get_results().front()->get_input_source_output(0).get_node_shared_ptr()->get_friendly_name()); +} + +TEST(pre_post_process, trivial_model_convert_element_type_explicit) { + const auto f = create_trivial(element::f32, Shape{1, 3, 2, 2}); + const auto name = f->output().get_node_shared_ptr()->get_friendly_name(); + const auto name_last_op = + f->get_results().front()->get_input_source_output(0).get_node_shared_ptr()->get_friendly_name(); + const auto old_names = std::unordered_set{"tensor_output1"}; + const auto n = f->output().get_tensor().get_names(); + auto p = PrePostProcessor(f); + + p.output().postprocess().convert_element_type(element::u8); + p.build(); + EXPECT_EQ(f->get_results().size(), 1); + EXPECT_EQ(f->get_results()[0]->get_element_type(), element::u8); + EXPECT_THAT(f->output().get_tensor().get_names(), old_names); + const auto ops = f->get_ordered_ops(); + const auto res_count = std::count_if(ops.begin(), ops.end(), [](const std::shared_ptr& n) { + return std::dynamic_pointer_cast(n) != nullptr; + }); + EXPECT_EQ(res_count, 1); + const auto names_count = std::count_if(ops.begin(), ops.end(), [](std::shared_ptr n) { + return n->output(0).get_tensor().get_names().count("tensor_output1") > 0; + }); + EXPECT_EQ(names_count, 2); // result + node connected to it has same name referencing to it + EXPECT_EQ(name, 
f->output().get_node_shared_ptr()->get_friendly_name()); + EXPECT_EQ(name_last_op + ".0", f->get_results().front()->get_input_source_output(0).get_node_shared_ptr()->get_friendly_name()); } @@ -1776,25 +1809,43 @@ TEST(pre_post_process, postprocess_convert_layout_invalid_dims_dyn_shape) { TEST(pre_post_process, postprocess_keep_friendly_names_compatibility) { auto f = create_simple_function(element::f32, Shape{1, 3, 10, 10}); - auto result_fr_name = f->get_results()[0]->get_friendly_name(); - auto node_before_result_old = f->get_results()[0]->get_input_source_output(0).get_node_shared_ptr(); - auto node_name = node_before_result_old->get_friendly_name(); + const auto result_fr_name = f->get_results()[0]->get_friendly_name(); + const auto node_before_result_old = f->get_results()[0]->get_input_source_output(0).get_node_shared_ptr(); + const auto node_name = node_before_result_old->get_friendly_name(); + set_model_as_v10(*f); auto p = PrePostProcessor(f); p.output().postprocess().convert_element_type(element::u8); f = p.build(); EXPECT_EQ(f->get_results()[0]->get_friendly_name(), result_fr_name); - auto node_before_result_new = f->get_results()[0]->get_input_source_output(0).get_node_shared_ptr(); + const auto node_before_result_new = f->get_results()[0]->get_input_source_output(0).get_node_shared_ptr(); // Compatibility check: verify that old name is assigned to new 'output' node EXPECT_EQ(node_before_result_new->get_friendly_name(), node_name); // Compatibility check: Verify that old name is not set for old 'output' node anymore EXPECT_NE(node_before_result_old->get_friendly_name(), node_name); } +TEST(pre_post_process, postprocess_keep_friendly_names) { + auto f = create_simple_function(element::f32, Shape{1, 3, 10, 10}); + auto result_fr_name = f->get_results()[0]->get_friendly_name(); + auto node_before_result_old = f->get_results()[0]->get_input_source_output(0).get_node_shared_ptr(); + auto node_name = node_before_result_old->get_friendly_name(); + auto p = PrePostProcessor(f); + p.output().postprocess().convert_element_type(element::u8); + f = p.build(); + EXPECT_EQ(f->get_results()[0]->get_friendly_name(), result_fr_name); + auto node_before_result_new = f->get_results()[0]->get_input_source_output(0).get_node_shared_ptr(); + // Compatibility check: verify that old name + index is assigned to new 'output' node + EXPECT_EQ(node_before_result_new->get_friendly_name(), node_name + ".0"); + // Compatibility check: Verify that old name is not changed + EXPECT_EQ(node_before_result_old->get_friendly_name(), node_name); +} + TEST(pre_post_process, postprocess_keep_friendly_names_compatibility_implicit) { auto f = create_simple_function(element::f32, Shape{1, 3, 10, 10}); auto result_fr_name = f->get_results()[0]->get_friendly_name(); auto node_before_result_old = f->get_results()[0]->get_input_source_output(0).get_node_shared_ptr(); auto node_name = node_before_result_old->get_friendly_name(); + set_model_as_v10(*f); auto p = PrePostProcessor(f); p.output().model().set_layout("NCHW"); p.output().tensor().set_layout("NHWC"); @@ -1807,6 +1858,21 @@ TEST(pre_post_process, postprocess_keep_friendly_names_compatibility_implicit) { EXPECT_NE(node_before_result_old->get_friendly_name(), node_name); } +TEST(pre_post_process, postprocess_keep_friendly_names_implicit) { + auto f = create_simple_function(element::f32, Shape{1, 3, 10, 10}); + const auto result_fr_name = f->get_results()[0]->get_friendly_name(); + const auto node_before_result_old = 
f->get_results()[0]->get_input_source_output(0).get_node_shared_ptr(); + const auto node_name = node_before_result_old->get_friendly_name(); + auto p = PrePostProcessor(f); + p.output().model().set_layout("NCHW"); + p.output().postprocess().convert_layout("NHWC"); + f = p.build(); + EXPECT_EQ(f->get_results()[0]->get_friendly_name(), result_fr_name); + const auto node_before_result_new = f->get_results()[0]->get_input_source_output(0).get_node_shared_ptr(); + EXPECT_EQ(node_before_result_new->get_friendly_name(), node_name + ".0"); + EXPECT_EQ(node_before_result_old->get_friendly_name(), node_name); +} + // --- PostProcess - convert color format --- TEST(pre_post_process, postprocess_convert_color_format_BGR_RGB) { auto f = create_simple_function(element::f32, Shape{5, 30, 20, 3}); @@ -2017,7 +2083,11 @@ TEST(pre_post_process, postprocess_one_node_many_outputs) { results.emplace_back(res); } auto model = std::make_shared(ResultVector{results}, ParameterVector{data1}); - EXPECT_EQ(model->output(0).get_tensor().get_names().count("tensor_Split0"), 1); + // Set a tensor name on model output 0 + model->output(0).set_names({"output_split0"}); + EXPECT_EQ(model->output(0).get_tensor().get_names().count("output_split0"), 1); + // Result input still has the tensor_Split0 name coming from the Split op + EXPECT_EQ(model->output(0).get_node()->get_input_tensor(0).get_names().count("tensor_Split0"), 1); + EXPECT_EQ(model->output(1).get_tensor().get_names().count("tensor_Split1"), 1); EXPECT_EQ(model->output(2).get_tensor().get_names().count("tensor_Split2"), 1); @@ -2026,9 +2096,12 @@ TEST(pre_post_process, postprocess_one_node_many_outputs) { p.output(2).tensor().set_element_type(element::f32); model = p.build(); EXPECT_EQ(model->get_results().size(), 3); - EXPECT_EQ(model->output(0).get_tensor().get_names().count("tensor_Split0"), 1); + // The original tensor name is lost on output 0 because the originally named tensor now sits before the inserted convert op + // The new result has a different precision, which means it is a different tensor.
+ EXPECT_EQ(model->output(0).get_tensor().get_names().count("tensor_Split0"), 0); + EXPECT_EQ(model->output(0).get_tensor().get_names().count("output_split0"), 1); EXPECT_EQ(model->output(1).get_tensor().get_names().count("tensor_Split1"), 1); - EXPECT_EQ(model->output(2).get_tensor().get_names().count("tensor_Split2"), 1); + EXPECT_EQ(model->output(2).get_tensor().get_names().count("tensor_Split2"), 0); EXPECT_EQ(model->get_results()[0]->input(0).get_source_output().get_node()->get_friendly_name(), "Split.0"); EXPECT_EQ(model->get_results()[1]->input(0).get_source_output().get_node()->get_friendly_name(), "Split"); EXPECT_EQ(model->get_results()[2]->input(0).get_source_output().get_node()->get_friendly_name(), "Split.2"); diff --git a/src/core/tests/type_prop/result.cpp b/src/core/tests/type_prop/result.cpp index f0c0eecc285004..9776768df052a0 100644 --- a/src/core/tests/type_prop/result.cpp +++ b/src/core/tests/type_prop/result.cpp @@ -7,8 +7,12 @@ #include "common_test_utils/type_prop.hpp" #include "openvino/op/constant.hpp" -using namespace std; -using namespace ov; +namespace ov { +namespace test { + +using ov::op::v0::Parameter; +using std::make_shared; +using testing::UnorderedElementsAre; TEST(type_prop, result) { const auto arg_shape = Shape{1, 2, 3, 4, 5}; @@ -51,3 +55,101 @@ TEST(type_prop, result_layout_invalid) { result->output(0).get_rt_info()[ov::LayoutAttribute::get_type_info_static()] = "NCHW"; // incorrect way ASSERT_THROW(result->get_layout(), ov::Exception); } + +using TypePropResultV0Test = TypePropOpTest; + +TEST_F(TypePropResultV0Test, set_specific_output_name_by_output) { + auto a = std::make_shared(element::f32, PartialShape::dynamic()); + a->get_output_tensor(0).set_names({"input"}); + + auto result = make_op(a); + + EXPECT_THAT(result->output(0).get_names(), UnorderedElementsAre("input")); + EXPECT_THAT(result->get_output_tensor(0).get_names(), UnorderedElementsAre("input")); + + result->output(0).set_names({"out"}); + EXPECT_THAT(result->output(0).get_names(), UnorderedElementsAre("out")); + EXPECT_THAT(result->get_output_tensor(0).get_names(), UnorderedElementsAre("out")); + EXPECT_THAT(a->output(0).get_names(), UnorderedElementsAre("input", "out")); + EXPECT_THAT(a->get_output_tensor(0).get_names(), UnorderedElementsAre("input", "out")); +} + +TEST_F(TypePropResultV0Test, set_specific_output_name_by_tensor_desc) { + auto a = std::make_shared(element::f32, PartialShape::dynamic()); + a->get_output_tensor(0).set_names({"input"}); + + auto result = make_op(a); + + EXPECT_THAT(result->get_output_tensor(0).get_names(), UnorderedElementsAre("input")); + + result->get_output_tensor(0).set_names({"out"}); + EXPECT_THAT(result->output(0).get_names(), UnorderedElementsAre("out")); + EXPECT_THAT(result->get_output_tensor(0).get_names(), UnorderedElementsAre("out")); + EXPECT_THAT(a->output(0).get_names(), UnorderedElementsAre("input", "out")); + EXPECT_THAT(a->get_output_tensor(0).get_names(), UnorderedElementsAre("input", "out")); +} + +TEST_F(TypePropResultV0Test, change_specific_output_name) { + auto a = std::make_shared(element::f32, PartialShape::dynamic()); + a->get_output_tensor(0).set_names({"input"}); + + auto result = make_op(a); + + EXPECT_THAT(result->output(0).get_names(), UnorderedElementsAre("input")); + + result->get_output_tensor(0).set_names({"out"}); + + EXPECT_THAT(result->output(0).get_names(), UnorderedElementsAre("out")); + EXPECT_THAT(result->get_output_tensor(0).get_names(), UnorderedElementsAre("out")); + EXPECT_THAT(a->output(0).get_names(), 
UnorderedElementsAre("input", "out")); + EXPECT_THAT(a->get_output_tensor(0).get_names(), UnorderedElementsAre("input", "out")); + + result->output(0).set_names({"new output"}); + + EXPECT_THAT(result->output(0).get_names(), UnorderedElementsAre("new output")); + EXPECT_THAT(result->get_output_tensor(0).get_names(), UnorderedElementsAre("new output")); + EXPECT_THAT(a->output(0).get_names(), UnorderedElementsAre("input", "new output")); + EXPECT_THAT(a->get_output_tensor(0).get_names(), UnorderedElementsAre("input", "new output")); +} + +TEST_F(TypePropResultV0Test, add_specific_output_name) { + auto a = std::make_shared(element::f32, PartialShape::dynamic()); + a->get_output_tensor(0).set_names({"input"}); + + auto result = make_op(a); + + EXPECT_THAT(result->output(0).get_names(), UnorderedElementsAre("input")); + + result->output(0).set_names({"out"}); + result->get_output_tensor(0).add_names({"extra output name", "o1"}); + result->output(0).add_names({"extra output name", "o2"}); + + EXPECT_THAT(result->output(0).get_names(), UnorderedElementsAre("out", "extra output name", "o1", "o2")); + EXPECT_THAT(result->get_output_tensor(0).get_names(), UnorderedElementsAre("out", "extra output name", "o1", "o2")); + EXPECT_THAT(a->output(0).get_names(), UnorderedElementsAre("input", "out", "extra output name", "o1", "o2")); + EXPECT_THAT(a->get_output_tensor(0).get_names(), + UnorderedElementsAre("input", "out", "extra output name", "o1", "o2")); +} + +TEST_F(TypePropResultV0Test, preserve_specific_name_on_input_replace) { + const auto a = std::make_shared(element::f32, PartialShape::dynamic()); + a->get_output_tensor(0).set_names({"input a"}); + + const auto result = make_op(a); + result->output(0).set_names({"out"}); + + EXPECT_THAT(result->input(0).get_tensor().get_names(), UnorderedElementsAre("out", "input a")); + EXPECT_THAT(result->output(0).get_names(), UnorderedElementsAre("out")); + + const auto b = std::make_shared(element::f32, PartialShape::dynamic()); + b->get_output_tensor(0).set_names({"input b"}); + + result->input(0).replace_source_output(b); + result->validate_and_infer_types(); + + EXPECT_THAT(result->input(0).get_tensor().get_names(), UnorderedElementsAre("input b", "out")); + EXPECT_THAT(result->output(0).get_names(), UnorderedElementsAre("out")); + EXPECT_THAT(a->output(0).get_names(), UnorderedElementsAre("input a")); +} +} // namespace test +} // namespace ov diff --git a/src/frontends/common/include/openvino/frontend/frontend.hpp b/src/frontends/common/include/openvino/frontend/frontend.hpp index 0035382fe20c5f..bc944c17dbc0dd 100644 --- a/src/frontends/common/include/openvino/frontend/frontend.hpp +++ b/src/frontends/common/include/openvino/frontend/frontend.hpp @@ -15,6 +15,10 @@ #include "openvino/frontend/input_model.hpp" #include "openvino/frontend/visibility.hpp" +#ifdef OPENVINO_CPP_VER_17 +# include +#endif + namespace ov { namespace frontend { /// \brief An interface for identifying a frontend for a particular framework. @@ -50,7 +54,12 @@ class FRONTEND_API FrontEnd { /// \return true if model recognized, false - otherwise. template inline bool supported(const Types&... 
vars) const { - return supported_impl({ov::Any(vars)...}); +#ifdef OPENVINO_CPP_VER_17 + if constexpr ((std::is_same_v || ...)) { + return supported_impl({path_as_str_or_forward(vars)...}); + } else +#endif + return supported_impl({ov::Any(vars)...}); } inline bool supported(const ov::AnyVector& vars) const { return supported_impl(vars); @@ -65,7 +74,12 @@ class FRONTEND_API FrontEnd { /// \return Loaded input model. template inline InputModel::Ptr load(const Types&... vars) const { - return load_impl({ov::Any{vars}...}); +#ifdef OPENVINO_CPP_VER_17 + if constexpr ((std::is_same_v || ...)) { + return load_impl({path_as_str_or_forward(vars)...}); + } else +#endif + return load_impl({ov::Any{vars}...}); } inline InputModel::Ptr load(const ov::AnyVector& vars) const { @@ -118,8 +132,16 @@ class FRONTEND_API FrontEnd { /// \brief Registers extension /// \param library_path path to library with ov::Extension + /// \{ void add_extension(const std::string& library_path); +#ifdef OPENVINO_CPP_VER_17 + void add_extension(const std::filesystem::path& library_path) { + add_extension(library_path.string()); + } +#endif + /// \} + #ifdef OPENVINO_ENABLE_UNICODE_PATH_SUPPORT /// \brief Registers extension @@ -162,6 +184,17 @@ class FRONTEND_API FrontEnd { private: static std::shared_ptr create_copy(const std::shared_ptr& ov_model, const std::shared_ptr& shared_object); + +#ifdef OPENVINO_CPP_VER_17 + template + static constexpr auto path_as_str_or_forward(T&& p) { + if constexpr (std::is_same_v>) { + return p.string(); + } else { + return std::forward(p); + } + } +#endif }; template <> diff --git a/src/frontends/common/src/extension/decoder_transformation.cpp b/src/frontends/common/src/extension/decoder_transformation.cpp index 561de1aacd79f9..4533fb89d85651 100644 --- a/src/frontends/common/src/extension/decoder_transformation.cpp +++ b/src/frontends/common/src/extension/decoder_transformation.cpp @@ -25,6 +25,7 @@ class CustomModelPass : public ov::pass::ModelPass { /// \brief Helper class to register user matcher pass initialization as a MatcherPass class CustomMatcherPass : public ov::pass::MatcherPass { public: + OPENVINO_MATCHER_PASS_RTTI("frontend::CustomMatcherPass"); explicit CustomMatcherPass(const std::function& matcher_pass_initializer) { matcher_pass_initializer(this); } diff --git a/src/frontends/ir/src/ir_deserializer.cpp b/src/frontends/ir/src/ir_deserializer.cpp index 7c8b6e9d4b97ab..d7e250f9916302 100644 --- a/src/frontends/ir/src/ir_deserializer.cpp +++ b/src/frontends/ir/src/ir_deserializer.cpp @@ -10,6 +10,7 @@ #include "openvino/core/except.hpp" #include "openvino/core/meta_data.hpp" #include "openvino/core/rt_info/weightless_caching_attributes.hpp" +#include "openvino/core/type.hpp" #include "openvino/core/type/element_type.hpp" #include "openvino/op/constant.hpp" #include "openvino/op/loop.hpp" @@ -831,7 +832,9 @@ std::shared_ptr ov::XmlDeserializer::create_node(const std::vector(inputs[i].get_node_shared_ptr()) && + ov::element::Type_t::undefined == inputs[i].get_element_type()) OPENVINO_THROW(params.type, " layer ", params.name, @@ -947,10 +950,12 @@ std::shared_ptr ov::XmlDeserializer::create_node(const std::vector(pugixml::get_uint64_attr(dn, "size")), - static_cast(pugixml::get_uint64_attr(dn, "offset"))); + static_cast(pugixml::get_uint64_attr(dn, "offset")), + ov::element::Type(pugixml::get_str_attr(dn, "element_type"))); } } diff --git a/src/frontends/ir/tests/frontend_test_mmap.cpp b/src/frontends/ir/tests/frontend_test_mmap.cpp index 6b9ede14fa7d55..a58e3e29ff0a75 
100644 --- a/src/frontends/ir/tests/frontend_test_mmap.cpp +++ b/src/frontends/ir/tests/frontend_test_mmap.cpp @@ -52,6 +52,42 @@ TEST_F(IRFrontendMMapTestsAdvanced, core_enable_mmap_property) { auto model = core.read_model(xmlFileName); auto rss_read = ov::test::utils::getVmRSSInKB(); + if (is_mmap != core.get_property("", ov::enable_mmap)) { + std::cout << "Test failed: core property is not set correctly" << std::endl; + exit(1); + } + + bool is_weights_read = (rss_read - rss_init) > REF_RSS; + if (is_mmap == is_weights_read) { + std::cerr << "Test failed: mmap is " << (is_mmap ? "enabled" : "disabled") << ", but weights are " + << (is_weights_read ? "read" : "not read") << " in RAM" << std::endl; + exit(1); + } + std::cerr << "Test passed" << std::endl; + exit(0); + }; + + for (const auto is_mmap : {true, false}) + // Run test in a separate process to not affect RAM values by previous tests + EXPECT_EXIT(test(is_mmap), ::testing::ExitedWithCode(0), "Test passed"); +} + +TEST_F(IRFrontendMMapTestsAdvanced, core_enable_mmap_property_user_config) { + // Test checks that with enabled `mmap` .bin file + // isn't read into RAM on `read_model` stage. + // Otherwise, with disabled `mmap` .bin file should + // be in RAM + + auto test = [&](const bool& is_mmap) { + auto rss_init = ov::test::utils::getVmRSSInKB(); + auto model = core.read_model(xmlFileName, {}, {{ov::enable_mmap(is_mmap)}}); + auto rss_read = ov::test::utils::getVmRSSInKB(); + + if (true != core.get_property("", ov::enable_mmap)) { + std::cout << "Test failed: core property changed by user configuration" << std::endl; + exit(1); + } + bool is_weights_read = (rss_read - rss_init) > REF_RSS; if (is_mmap == is_weights_read) { std::cerr << "Test failed: mmap is " << (is_mmap ? "enabled" : "disabled") << ", but weights are " diff --git a/src/frontends/jax/src/op/square.cpp b/src/frontends/jax/src/op/square.cpp new file mode 100644 index 00000000000000..268debb7992ba8 --- /dev/null +++ b/src/frontends/jax/src/op/square.cpp @@ -0,0 +1,28 @@ +// Copyright (C) 2018-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "openvino/frontend/jax/node_context.hpp" +#include "openvino/op/constant.hpp" +#include "openvino/op/power.hpp" +#include "openvino/op/squeeze.hpp" +#include "utils.hpp" + +namespace ov { +namespace frontend { +namespace jax { +namespace op { + +using namespace ov::op; + +OutputVector translate_square(const NodeContext& context) { + num_inputs_check(context, 1, 1); + auto x = context.get_input(0); + auto const_two = create_same_type_const_scalar(x, 2); + return {std::make_shared(x, const_two)}; +}; + +} // namespace op +} // namespace jax +} // namespace frontend +} // namespace ov diff --git a/src/frontends/jax/src/op_table.cpp b/src/frontends/jax/src/op_table.cpp index 98f22452c5afab..3ca58745bc1909 100644 --- a/src/frontends/jax/src/op_table.cpp +++ b/src/frontends/jax/src/op_table.cpp @@ -53,6 +53,7 @@ OP_CONVERTER(translate_reduce_window_sum); OP_CONVERTER(translate_reshape); OP_CONVERTER(translate_rsqrt); OP_CONVERTER(translate_slice); +OP_CONVERTER(translate_square); OP_CONVERTER(translate_squeeze); OP_CONVERTER(translate_transpose); @@ -92,6 +93,7 @@ const std::map get_supported_ops_jaxpr() { {"rsqrt", op::translate_rsqrt}, {"reshape", op::translate_reshape}, {"slice", op::translate_slice}, + {"square", op::translate_square}, {"sqrt", op::translate_1to1_match_1_input}, {"squeeze", op::translate_squeeze}, {"stop_gradient", op::skip_node}, diff --git 
a/src/frontends/onnx/frontend/src/core/tensor.hpp b/src/frontends/onnx/frontend/src/core/tensor.hpp index af4d299f9d45e7..a63cdfd1906bb0 100644 --- a/src/frontends/onnx/frontend/src/core/tensor.hpp +++ b/src/frontends/onnx/frontend/src/core/tensor.hpp @@ -313,7 +313,7 @@ class Tensor { } else { buffer = ext_data.load_external_data(m_model_dir); } - return std::vector(buffer->get_ptr(), buffer->get_ptr() + buffer->size()); + return std::vector(buffer->get_ptr(), buffer->get_ptr() + (buffer->size() / sizeof(T))); } const void* get_data_ptr() const { diff --git a/src/frontends/onnx/frontend/src/op/com.microsoft/matmul_integer_to_float.cpp b/src/frontends/onnx/frontend/src/op/com.microsoft/matmul_integer_to_float.cpp new file mode 100644 index 00000000000000..d7f1a69db9450c --- /dev/null +++ b/src/frontends/onnx/frontend/src/op/com.microsoft/matmul_integer_to_float.cpp @@ -0,0 +1,93 @@ +// Copyright (C) 2018-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "core/operator_set.hpp" +#include "exceptions.hpp" +#include "openvino/op/add.hpp" +#include "openvino/op/constant.hpp" +#include "openvino/op/convert.hpp" +#include "openvino/op/matmul.hpp" +#include "openvino/op/multiply.hpp" +#include "openvino/op/subtract.hpp" +#include "utils/common.hpp" + +using namespace ov::op; + +namespace ov { +namespace frontend { +namespace onnx { +namespace com_microsoft { +namespace opset_1 { + +ov::OutputVector matmulintegertofloat(const ov::frontend::onnx::Node& node) { + common::default_op_checks(node, 4); + // Original documentation: + // https://github.com/microsoft/onnxruntime/blob/main/docs/ContribOperators.md#commicrosoftmatmulintegertofloat + const auto inputs = node.get_ov_inputs(); + const auto& a_int = inputs[0]; + const auto& b_int = inputs[1]; + const auto& a_scale = inputs[2]; + const auto& b_scale = inputs[3]; + + ov::Output a_zero_point = + (inputs.size() > 4) + ? inputs[4] + : std::make_shared(a_int.get_element_type(), ov::Shape{}, std::vector{0}); + ov::Output b_zero_point = + (inputs.size() > 5) + ? inputs[5] + : std::make_shared(a_int.get_element_type(), ov::Shape{}, std::vector{0}); + + CHECK_VALID_NODE(node, + a_int.get_element_type() == ov::element::i8 || a_int.get_element_type() == ov::element::u8, + "Unsupported input A type. Expected int8 or uint8, got: ", + a_int.get_element_type()); + + CHECK_VALID_NODE(node, + b_int.get_element_type() == ov::element::i8 || b_int.get_element_type() == ov::element::u8, + "Unsupported input B type. Expected int8 or uint8, got: ", + b_int.get_element_type()); + + const auto a_dequantized = std::make_shared(a_int, a_zero_point); + const auto b_dequantized = std::make_shared(b_int, b_zero_point); + + const auto a_dequantized_converted = + std::make_shared(a_dequantized, a_scale.get_element_type()); + const auto b_dequantized_converted = + std::make_shared(b_dequantized, b_scale.get_element_type()); + + const auto a_scaled = std::make_shared(a_dequantized_converted, a_scale); + const auto b_scaled = std::make_shared(b_dequantized_converted, b_scale); + + const auto matmul_result = std::make_shared(a_scaled, b_scaled); + + if (inputs.size() > 6) { + auto& bias = inputs[6]; + CHECK_VALID_NODE(node, + bias.get_partial_shape().rank().get_length() == 1, + "Bias tensor must be 1D. 
Got shape: ", + bias.get_partial_shape()); + + const auto b_last_dim = b_int.get_partial_shape().get_shape().back(); + const auto bias_dim = bias.get_partial_shape()[0].get_length(); + CHECK_VALID_NODE(node, + static_cast(b_int.get_partial_shape().get_shape().back()) == bias_dim, + "Bias dimension must match the last dimension of B. Expected: ", + b_last_dim, + ", but got: ", + bias_dim); + + return {std::make_shared(matmul_result, bias)}; + } + + return {matmul_result}; +} + +ONNX_OP("MatMulIntegerToFloat", OPSET_SINCE(1), com_microsoft::opset_1::matmulintegertofloat, MICROSOFT_DOMAIN); + +} // namespace opset_1 +} // namespace com_microsoft +} // namespace onnx +} // namespace frontend +} // namespace ov \ No newline at end of file diff --git a/src/frontends/onnx/frontend/src/op/dequantize_linear.cpp b/src/frontends/onnx/frontend/src/op/dequantize_linear.cpp index 47fcc7af60bf61..e3d3f4f1235504 100644 --- a/src/frontends/onnx/frontend/src/op/dequantize_linear.cpp +++ b/src/frontends/onnx/frontend/src/op/dequantize_linear.cpp @@ -231,6 +231,7 @@ ov::OutputVector dequantize_linear(const ov::frontend::onnx::Node& node) { src_x.get_shape()[0] % block_size == 0, "DequantizeLinear doesn't support case when first dimension of X cannot be divided by block_size"); + // For further broadcasting scales and zp - reshape input to a shape [x.shape[0]/block_size, block_size, x.shape[1]] ov::Output broadcastable_x = op::util::reshape( src_x, Shape{static_cast(src_x.get_shape()[0]) / block_size, block_size, src_x.get_shape()[1]}); @@ -240,16 +241,14 @@ ov::OutputVector dequantize_linear(const ov::frontend::onnx::Node& node) { const auto scale_type = scale.get_element_type(); if (inputs.size() > 2) { zp = inputs[2]; + zp = std::make_shared(zp, unsqueezed_axes); if (zp.get_element_type() != scale.get_element_type()) { zp = std::make_shared(zp, scale_type); - disable_constant_folding(zp.get_node_shared_ptr()); } - zp = std::make_shared(zp, unsqueezed_axes); } const auto& x = src_x.get_element_type() == scale_type ? 
broadcastable_x : std::make_shared(broadcastable_x, scale_type); - // For further broadcasting scales and zp - reshape input to a shape [x.shape[0]/block_size, block_size, x.shape[1]] // Adding additional dimension for broadcasting scale = std::make_shared(scale, unsqueezed_axes); diff --git a/src/frontends/onnx/tests/models/com.microsoft/matmul_integer_to_float.prototxt b/src/frontends/onnx/tests/models/com.microsoft/matmul_integer_to_float.prototxt new file mode 100644 index 00000000000000..e633480f2747f3 --- /dev/null +++ b/src/frontends/onnx/tests/models/com.microsoft/matmul_integer_to_float.prototxt @@ -0,0 +1,149 @@ +ir_version: 3 +producer_name: "OpenVINO ONNX Frontend" +producer_version: "" +model_version: 0 +graph { + name: "test_matmul_integer_to_float" + + # Node definition for the MatMulIntegerToFloat operation + node { + input: "a" + input: "b" + input: "a_scale" + input: "b_scale" + input: "a_zero_point" + input: "b_zero_point" + input: "bias" + output: "c" + op_type: "MatMulIntegerToFloat" + domain: "com.microsoft" + } + input { + name: "a" + type { + tensor_type { + elem_type: 3 + shape { + dim { + dim_value: 2 + } + dim { + dim_value: 3 + } + } + } + } + } + + input { + name: "b" + type { + tensor_type { + elem_type: 3 + shape { + dim { + dim_value: 3 + } + dim { + dim_value: 2 + } + } + } + } + } + + input { + name: "a_scale" + type { + tensor_type { + elem_type: 1 + shape { + dim { + dim_value: 3 + } + } + } + } + } + input { + name: "b_scale" + type { + tensor_type { + elem_type: 1 + shape { + dim { + dim_value: 2 + } + } + } + } + } + + # Zero point for A (1D tensor) + input { + name: "a_zero_point" + type { + tensor_type { + elem_type: 3 # Tensor type for T1 (int8 or uint8) + shape { + dim { + dim_value: 3 # Per-column zero point for A (1D tensor) + } + } + } + } + } + + input { + name: "b_zero_point" + type { + tensor_type { + elem_type: 3 # Tensor type for T2 (int8 or uint8) + shape { + dim { + dim_value: 2 + } + } + } + } + } + + input { + name: "bias" + type { + tensor_type { + elem_type: 1 + shape { + dim { + dim_value: 2 + } + } + } + } + } + + output { + name: "c" + type { + tensor_type { + elem_type: 1 + shape { + dim { + dim_value: 2 + } + dim { + dim_value: 2 + } + } + } + } + } +} + +opset_import { + version: 7 +} + +opset_import { + version: 1 +} diff --git a/src/frontends/onnx/tests/models/external_data/external_data_int16.prototxt b/src/frontends/onnx/tests/models/external_data/external_data_int16.prototxt new file mode 100644 index 00000000000000..450e0d8ddc5325 --- /dev/null +++ b/src/frontends/onnx/tests/models/external_data/external_data_int16.prototxt @@ -0,0 +1,75 @@ +ir_version: 3 +producer_name: "OpenVINO ONNX Frontend" +graph { + node { + input: "A" + input: "B" + output: "Y" + name: "add_node1" + op_type: "Add" + } + name: "test_graph" + initializer { + dims: 2 + dims: 2 + data_type: 5 + name: "A" + external_data { + key: "location", + value: "tensors_data/tensor.data" + } + external_data { + key: "length", + value: "8" + } + data_location: 1 + } + input { + name: "A" + type { + tensor_type { + elem_type: 5 + shape { + dim { + dim_value: 2 + } + dim { + dim_value: 2 + } + } + } + } + } + input { + name: "B" + type { + tensor_type { + elem_type: 5 + shape { + dim { + dim_value: 1 + } + } + } + } + } + output { + name: "Y" + type { + tensor_type { + elem_type: 5 + shape { + dim { + dim_value: 2 + } + dim { + dim_value: 2 + } + } + } + } + } +} +opset_import { + version: 4 +} diff --git
a/src/frontends/onnx/tests/models/external_data/external_data_int4.prototxt b/src/frontends/onnx/tests/models/external_data/external_data_int4.prototxt new file mode 100644 index 00000000000000..836d2fab8f0cb4 --- /dev/null +++ b/src/frontends/onnx/tests/models/external_data/external_data_int4.prototxt @@ -0,0 +1,65 @@ +ir_version: 3 +producer_name: "OpenVINO ONNX Frontend" +graph { + node { + input: "A" + output: "Y" + name: "add_node1" + op_type: "Identity" + } + name: "test_graph" + initializer { + dims: 2 + dims: 2 + data_type: 22 + name: "A" + external_data { + key: "location", + value: "tensors_data/tensor.data" + } + external_data { + key: "offset", + value: "2" + } + external_data { + key: "length", + value: "2" + } + data_location: 1 + } + input { + name: "A" + type { + tensor_type { + elem_type: 22 + shape { + dim { + dim_value: 2 + } + dim { + dim_value: 2 + } + } + } + } + } + output { + name: "Y" + type { + tensor_type { + elem_type: 22 + shape { + dim { + dim_value: 2 + } + dim { + dim_value: 2 + } + } + } + } + } +} +opset_import { + version: 21 +} diff --git a/src/frontends/onnx/tests/models/external_data/external_data_int8.prototxt b/src/frontends/onnx/tests/models/external_data/external_data_int8.prototxt new file mode 100644 index 00000000000000..d741a7e8168530 --- /dev/null +++ b/src/frontends/onnx/tests/models/external_data/external_data_int8.prototxt @@ -0,0 +1,75 @@ +ir_version: 3 +producer_name: "OpenVINO ONNX Frontend" +graph { + node { + input: "A" + input: "B" + output: "Y" + name: "add_node1" + op_type: "Add" + } + name: "test_graph" + initializer { + dims: 2 + dims: 2 + data_type: 3 + name: "A" + external_data { + key: "location", + value: "tensors_data/tensor.data" + } + external_data { + key: "length", + value: "4" + } + data_location: 1 + } + input { + name: "A" + type { + tensor_type { + elem_type: 3 + shape { + dim { + dim_value: 2 + } + dim { + dim_value: 2 + } + } + } + } + } + input { + name: "B" + type { + tensor_type { + elem_type: 3 + shape { + dim { + dim_value: 1 + } + } + } + } + } + output { + name: "Y" + type { + tensor_type { + elem_type: 3 + shape { + dim { + dim_value: 2 + } + dim { + dim_value: 2 + } + } + } + } + } +} +opset_import { + version: 4 +} diff --git a/src/frontends/onnx/tests/models/external_data/external_data_uint16.prototxt b/src/frontends/onnx/tests/models/external_data/external_data_uint16.prototxt new file mode 100644 index 00000000000000..8d144ebc054e40 --- /dev/null +++ b/src/frontends/onnx/tests/models/external_data/external_data_uint16.prototxt @@ -0,0 +1,75 @@ +ir_version: 3 +producer_name: "OpenVINO ONNX Frontend" +graph { + node { + input: "A" + input: "B" + output: "Y" + name: "add_node1" + op_type: "Add" + } + name: "test_graph" + initializer { + dims: 2 + dims: 2 + data_type: 4 + name: "A" + external_data { + key: "location", + value: "tensors_data/tensor.data" + } + external_data { + key: "length", + value: "8" + } + data_location: 1 + } + input { + name: "A" + type { + tensor_type { + elem_type: 4 + shape { + dim { + dim_value: 2 + } + dim { + dim_value: 2 + } + } + } + } + } + input { + name: "B" + type { + tensor_type { + elem_type: 4 + shape { + dim { + dim_value: 1 + } + } + } + } + } + output { + name: "Y" + type { + tensor_type { + elem_type: 4 + shape { + dim { + dim_value: 2 + } + dim { + dim_value: 2 + } + } + } + } + } +} +opset_import { + version: 4 +} diff --git a/src/frontends/onnx/tests/models/external_data/external_data_uint4.prototxt 
b/src/frontends/onnx/tests/models/external_data/external_data_uint4.prototxt new file mode 100644 index 00000000000000..df60a23ba29929 --- /dev/null +++ b/src/frontends/onnx/tests/models/external_data/external_data_uint4.prototxt @@ -0,0 +1,65 @@ +ir_version: 3 +producer_name: "OpenVINO ONNX Frontend" +graph { + node { + input: "A" + output: "Y" + name: "add_node1" + op_type: "Identity" + } + name: "test_graph" + initializer { + dims: 2 + dims: 2 + data_type: 21 + name: "A" + external_data { + key: "location", + value: "tensors_data/tensor.data" + } + external_data { + key: "offset", + value: "2" + } + external_data { + key: "length", + value: "2" + } + data_location: 1 + } + input { + name: "A" + type { + tensor_type { + elem_type: 21 + shape { + dim { + dim_value: 2 + } + dim { + dim_value: 2 + } + } + } + } + } + output { + name: "Y" + type { + tensor_type { + elem_type: 21 + shape { + dim { + dim_value: 2 + } + dim { + dim_value: 2 + } + } + } + } + } +} +opset_import { + version: 21 +} diff --git a/src/frontends/onnx/tests/models/external_data/external_data_uint8.prototxt b/src/frontends/onnx/tests/models/external_data/external_data_uint8.prototxt new file mode 100644 index 00000000000000..3f0cfa8406744e --- /dev/null +++ b/src/frontends/onnx/tests/models/external_data/external_data_uint8.prototxt @@ -0,0 +1,75 @@ +ir_version: 3 +producer_name: "OpenVINO ONNX Frontend" +graph { + node { + input: "A" + input: "B" + output: "Y" + name: "add_node1" + op_type: "Add" + } + name: "test_graph" + initializer { + dims: 2 + dims: 2 + data_type: 2 + name: "A" + external_data { + key: "location", + value: "tensors_data/tensor.data" + } + external_data { + key: "length", + value: "4" + } + data_location: 1 + } + input { + name: "A" + type { + tensor_type { + elem_type: 2 + shape { + dim { + dim_value: 2 + } + dim { + dim_value: 2 + } + } + } + } + } + input { + name: "B" + type { + tensor_type { + elem_type: 2 + shape { + dim { + dim_value: 1 + } + } + } + } + } + output { + name: "Y" + type { + tensor_type { + elem_type: 2 + shape { + dim { + dim_value: 2 + } + dim { + dim_value: 2 + } + } + } + } + } +} +opset_import { + version: 4 +} diff --git a/src/frontends/onnx/tests/onnx_import_com_microsoft.in.cpp b/src/frontends/onnx/tests/onnx_import_com_microsoft.in.cpp index ca805f4c194856..333c06e24a8a40 100644 --- a/src/frontends/onnx/tests/onnx_import_com_microsoft.in.cpp +++ b/src/frontends/onnx/tests/onnx_import_com_microsoft.in.cpp @@ -1456,3 +1456,35 @@ OPENVINO_TEST(${BACKEND_NAME}, onnx_com_microsoft_fusedmatmul_2x5x3x6x4_2x6x3x4x test_case.run(); } + +OPENVINO_TEST(${BACKEND_NAME}, onnx_com_microsoft_matmul_integer_to_float) { + const auto model = convert_model("com.microsoft/matmul_integer_to_float.onnx"); + auto test_case = ov::test::TestCase(model, s_device); + + const std::vector data_A{10, 20, 30, 40, 50, 60}; + const std::vector data_B{70, 80, 90, 100, 110, 120}; + + const std::vector a_scale{0.1f, 0.1f, 0.1f}; + const std::vector b_scale{0.1f, 0.1f}; + + const std::vector a_zero_point{1, 0, 2}; + const std::vector b_zero_point{0, 1}; + + const std::vector bias{0.5f, 0.25f}; + + const std::vector expected_output{55.59999847412109f, + 60.48000335693359f, + 136.6000061035156f, + 149.5800170898438f}; + + test_case.add_input(Shape{2, 3}, data_A); + test_case.add_input(Shape{3, 2}, data_B); + test_case.add_input(Shape{3}, a_scale); + test_case.add_input(Shape{2}, b_scale); + test_case.add_input(Shape{3}, a_zero_point); + test_case.add_input(Shape{2}, b_zero_point); + 
test_case.add_input(Shape{2}, bias); + test_case.add_expected_output(Shape{2, 2}, expected_output); + + test_case.run(); +} diff --git a/src/frontends/onnx/tests/onnx_reader_external_data.cpp b/src/frontends/onnx/tests/onnx_reader_external_data.cpp index 103099d08bced9..7d0fbbf6240f55 100644 --- a/src/frontends/onnx/tests/onnx_reader_external_data.cpp +++ b/src/frontends/onnx/tests/onnx_reader_external_data.cpp @@ -255,4 +255,80 @@ TEST_P(OnnxFeMmapFixture, onnx_external_data_in_constant_node) { test_case.run(); } +TEST_P(OnnxFeMmapFixture, onnx_external_data_int16) { + const auto path = test::utils::getModelFromTestModelZoo(string(TEST_ONNX_MODELS_DIRNAME) + + "external_data/external_data_int16.onnx"); + Core core; + core.set_property(enable_mmap(GetParam())); + const auto model = core.read_model(path); + auto test_case = test::TestCase(model); + test_case.add_input({-100}); + test_case.add_expected_output(Shape{2, 2}, {-100, 16156, -100, 16284}); + + test_case.run(); +} + +TEST_P(OnnxFeMmapFixture, onnx_external_data_uint16) { + const auto path = test::utils::getModelFromTestModelZoo(string(TEST_ONNX_MODELS_DIRNAME) + + "external_data/external_data_uint16.onnx"); + Core core; + core.set_property(enable_mmap(GetParam())); + const auto model = core.read_model(path); + auto test_case = test::TestCase(model); + test_case.add_input({100}); + test_case.add_expected_output(Shape{2, 2}, {100, 16356, 100, 16484}); + + test_case.run(); +} + +TEST_P(OnnxFeMmapFixture, onnx_external_data_int8) { + const auto path = test::utils::getModelFromTestModelZoo(string(TEST_ONNX_MODELS_DIRNAME) + + "external_data/external_data_int8.onnx"); + Core core; + core.set_property(enable_mmap(GetParam())); + const auto model = core.read_model(path); + auto test_case = test::TestCase(model); + test_case.add_input({-100}); + test_case.add_expected_output(Shape{2, 2}, {-100, 106, -100, -37}); + + test_case.run(); +} + +TEST_P(OnnxFeMmapFixture, onnx_external_data_uint8) { + const auto path = test::utils::getModelFromTestModelZoo(string(TEST_ONNX_MODELS_DIRNAME) + + "external_data/external_data_uint8.onnx"); + Core core; + core.set_property(enable_mmap(GetParam())); + const auto model = core.read_model(path); + auto test_case = test::TestCase(model); + test_case.add_input({100}); + test_case.add_expected_output(Shape{2, 2}, {100, 100, 228, 163}); + + test_case.run(); +} + +TEST_P(OnnxFeMmapFixture, onnx_external_data_int4) { + const auto path = test::utils::getModelFromTestModelZoo(string(TEST_ONNX_MODELS_DIRNAME) + + "external_data/external_data_int4.onnx"); + Core core; + core.set_property(enable_mmap(GetParam())); + const auto model = core.read_model(path); + auto test_case = test::TestCase(model); + test_case.add_expected_output(Shape{2, 2}, {static_cast(0x80), 0x3f}); + + test_case.run(); +} + +TEST_P(OnnxFeMmapFixture, onnx_external_data_uint4) { + const auto path = test::utils::getModelFromTestModelZoo(string(TEST_ONNX_MODELS_DIRNAME) + + "external_data/external_data_uint4.onnx"); + Core core; + core.set_property(enable_mmap(GetParam())); + const auto model = core.read_model(path); + auto test_case = test::TestCase(model); + test_case.add_expected_output(Shape{2, 2}, {0x80, 0x3f}); + + test_case.run(); +} + INSTANTIATE_TEST_SUITE_P(OnnxFeMMapReadModel, OnnxFeMmapFixture, ::testing::Bool()); diff --git a/src/frontends/paddle/src/internal/pass/transform_fakequantize.hpp b/src/frontends/paddle/src/internal/pass/transform_fakequantize.hpp index 6d45edd8ea818a..19abfcbf260d73 100644 --- 
a/src/frontends/paddle/src/internal/pass/transform_fakequantize.hpp +++ b/src/frontends/paddle/src/internal/pass/transform_fakequantize.hpp @@ -14,7 +14,7 @@ namespace pass { class TransformFakeQuantize : public ov::pass::MatcherPass { public: - OPENVINO_RTTI("ov::frontend::paddle::pass::TransformFakeQuantize"); + OPENVINO_MATCHER_PASS_RTTI("ov::frontend::paddle::pass::TransformFakeQuantize"); TransformFakeQuantize(); private: @@ -23,4 +23,4 @@ class TransformFakeQuantize : public ov::pass::MatcherPass { } // namespace pass } // namespace paddle } // namespace frontend -} // namespace ov \ No newline at end of file +} // namespace ov diff --git a/src/frontends/paddle/src/internal/pass/transform_if.hpp b/src/frontends/paddle/src/internal/pass/transform_if.hpp index 98c66800d6fea6..f71c2b026fd3e4 100644 --- a/src/frontends/paddle/src/internal/pass/transform_if.hpp +++ b/src/frontends/paddle/src/internal/pass/transform_if.hpp @@ -14,7 +14,7 @@ namespace pass { class TransformIf : public ov::pass::MatcherPass { public: - OPENVINO_RTTI("ov::frontend::paddle::pass::TransformIf"); + OPENVINO_MATCHER_PASS_RTTI("ov::frontend::paddle::pass::TransformIf"); TransformIf(std::vector> functions); private: diff --git a/src/frontends/paddle/src/internal/pass/transform_tensorarray.hpp b/src/frontends/paddle/src/internal/pass/transform_tensorarray.hpp index c992bf1eefa4c9..227ce708ad6503 100644 --- a/src/frontends/paddle/src/internal/pass/transform_tensorarray.hpp +++ b/src/frontends/paddle/src/internal/pass/transform_tensorarray.hpp @@ -14,7 +14,7 @@ namespace pass { class TransformTensorArray : public ov::pass::MatcherPass { public: - OPENVINO_RTTI("ov::frontend::paddle::pass::TransformTensorArray"); + OPENVINO_MATCHER_PASS_RTTI("ov::frontend::paddle::pass::TransformTensorArray"); TransformTensorArray(std::vector> functions); private: diff --git a/src/frontends/paddle/src/internal/pass/transform_while.hpp b/src/frontends/paddle/src/internal/pass/transform_while.hpp index de6f381222a554..9a604f520168fe 100644 --- a/src/frontends/paddle/src/internal/pass/transform_while.hpp +++ b/src/frontends/paddle/src/internal/pass/transform_while.hpp @@ -14,7 +14,7 @@ namespace pass { class TransformWhile : public ov::pass::MatcherPass { public: - OPENVINO_RTTI("ov::frontend::paddle::pass::TransformWhile"); + OPENVINO_MATCHER_PASS_RTTI("ov::frontend::paddle::pass::TransformWhile"); TransformWhile(std::vector> functions); private: diff --git a/src/frontends/pytorch/src/op/linear.cpp b/src/frontends/pytorch/src/op/linear.cpp index 2d01dee84c151b..4a5ad4a6b0e73b 100644 --- a/src/frontends/pytorch/src/op/linear.cpp +++ b/src/frontends/pytorch/src/op/linear.cpp @@ -5,6 +5,10 @@ #include "openvino/frontend/pytorch/node_context.hpp" #include "openvino/op/add.hpp" #include "openvino/op/matmul.hpp" +#include "openvino/op/multiply.hpp" +#include "openvino/op/reshape.hpp" +#include "openvino/op/shape_of.hpp" +#include "openvino/op/subtract.hpp" #include "utils.hpp" namespace ov { @@ -12,6 +16,8 @@ namespace frontend { namespace pytorch { namespace op { +using namespace ov::op; + OutputVector translate_linear(const NodeContext& context) { // schema: aten::linear(Tensor input, Tensor weight, Tensor? bias=None) -> Tensor num_inputs_check(context, 2, 3); @@ -20,17 +26,91 @@ OutputVector translate_linear(const NodeContext& context) { if (weight.get_element_type() == element::f16 || weight.get_element_type() == element::bf16) { // In case of patched linear it can have mixed fp16/bf16 and fp32 input type. 
// In other cases these conversion is not required. - weight = context.mark_node(std::make_shared(weight, x)); + weight = context.mark_node(std::make_shared(weight, x)); } - auto matmul = context.mark_node(std::make_shared(x, weight, false, true)); + auto matmul = context.mark_node(std::make_shared(x, weight, false, true)); if (!context.input_is_none(2)) { auto bias = context.get_input(2); if (bias.get_element_type() == element::f16 || bias.get_element_type() == element::bf16) { // Same reason as for weight. - bias = context.mark_node(std::make_shared(bias, x)); + bias = context.mark_node(std::make_shared(bias, x)); + } + matmul = context.mark_node(std::make_shared(matmul, bias)); + } + return {matmul}; +}; + +namespace { +uint32_t rearrange_awq_bits(uint32_t num) { + uint32_t result = 0; + uint32_t mask = 0xF; + + // Rearrange each 4-bit part in accordance with the AWQ i32->u4 unpacking schema + result |= (num & (mask << 0)) << 0; + result |= (num & (mask << 16)) >> 12; + result |= (num & (mask << 4)) << 4; + result |= (num & (mask << 20)) >> 8; + result |= (num & (mask << 8)) << 8; + result |= (num & (mask << 24)) >> 4; + result |= (num & (mask << 12)) << 12; + result |= (num & (mask << 28)) >> 0; + + return result; +} + +Output rearrange_constant(const Output& c, uint32_t groups) { + auto constant = std::dynamic_pointer_cast(c.get_node_shared_ptr()); + FRONT_END_OP_CONVERSION_CHECK(constant, "weight must be Constant."); + auto src = constant->get_data_ptr(); + auto initial_shape = constant->get_shape(); + FRONT_END_OP_CONVERSION_CHECK(initial_shape.size() == 2, "Only 2D constants are supported."); + auto new_shape = Shape{initial_shape[0] / groups, groups, initial_shape[1] * 8}; + auto new_qweight = std::make_shared(element::u4, new_shape); + auto dst = const_cast(reinterpret_cast(new_qweight->get_data_ptr())); + for (size_t i = 0; i < shape_size(constant->get_shape()); i++) { + dst[i] = rearrange_awq_bits(src[i]); + } + return new_qweight; +} +} // namespace + +OutputVector translate_linear_awq(const NodeContext& context) { + num_inputs_check(context, 4, 7); + auto x = context.get_input(0); + auto qweight = context.get_input(1); + auto qzeros = context.get_input(2); + auto scales = context.get_input(3); + auto groups = context.const_input(4); + auto bits = context.const_input(5); + + FRONT_END_OP_CONVERSION_CHECK(bits == 4, "Only 4 bit AWQ is supported."); + + auto new_qweight = rearrange_constant(qweight, static_cast(groups)); + auto new_qzeros = rearrange_constant(qzeros, 1); + new_qweight = context.mark_node(std::make_shared(new_qweight, scales.get_element_type())); + new_qzeros = context.mark_node(std::make_shared(new_qzeros, scales.get_element_type())); + + auto w_s = context.mark_node(std::make_shared(new_qweight, new_qzeros)); + FRONT_END_OP_CONVERSION_CHECK(scales.get_partial_shape().is_static(), "Scales must be constant."); + auto scales_shape = scales.get_shape(); + auto new_scales_shape = + v0::Constant::create(element::i32, {3}, std::vector{scales_shape[0], 1, scales_shape[1]}); + scales = context.mark_node(std::make_shared(scales, new_scales_shape, false)); + auto weight = context.mark_node(std::make_shared(w_s, scales)); + auto out_shape = + v0::Constant::create(element::i32, {2}, std::vector{static_cast(qweight.get_shape()[0]), -1}); + weight = context.mark_node(std::make_shared(weight, out_shape, false)); + weight = context.mark_node(std::make_shared(weight, x)); + + auto matmul = context.mark_node(std::make_shared(x, weight, false, false)); + if 
(!context.input_is_none(6)) { + auto bias = context.get_input(6); + + if (bias.get_element_type() == element::f16 || bias.get_element_type() == element::bf16) { + bias = context.mark_node(std::make_shared(bias, x)); } - matmul = context.mark_node(std::make_shared(matmul, bias)); + matmul = context.mark_node(std::make_shared(matmul, bias)); } return {matmul}; }; diff --git a/src/frontends/pytorch/src/op/stft.cpp b/src/frontends/pytorch/src/op/stft.cpp index b7e4858c2f8fcc..d1fe4f9f15828b 100644 --- a/src/frontends/pytorch/src/op/stft.cpp +++ b/src/frontends/pytorch/src/op/stft.cpp @@ -10,6 +10,7 @@ #include "openvino/op/convert_like.hpp" #include "openvino/op/divide.hpp" #include "openvino/op/shape_of.hpp" +#include "openvino/op/sqrt.hpp" #include "openvino/op/unsqueeze.hpp" #include "utils.hpp" @@ -66,8 +67,6 @@ OutputVector translate_stft(const NodeContext& context) { if (!context.input_is_none(5)) { normalized = context.const_input(5); } - PYTORCH_OP_CONVERSION_CHECK(!normalized, - "aten::stft conversion is currently supported with normalized=False only."); bool onesided = true; if (!context.input_is_none(6)) { @@ -85,7 +84,15 @@ OutputVector translate_stft(const NodeContext& context) { // Perform STFT constexpr bool transpose_frames = true; auto stft = context.mark_node(std::make_shared(input, window, n_fft, hop_length, transpose_frames)); - return {stft}; + + if (normalized) { + const auto nfft_convert = context.mark_node(std::make_shared(n_fft, stft)); + const auto divisor = context.mark_node(std::make_shared(nfft_convert)); + const auto norm_stft = context.mark_node(std::make_shared(stft, divisor)); + return {norm_stft}; + } else { + return {stft}; + } }; } // namespace op } // namespace pytorch diff --git a/src/frontends/pytorch/src/op_table.cpp b/src/frontends/pytorch/src/op_table.cpp index 7307833430411f..a73c13814d7663 100644 --- a/src/frontends/pytorch/src/op_table.cpp +++ b/src/frontends/pytorch/src/op_table.cpp @@ -61,7 +61,6 @@ OP_CONVERTER(translate_clamp); OP_CONVERTER(translate_col2im); OP_CONVERTER(translate_constant); OP_CONVERTER(translate_conv_transposend); -OP_CONVERTER(translate_conv1d_ext); OP_CONVERTER(translate_convnd); OP_CONVERTER(translate_convolution); OP_CONVERTER(translate_convolution_mode); @@ -77,7 +76,6 @@ OP_CONVERTER(translate_dot); OP_CONVERTER(translate_elu); OP_CONVERTER(translate_embedding); OP_CONVERTER(translate_embedding_bag); -OP_CONVERTER(translate_embedding_ext); OP_CONVERTER(translate_empty); OP_CONVERTER(translate_empty_like); OP_CONVERTER(translate_erf); @@ -325,6 +323,10 @@ OP_CONVERTER(translate_unbind_int_fx); OP_CONVERTER(translate_unique2); OP_CONVERTER(translate_zeros_fx); OP_CONVERTER(translate_zeros_like_fx); +// Extensions +OP_CONVERTER(translate_conv1d_ext); +OP_CONVERTER(translate_embedding_ext); +OP_CONVERTER(translate_linear_awq); } // namespace op @@ -699,6 +701,7 @@ const std::unordered_map get_supported_ops_ts() { {"aten::zero", op::translate_zeros_like}, {"aten::zeros", op::translate_zeros}, {"aten::zeros_like", op::translate_zeros_like}, + {"ov_ext::awq_gemm", op::translate_linear_awq}, {"ov_ext::embedding", op::translate_embedding_ext}, {"ov_ext::conv1d", op::translate_conv1d_ext}, {"ov_ext::linear", op::translate_linear}, @@ -864,7 +867,6 @@ const std::unordered_map get_supported_ops_fx() { {"aten.hardtanh.default", op::translate_hardtanh}, {"aten.hardtanh_.default", op::inplace_op}, {"aten.index.Tensor", op::translate_index_fx}, - {"aten._unsafe_index.Tensor", op::translate_index_fx}, {"aten.index_select.default", 
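For reference, a minimal standalone sketch of the nibble reordering implemented by rearrange_awq_bits in linear.cpp above. The shift table matches the diff; the sample word, the expected constant, and the main() harness are illustrative only and not part of the PR.

// Illustrative check of the AWQ i32 -> u4 nibble reordering shown above.
// Input nibble i (lowest bits first) carries the value i, so the output makes
// the permutation directly visible.
#include <cassert>
#include <cstdint>
#include <cstdio>

static uint32_t rearrange_awq_bits(uint32_t num) {
    uint32_t result = 0;
    const uint32_t mask = 0xF;
    result |= (num & (mask << 0)) << 0;
    result |= (num & (mask << 16)) >> 12;
    result |= (num & (mask << 4)) << 4;
    result |= (num & (mask << 20)) >> 8;
    result |= (num & (mask << 8)) << 8;
    result |= (num & (mask << 24)) >> 4;
    result |= (num & (mask << 12)) << 12;
    result |= (num & (mask << 28)) >> 0;
    return result;
}

int main() {
    const uint32_t packed = 0x76543210u;              // nibbles 7..0 hold values 7..0
    const uint32_t repacked = rearrange_awq_bits(packed);
    assert(repacked == 0x73625140u);                  // low-to-high order becomes 0,4,1,5,2,6,3,7
    std::printf("0x%08X -> 0x%08X\n", packed, repacked);
    return 0;
}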
op::translate_index_select}, {"aten.isfinite.default", op::translate_1to1_match_1_inputs}, {"aten.isinf.default", op::translate_1to1_match_1_inputs}, diff --git a/src/frontends/pytorch/src/transforms/append_list_unpack_replacer.hpp b/src/frontends/pytorch/src/transforms/append_list_unpack_replacer.hpp index d3ecd8b28fc636..84b28c8c7e21d3 100644 --- a/src/frontends/pytorch/src/transforms/append_list_unpack_replacer.hpp +++ b/src/frontends/pytorch/src/transforms/append_list_unpack_replacer.hpp @@ -14,7 +14,7 @@ namespace pass { class AppendListUnpackReplacer : public ov::pass::MatcherPass { public: - OPENVINO_RTTI("ov::frontend::pytorch::pass::AppendListUnpackReplacer"); + OPENVINO_MATCHER_PASS_RTTI("ov::frontend::pytorch::pass::AppendListUnpackReplacer"); AppendListUnpackReplacer(); }; diff --git a/src/frontends/pytorch/src/transforms/aten_cat_replacer.hpp b/src/frontends/pytorch/src/transforms/aten_cat_replacer.hpp index 8041e282445353..ef2d06da848132 100644 --- a/src/frontends/pytorch/src/transforms/aten_cat_replacer.hpp +++ b/src/frontends/pytorch/src/transforms/aten_cat_replacer.hpp @@ -15,11 +15,11 @@ namespace pass { // This transformation replaces pattern prim::ListConstruct->aten::append{none or many}->aten::cat class AtenCatToConcat : public ov::pass::MatcherPass { public: - OPENVINO_RTTI("ov::frontend::pytorch::pass::AtenCatToConcat"); + OPENVINO_MATCHER_PASS_RTTI("ov::frontend::pytorch::pass::AtenCatToConcat"); AtenCatToConcat(); }; } // namespace pass } // namespace pytorch } // namespace frontend -} // namespace ov \ No newline at end of file +} // namespace ov diff --git a/src/frontends/pytorch/src/transforms/aten_getitem_replacer.hpp b/src/frontends/pytorch/src/transforms/aten_getitem_replacer.hpp index db99e2d65b2ef1..3d6de2c76b2c83 100644 --- a/src/frontends/pytorch/src/transforms/aten_getitem_replacer.hpp +++ b/src/frontends/pytorch/src/transforms/aten_getitem_replacer.hpp @@ -14,7 +14,7 @@ namespace pass { class AtenGetItemReplacer : public ov::pass::MatcherPass { public: - OPENVINO_RTTI("ov::frontend::pytorch::pass::AtenGetItemReplacer"); + OPENVINO_MATCHER_PASS_RTTI("ov::frontend::pytorch::pass::AtenGetItemReplacer"); AtenGetItemReplacer(); }; diff --git a/src/frontends/pytorch/src/transforms/aten_index_put_replacer.hpp b/src/frontends/pytorch/src/transforms/aten_index_put_replacer.hpp index e74cf40e057bf2..b641ca3146d0c0 100644 --- a/src/frontends/pytorch/src/transforms/aten_index_put_replacer.hpp +++ b/src/frontends/pytorch/src/transforms/aten_index_put_replacer.hpp @@ -15,7 +15,7 @@ namespace pass { class PYTORCH_API AtenIndexPutReplacer : public ov::pass::MatcherPass { public: - OPENVINO_RTTI("ov::frontend::pytorch::pass::AtenIndexPutReplacer"); + OPENVINO_MATCHER_PASS_RTTI("ov::frontend::pytorch::pass::AtenIndexPutReplacer"); AtenIndexPutReplacer(); }; diff --git a/src/frontends/pytorch/src/transforms/aten_index_replacer.hpp b/src/frontends/pytorch/src/transforms/aten_index_replacer.hpp index b9a034e3a2721f..67afefbef53f57 100644 --- a/src/frontends/pytorch/src/transforms/aten_index_replacer.hpp +++ b/src/frontends/pytorch/src/transforms/aten_index_replacer.hpp @@ -16,7 +16,7 @@ namespace pass { // This transformation replaces pattern prim::ListConstruct->aten::index class PYTORCH_API AtenIndexToSelect : public ov::pass::MatcherPass { public: - OPENVINO_RTTI("ov::frontend::pytorch::pass::AtenIndexToSelect"); + OPENVINO_MATCHER_PASS_RTTI("ov::frontend::pytorch::pass::AtenIndexToSelect"); AtenIndexToSelect(); }; diff --git 
a/src/frontends/pytorch/src/transforms/aten_stack_list_construct_replacer.hpp b/src/frontends/pytorch/src/transforms/aten_stack_list_construct_replacer.hpp index ab7376619c4469..51b9832c2e35ae 100644 --- a/src/frontends/pytorch/src/transforms/aten_stack_list_construct_replacer.hpp +++ b/src/frontends/pytorch/src/transforms/aten_stack_list_construct_replacer.hpp @@ -14,7 +14,7 @@ namespace pass { class AtenStackListConstructReplacer : public ov::pass::MatcherPass { public: - OPENVINO_RTTI("ov::frontend::pytorch::pass::AtenStackListConstructReplacer"); + OPENVINO_MATCHER_PASS_RTTI("ov::frontend::pytorch::pass::AtenStackListConstructReplacer"); AtenStackListConstructReplacer(); }; diff --git a/src/frontends/pytorch/src/transforms/einsum_list_construct.hpp b/src/frontends/pytorch/src/transforms/einsum_list_construct.hpp index 413c9f526214ae..ba792cbbb820af 100644 --- a/src/frontends/pytorch/src/transforms/einsum_list_construct.hpp +++ b/src/frontends/pytorch/src/transforms/einsum_list_construct.hpp @@ -14,11 +14,11 @@ namespace pass { class AtenEinsumListConstructReplacer : public ov::pass::MatcherPass { public: - OPENVINO_RTTI("ov::frontend::pytorch::pass::AtenEinsumListConstructReplacer"); + OPENVINO_MATCHER_PASS_RTTI("ov::frontend::pytorch::pass::AtenEinsumListConstructReplacer"); AtenEinsumListConstructReplacer(); }; } // namespace pass } // namespace pytorch } // namespace frontend -} // namespace ov \ No newline at end of file +} // namespace ov diff --git a/src/frontends/pytorch/src/transforms/index_loop_getitem_replacer.hpp b/src/frontends/pytorch/src/transforms/index_loop_getitem_replacer.hpp index 89627723c3d515..dac4bdafa09d27 100644 --- a/src/frontends/pytorch/src/transforms/index_loop_getitem_replacer.hpp +++ b/src/frontends/pytorch/src/transforms/index_loop_getitem_replacer.hpp @@ -18,7 +18,7 @@ namespace pass { */ class IndexLoopGetitemReplacer : public ov::pass::MatcherPass { public: - OPENVINO_RTTI("ov::frontend::pytorch::pass::IndexLoopGetitemReplacer"); + OPENVINO_MATCHER_PASS_RTTI("ov::frontend::pytorch::pass::IndexLoopGetitemReplacer"); IndexLoopGetitemReplacer(); }; diff --git a/src/frontends/pytorch/src/transforms/irfftn_complex_replacer.hpp b/src/frontends/pytorch/src/transforms/irfftn_complex_replacer.hpp index 3aa6991aed5d4d..d3a5738a82ddbf 100644 --- a/src/frontends/pytorch/src/transforms/irfftn_complex_replacer.hpp +++ b/src/frontends/pytorch/src/transforms/irfftn_complex_replacer.hpp @@ -14,7 +14,7 @@ namespace pass { class IRFFTNComplexReplacer : public ov::pass::MatcherPass { public: - OPENVINO_RTTI("ov::frontend::pytorch::pass::IRFFTNComplexReplacer"); + OPENVINO_MATCHER_PASS_RTTI("ov::frontend::pytorch::pass::IRFFTNComplexReplacer"); IRFFTNComplexReplacer(); }; diff --git a/src/frontends/pytorch/src/transforms/listconstruct_replacer.hpp b/src/frontends/pytorch/src/transforms/listconstruct_replacer.hpp index 4b265d58d24541..49dac1f83d112a 100644 --- a/src/frontends/pytorch/src/transforms/listconstruct_replacer.hpp +++ b/src/frontends/pytorch/src/transforms/listconstruct_replacer.hpp @@ -14,7 +14,7 @@ namespace pass { class ListConstructReplacer : public ov::pass::MatcherPass { public: - OPENVINO_RTTI("ov::frontend::pytorch::pass::ListConstructReplacer"); + OPENVINO_MATCHER_PASS_RTTI("ov::frontend::pytorch::pass::ListConstructReplacer"); ListConstructReplacer(); }; diff --git a/src/frontends/pytorch/src/transforms/min_max_prim_list_construct_replacer.hpp b/src/frontends/pytorch/src/transforms/min_max_prim_list_construct_replacer.hpp index 
371b3be7ff7cd0..f8dc9a2037a130 100644 --- a/src/frontends/pytorch/src/transforms/min_max_prim_list_construct_replacer.hpp +++ b/src/frontends/pytorch/src/transforms/min_max_prim_list_construct_replacer.hpp @@ -14,7 +14,7 @@ namespace pass { class MinMaxPrimListConstructReplacer : public ov::pass::MatcherPass { public: - OPENVINO_RTTI("ov::frontend::pytorch::pass::MinMaxPrimListConstructReplacer"); + OPENVINO_MATCHER_PASS_RTTI("ov::frontend::pytorch::pass::MinMaxPrimListConstructReplacer"); MinMaxPrimListConstructReplacer(); }; diff --git a/src/frontends/pytorch/src/transforms/prim_list_construct_pad.hpp b/src/frontends/pytorch/src/transforms/prim_list_construct_pad.hpp index 5e5322969f5285..bbd494f2b97b98 100644 --- a/src/frontends/pytorch/src/transforms/prim_list_construct_pad.hpp +++ b/src/frontends/pytorch/src/transforms/prim_list_construct_pad.hpp @@ -14,11 +14,11 @@ namespace pass { class PrimListConstructPadReplacer : public ov::pass::MatcherPass { public: - OPENVINO_RTTI("ov::frontend::pytorch::pass::PrimListConstructPadReplacer"); + OPENVINO_MATCHER_PASS_RTTI("ov::frontend::pytorch::pass::PrimListConstructPadReplacer"); PrimListConstructPadReplacer(); }; } // namespace pass } // namespace pytorch } // namespace frontend -} // namespace ov \ No newline at end of file +} // namespace ov diff --git a/src/frontends/pytorch/src/transforms/prim_list_unpack_replacer.hpp b/src/frontends/pytorch/src/transforms/prim_list_unpack_replacer.hpp index 81521716a23430..449adc8a78779d 100644 --- a/src/frontends/pytorch/src/transforms/prim_list_unpack_replacer.hpp +++ b/src/frontends/pytorch/src/transforms/prim_list_unpack_replacer.hpp @@ -14,11 +14,11 @@ namespace pass { class PrimListUnpackReplacer : public ov::pass::MatcherPass { public: - OPENVINO_RTTI("ov::frontend::pytorch::pass::PrimListUnpackReplacer"); + OPENVINO_MATCHER_PASS_RTTI("ov::frontend::pytorch::pass::PrimListUnpackReplacer"); PrimListUnpackReplacer(); }; } // namespace pass } // namespace pytorch } // namespace frontend -} // namespace ov \ No newline at end of file +} // namespace ov diff --git a/src/frontends/pytorch/src/transforms/quantized_node_remover.hpp b/src/frontends/pytorch/src/transforms/quantized_node_remover.hpp index e4ca3d5806d494..2ae26866f4fabb 100644 --- a/src/frontends/pytorch/src/transforms/quantized_node_remover.hpp +++ b/src/frontends/pytorch/src/transforms/quantized_node_remover.hpp @@ -20,7 +20,7 @@ namespace pass { */ class QuantizedNodeRemover : public ov::pass::MatcherPass { public: - OPENVINO_RTTI("ov::frontend::pytorch::pass::QuantizedNodeRemover"); + OPENVINO_MATCHER_PASS_RTTI("ov::frontend::pytorch::pass::QuantizedNodeRemover"); QuantizedNodeRemover(); }; diff --git a/src/frontends/pytorch/src/transforms/remove_packing_ops.hpp b/src/frontends/pytorch/src/transforms/remove_packing_ops.hpp index 773100dfc35af9..a236a3fd081568 100644 --- a/src/frontends/pytorch/src/transforms/remove_packing_ops.hpp +++ b/src/frontends/pytorch/src/transforms/remove_packing_ops.hpp @@ -17,7 +17,7 @@ namespace pass { */ class MovePackThroughLstm : public ov::pass::MatcherPass { public: - OPENVINO_RTTI("ov::frontend::pytorch::pass::MovePackThroughLstm"); + OPENVINO_MATCHER_PASS_RTTI("ov::frontend::pytorch::pass::MovePackThroughLstm"); MovePackThroughLstm(); }; @@ -26,7 +26,7 @@ class MovePackThroughLstm : public ov::pass::MatcherPass { */ class RemovePackingOps : public ov::pass::MatcherPass { public: - OPENVINO_RTTI("ov::frontend::pytorch::pass::RemovePackingOps"); + 
OPENVINO_MATCHER_PASS_RTTI("ov::frontend::pytorch::pass::RemovePackingOps"); RemovePackingOps(); }; diff --git a/src/frontends/pytorch/src/transforms/reverseprop_resolver.hpp b/src/frontends/pytorch/src/transforms/reverseprop_resolver.hpp index a26249e4841d4b..8bc3109e479cf5 100644 --- a/src/frontends/pytorch/src/transforms/reverseprop_resolver.hpp +++ b/src/frontends/pytorch/src/transforms/reverseprop_resolver.hpp @@ -17,7 +17,7 @@ namespace pass { */ class ReversepropResolver : public ov::pass::MatcherPass { public: - OPENVINO_RTTI("ov::frontend::pytorch::pass::ReversepropResolver"); + OPENVINO_MATCHER_PASS_RTTI("ov::frontend::pytorch::pass::ReversepropResolver"); ReversepropResolver(); }; diff --git a/src/frontends/pytorch/src/transforms/rfftn_complex_replacer.hpp b/src/frontends/pytorch/src/transforms/rfftn_complex_replacer.hpp index 04ec53ab0f1561..5115e38bdf55b1 100644 --- a/src/frontends/pytorch/src/transforms/rfftn_complex_replacer.hpp +++ b/src/frontends/pytorch/src/transforms/rfftn_complex_replacer.hpp @@ -14,7 +14,7 @@ namespace pass { class RFFTNComplexReplacer : public ov::pass::MatcherPass { public: - OPENVINO_RTTI("ov::frontend::pytorch::pass::RFFTNComplexReplacer"); + OPENVINO_MATCHER_PASS_RTTI("ov::frontend::pytorch::pass::RFFTNComplexReplacer"); RFFTNComplexReplacer(); }; diff --git a/src/frontends/pytorch/src/transforms/string_equality_replacer.hpp b/src/frontends/pytorch/src/transforms/string_equality_replacer.hpp index 20dc3cc98b7f32..dfc826dfd600c0 100644 --- a/src/frontends/pytorch/src/transforms/string_equality_replacer.hpp +++ b/src/frontends/pytorch/src/transforms/string_equality_replacer.hpp @@ -14,7 +14,7 @@ namespace pass { class StringEqualityReplacer : public ov::pass::MatcherPass { public: - OPENVINO_RTTI("ov::frontend::pytorch::pass::StringEqualityReplacer"); + OPENVINO_MATCHER_PASS_RTTI("ov::frontend::pytorch::pass::StringEqualityReplacer"); StringEqualityReplacer(); }; diff --git a/src/frontends/pytorch/src/transforms/torchfx_gptq_pattern_replacer.hpp b/src/frontends/pytorch/src/transforms/torchfx_gptq_pattern_replacer.hpp index 046a774e56ef8e..a77616b53813be 100644 --- a/src/frontends/pytorch/src/transforms/torchfx_gptq_pattern_replacer.hpp +++ b/src/frontends/pytorch/src/transforms/torchfx_gptq_pattern_replacer.hpp @@ -15,7 +15,7 @@ namespace pass { // This transformation replaces the GPTQ pattern with a Constant node class GPTQDecompressionReplacer : public ov::pass::MatcherPass { public: - OPENVINO_RTTI("ov::frontend::pytorch::pass::GPTQDecompressionReplacer"); + OPENVINO_MATCHER_PASS_RTTI("ov::frontend::pytorch::pass::GPTQDecompressionReplacer"); GPTQDecompressionReplacer(); }; @@ -24,7 +24,7 @@ class GPTQDecompressionReplacer : public ov::pass::MatcherPass { // additional optimizations class GPTQMultPatternReplacer : public ov::pass::MatcherPass { public: - OPENVINO_RTTI("ov::frontend::pytorch::pass::GPTQMultPatternReplacer"); + OPENVINO_MATCHER_PASS_RTTI("ov::frontend::pytorch::pass::GPTQMultPatternReplacer"); GPTQMultPatternReplacer(); }; diff --git a/src/frontends/pytorch/src/transforms/tuple_unpack_replacer.hpp b/src/frontends/pytorch/src/transforms/tuple_unpack_replacer.hpp index 8d737c3d15947d..625b986f3b64b7 100644 --- a/src/frontends/pytorch/src/transforms/tuple_unpack_replacer.hpp +++ b/src/frontends/pytorch/src/transforms/tuple_unpack_replacer.hpp @@ -14,7 +14,7 @@ namespace pass { class PrimTupleUnpackReplacer : public ov::pass::MatcherPass { public: - OPENVINO_RTTI("ov::frontend::pytorch::pass::PrimTupleUnpackReplacer"); + 
OPENVINO_MATCHER_PASS_RTTI("ov::frontend::pytorch::pass::PrimTupleUnpackReplacer"); PrimTupleUnpackReplacer(); }; @@ -27,4 +27,4 @@ class TupleUnpackInBodyReplacer : public ov::pass::ModelPass { } // namespace pass } // namespace pytorch } // namespace frontend -} // namespace ov \ No newline at end of file +} // namespace ov diff --git a/src/frontends/pytorch/src/transforms/u4_block_repack.hpp b/src/frontends/pytorch/src/transforms/u4_block_repack.hpp index 99742ff148813a..891fd93554f558 100644 --- a/src/frontends/pytorch/src/transforms/u4_block_repack.hpp +++ b/src/frontends/pytorch/src/transforms/u4_block_repack.hpp @@ -14,13 +14,13 @@ namespace pass { class U4BlockRepack : public ov::pass::MatcherPass { public: - OPENVINO_RTTI("ov::frontend::pytorch::pass::U4BlockRepack"); + OPENVINO_MATCHER_PASS_RTTI("ov::frontend::pytorch::pass::U4BlockRepack"); U4BlockRepack(bool is_symmetrical = false); }; class U4ConvertReshape : public ov::pass::MatcherPass { public: - OPENVINO_RTTI("ov::frontend::pytorch::pass::U4ConvertReshape"); + OPENVINO_MATCHER_PASS_RTTI("ov::frontend::pytorch::pass::U4ConvertReshape"); U4ConvertReshape(); }; diff --git a/src/frontends/pytorch/src/utils.cpp b/src/frontends/pytorch/src/utils.cpp index 752b9accb71d01..5cc7ec21f30911 100644 --- a/src/frontends/pytorch/src/utils.cpp +++ b/src/frontends/pytorch/src/utils.cpp @@ -42,7 +42,11 @@ using namespace ov::op; void num_inputs_check(const NodeContext& context, size_t min_inputs, size_t max_inputs) { auto num_inputs = context.get_input_size(); - FRONT_END_OP_CONVERSION_CHECK(num_inputs >= min_inputs, "Got less inputs than expected"); + FRONT_END_OP_CONVERSION_CHECK(num_inputs >= min_inputs, + "Got less inputs ", + num_inputs, + " than expected ", + min_inputs); for (auto i = max_inputs; i < num_inputs; i++) { FRONT_END_OP_CONVERSION_CHECK(context.input_is_none(i), "Got more inputs than expected."); } diff --git a/src/frontends/tensorflow/src/checkpoint_v1_reader.cpp b/src/frontends/tensorflow/src/checkpoint_v1_reader.cpp index d506759fd33716..dab130cb381731 100644 --- a/src/frontends/tensorflow/src/checkpoint_v1_reader.cpp +++ b/src/frontends/tensorflow/src/checkpoint_v1_reader.cpp @@ -254,7 +254,7 @@ void CheckpointV1Reader::read_variable(const std::string& variable_name, ov::Any // This is only present at the first item of each checkpoint file and serves // as a table of contents, listing all the tensor slices saved in this file. 
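The transformation headers above all switch from OPENVINO_RTTI to OPENVINO_MATCHER_PASS_RTTI. For context, a minimal sketch of how such a pass is declared and registers its matcher; the pass name, pattern, and callback are hypothetical and not taken from this PR.

// Hypothetical matcher pass showing where the new macro sits; the callback
// only matches and does not rewrite anything.
#include <memory>
#include "openvino/op/relu.hpp"
#include "openvino/pass/matcher_pass.hpp"
#include "openvino/pass/pattern/matcher.hpp"
#include "openvino/pass/pattern/op/wrap_type.hpp"

class ExampleReluPass : public ov::pass::MatcherPass {
public:
    OPENVINO_MATCHER_PASS_RTTI("ExampleReluPass");
    ExampleReluPass() {
        auto relu = ov::pass::pattern::wrap_type<ov::op::v0::Relu>();
        ov::matcher_pass_callback callback = [](ov::pass::pattern::Matcher& m) {
            return false;  // no graph rewrite in this sketch
        };
        register_matcher(std::make_shared<ov::pass::pattern::Matcher>(relu, "ExampleReluPass"), callback);
    }
};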
- ::tensorflow::SavedTensorSlices sts; + ::tensorflow::SavedTensorSlices sts{}; FRONT_END_GENERAL_CHECK(sts.ParseFromArray(raw_data.data(), static_cast(raw_data.size())), "[TensorFlow Frontend] incorrect input checkpoint file or internal error: cannot parse " "SavedTensorSlices entry"); diff --git a/src/frontends/tensorflow/src/op/var_handle.cpp b/src/frontends/tensorflow/src/op/var_handle.cpp index f0077ae206bf6d..53fdf21d6086bf 100644 --- a/src/frontends/tensorflow/src/op/var_handle.cpp +++ b/src/frontends/tensorflow/src/op/var_handle.cpp @@ -98,7 +98,7 @@ OutputVector translate_varhandle_op(const NodeContext& node) { TENSORFLOW_OP_VALIDATION(node, result, "[TensorFlow Frontend] Internal error: Cannot find requested variable."); - ::tensorflow::BundleEntryProto entry; + ::tensorflow::BundleEntryProto entry{}; TENSORFLOW_OP_VALIDATION(node, entry.ParseFromArray(entry_data, static_cast(entry_size)), "[TensorFlow Frontend] Internal error: Cannot get read bundle entry."); diff --git a/src/frontends/tensorflow/src/op/xla_conv_v2.cpp b/src/frontends/tensorflow/src/op/xla_conv_v2.cpp index a01780d58cfeae..795f4deb3d93ef 100644 --- a/src/frontends/tensorflow/src/op/xla_conv_v2.cpp +++ b/src/frontends/tensorflow/src/op/xla_conv_v2.cpp @@ -111,7 +111,7 @@ OutputVector translate_xla_conv_v2_op(const NodeContext& node) { is_all_one, "[TensorFlow Frontend] internal error: convolutional kernel with holes is not supported"); - ConvolutionDimensionNumbers dimension_numbers; + ConvolutionDimensionNumbers dimension_numbers{}; TENSORFLOW_OP_VALIDATION( node, dimension_numbers.ParseFromArray(dimension_numbers_message.data(), diff --git a/src/frontends/tensorflow/src/transformations/uninitialized_variable_resolve.hpp b/src/frontends/tensorflow/src/transformations/uninitialized_variable_resolve.hpp index 5e0f8bd6dfdec0..30aadee2776b9e 100644 --- a/src/frontends/tensorflow/src/transformations/uninitialized_variable_resolve.hpp +++ b/src/frontends/tensorflow/src/transformations/uninitialized_variable_resolve.hpp @@ -19,7 +19,7 @@ namespace pass { // it borrows value of Variable that was used for some state (or node) in a graph class UninitializedVariableResolver : public ov::pass::MatcherPass { public: - OPENVINO_RTTI("ov::frontend::tensorflow::pass::UninitializedVariableResolver"); + OPENVINO_MATCHER_PASS_RTTI("ov::frontend::tensorflow::pass::UninitializedVariableResolver"); UninitializedVariableResolver(); }; diff --git a/src/frontends/tensorflow/src/variables_index.cpp b/src/frontends/tensorflow/src/variables_index.cpp index 778f8b2f94bb7c..f4e330518e20c2 100644 --- a/src/frontends/tensorflow/src/variables_index.cpp +++ b/src/frontends/tensorflow/src/variables_index.cpp @@ -128,7 +128,7 @@ void VariablesIndex::read_bundle_header() { auto item = m_variables_index.find(""); FRONT_END_GENERAL_CHECK(item != m_variables_index.end(), "Bundle Header isn't found in index"); - ::tensorflow::BundleHeaderProto bundleHeader; + ::tensorflow::BundleHeaderProto bundleHeader{}; FRONT_END_GENERAL_CHECK(bundleHeader.ParseFromArray(item->second.data(), static_cast(item->second.size())), "Bundle Header: Cannot parse Bundle Header"); FRONT_END_GENERAL_CHECK(bundleHeader.version().producer() == 1, "Bundle Header: Unsupported producer version"); @@ -147,7 +147,7 @@ void VariablesIndex::read_checkpointable_object_graph() { return; } - ::tensorflow::BundleEntryProto entry; + ::tensorflow::BundleEntryProto entry{}; FRONT_END_GENERAL_CHECK(entry.ParseFromArray(item->second.data(), static_cast(item->second.size())), "CMO: Cannot parse 
Bundle Entry"); diff --git a/src/frontends/tensorflow_common/include/helper_transforms/embedding_segments_feature_fusing.hpp b/src/frontends/tensorflow_common/include/helper_transforms/embedding_segments_feature_fusing.hpp index ec2ce348cb5a1b..696242f321e733 100644 --- a/src/frontends/tensorflow_common/include/helper_transforms/embedding_segments_feature_fusing.hpp +++ b/src/frontends/tensorflow_common/include/helper_transforms/embedding_segments_feature_fusing.hpp @@ -20,7 +20,7 @@ namespace pass { // Such sub-graph is met in the Wide and Deep model in case of the SINGLE categorical feature. class EmbeddingSegmentSingleFeatureFusion : public ov::pass::MatcherPass { public: - OPENVINO_RTTI("ov::frontend::tensorflow::pass::EmbeddingSegmentSingleFeatureFusion"); + OPENVINO_MATCHER_PASS_RTTI("ov::frontend::tensorflow::pass::EmbeddingSegmentSingleFeatureFusion"); EmbeddingSegmentSingleFeatureFusion(); }; diff --git a/src/frontends/tensorflow_common/include/helper_transforms/tensor_array_v3_replacer.hpp b/src/frontends/tensorflow_common/include/helper_transforms/tensor_array_v3_replacer.hpp index ad442e3e5dbe29..e0f7c20c11c9b9 100644 --- a/src/frontends/tensorflow_common/include/helper_transforms/tensor_array_v3_replacer.hpp +++ b/src/frontends/tensorflow_common/include/helper_transforms/tensor_array_v3_replacer.hpp @@ -19,7 +19,7 @@ namespace pass { // that simulates initial state of tensor array container class TensorArrayV3Replacer : public ov::pass::MatcherPass { public: - OPENVINO_RTTI("ov::frontend::tensorflow::pass::TensorArrayV3Replacer"); + OPENVINO_MATCHER_PASS_RTTI("ov::frontend::tensorflow::pass::TensorArrayV3Replacer"); TensorArrayV3Replacer(); }; diff --git a/src/frontends/tensorflow_common/include/helper_transforms/tensor_list_ops_resolver.hpp b/src/frontends/tensorflow_common/include/helper_transforms/tensor_list_ops_resolver.hpp index 764b7dfc472d2a..cb587d7f665c7b 100644 --- a/src/frontends/tensorflow_common/include/helper_transforms/tensor_list_ops_resolver.hpp +++ b/src/frontends/tensorflow_common/include/helper_transforms/tensor_list_ops_resolver.hpp @@ -15,14 +15,14 @@ namespace pass { // Replace internal operation TensorListReserve with a sub-graph producing initial container class TensorListReplacer : public ov::pass::MatcherPass { public: - OPENVINO_RTTI("ov::frontend::tensorflow::pass::TensorListReplacer"); + OPENVINO_MATCHER_PASS_RTTI("ov::frontend::tensorflow::pass::TensorListReplacer"); TensorListReplacer(); }; // Replace internal operation TensorListSetItem with a sub-graph that inserts a new tensor into container class TensorListSetItemReplacer : public ov::pass::MatcherPass { public: - OPENVINO_RTTI("ov::frontend::tensorflow::pass::TensorListSetItemReplacer"); + OPENVINO_MATCHER_PASS_RTTI("ov::frontend::tensorflow::pass::TensorListSetItemReplacer"); TensorListSetItemReplacer(); }; @@ -30,14 +30,14 @@ class TensorListSetItemReplacer : public ov::pass::MatcherPass { // that inserts a new tensor into the tail of the container class TensorListPushBackReplacer : public ov::pass::MatcherPass { public: - OPENVINO_RTTI("ov::frontend::tensorflow::pass::TensorListPushBackReplacer"); + OPENVINO_MATCHER_PASS_RTTI("ov::frontend::tensorflow::pass::TensorListPushBackReplacer"); TensorListPushBackReplacer(); }; // Replace internal operation TensorListGetItem with a sub-graph that gets a tensor from container by index class TensorListGetItemReplacer : public ov::pass::MatcherPass { public: - OPENVINO_RTTI("ov::frontend::tensorflow::pass::TensorListGetItemReplacer"); + 
OPENVINO_MATCHER_PASS_RTTI("ov::frontend::tensorflow::pass::TensorListGetItemReplacer"); TensorListGetItemReplacer(); }; @@ -45,7 +45,7 @@ class TensorListGetItemReplacer : public ov::pass::MatcherPass { // Replace TensorListSetItem and TensorListGetItem with ConcatOutput and SlicedInput class TensorListInLoopOptimization : public ov::pass::MatcherPass { public: - OPENVINO_RTTI("ov::frontend::tensorflow::pass::TensorListInLoopOptimization"); + OPENVINO_MATCHER_PASS_RTTI("ov::frontend::tensorflow::pass::TensorListInLoopOptimization"); TensorListInLoopOptimization(); }; diff --git a/src/frontends/tensorflow_lite/src/tflite_transformations/rfft2d_complex_abs.h b/src/frontends/tensorflow_lite/src/tflite_transformations/rfft2d_complex_abs.h index f8599e2c7791a3..11e79cfe09a58c 100644 --- a/src/frontends/tensorflow_lite/src/tflite_transformations/rfft2d_complex_abs.h +++ b/src/frontends/tensorflow_lite/src/tflite_transformations/rfft2d_complex_abs.h @@ -24,7 +24,7 @@ namespace pass { // \-(imag)-> Unsqueeze -> Reshape -> Square / class Rfft2dSimplifier : public ov::pass::MatcherPass { public: - OPENVINO_RTTI("ov::frontend::tensorflow_lite::pass::Rfft2dSimplifier"); + OPENVINO_MATCHER_PASS_RTTI("ov::frontend::tensorflow_lite::pass::Rfft2dSimplifier"); Rfft2dSimplifier(); }; diff --git a/src/frontends/tensorflow_lite/src/tflite_transformations/tflite_quantize_resolver.hpp b/src/frontends/tensorflow_lite/src/tflite_transformations/tflite_quantize_resolver.hpp index 45fd3e70722d54..584e8c55b6a9ea 100644 --- a/src/frontends/tensorflow_lite/src/tflite_transformations/tflite_quantize_resolver.hpp +++ b/src/frontends/tensorflow_lite/src/tflite_transformations/tflite_quantize_resolver.hpp @@ -18,14 +18,14 @@ namespace pass { // Fuses Convert into TFLQuantize operation class TFLQuantizeConvert : public ov::pass::MatcherPass { public: - OPENVINO_RTTI("ov::frontend::tensorflow_lite::pass::TFLQuantizeConvert"); + OPENVINO_MATCHER_PASS_RTTI("ov::frontend::tensorflow_lite::pass::TFLQuantizeConvert"); TFLQuantizeConvert(); }; // Replaces TFLQuantize operation with FQ or sub-mul pattern if necessary class TFLQuantizeReplacer : public ov::pass::MatcherPass { public: - OPENVINO_RTTI("ov::frontend::tensorflow_lite::pass::TFLQuantizeReplacer"); + OPENVINO_MATCHER_PASS_RTTI("ov::frontend::tensorflow_lite::pass::TFLQuantizeReplacer"); TFLQuantizeReplacer(); }; diff --git a/src/frontends/tests/frontend/shared/src/conversion.cpp b/src/frontends/tests/frontend/shared/src/conversion.cpp index 34e4f2fd62719a..058d5534965436 100644 --- a/src/frontends/tests/frontend/shared/src/conversion.cpp +++ b/src/frontends/tests/frontend/shared/src/conversion.cpp @@ -95,3 +95,17 @@ TEST_P(FrontEndConversionExtensionTest, TestConversionExtensionViaSO) { OV_ASSERT_NO_THROW(model = frontend->convert(input_model)); ASSERT_NE(model, nullptr); } + +#ifdef OPENVINO_CPP_VER_17 +TEST_P(FrontEndConversionExtensionTest, TestConversionExtensionViaSOByPath) { + auto frontend = m_param.m_frontend; + const std::filesystem::path lib_path = get_lib_path("test_builtin_extensions"); + frontend->add_extension(lib_path); + std::shared_ptr input_model; + OV_ASSERT_NO_THROW(input_model = frontend->load(m_param.m_modelName)); + ASSERT_NE(input_model, nullptr); + std::shared_ptr model; + OV_ASSERT_NO_THROW(model = frontend->convert(input_model)); + ASSERT_NE(model, nullptr); +} +#endif diff --git a/src/inference/dev_api/openvino/runtime/icompiled_model.hpp b/src/inference/dev_api/openvino/runtime/icompiled_model.hpp index 01f7b556da909f..3a3d5d9910305f 100644 
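The new C++17 test above exercises FrontEnd::add_extension with a std::filesystem::path. A usage sketch of the same flow outside the test harness; the framework name and all file paths are placeholders.

// Sketch only: load a frontend, register an extension library by
// std::filesystem::path, and convert a model.
#include <filesystem>
#include "openvino/frontend/manager.hpp"

int main() {
    ov::frontend::FrontEndManager manager;
    auto frontend = manager.load_by_framework("onnx");                       // illustrative framework
    const std::filesystem::path lib_path = "libtest_builtin_extensions.so";  // placeholder library
    frontend->add_extension(lib_path);
    auto input_model = frontend->load("model.onnx");                         // placeholder model
    auto model = frontend->convert(input_model);
    return model ? 0 : 1;
}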
--- a/src/inference/dev_api/openvino/runtime/icompiled_model.hpp +++ b/src/inference/dev_api/openvino/runtime/icompiled_model.hpp @@ -136,11 +136,11 @@ class OPENVINO_RUNTIME_API ICompiledModel : public std::enable_shared_from_this< /** * @brief Release intermediate memory - * + * */ virtual void release_memory(); - virtual ~ICompiledModel() = default; + virtual ~ICompiledModel(); private: std::shared_ptr m_plugin; diff --git a/src/inference/dev_api/openvino/runtime/icore.hpp b/src/inference/dev_api/openvino/runtime/icore.hpp index 659b9c5c0f5788..cc2c94e724ab41 100644 --- a/src/inference/dev_api/openvino/runtime/icore.hpp +++ b/src/inference/dev_api/openvino/runtime/icore.hpp @@ -60,9 +60,12 @@ class OPENVINO_RUNTIME_API ICore { * @param model_path path to IR file * @param bin_path path to bin file, if path is empty, will try to read bin file with the same name as xml and * if bin file with the same name was not found, will load IR without weights. + * @param properties Optional map of pairs: (property name, property value) relevant only for this read operation. * @return shared pointer to ov::Model */ - virtual std::shared_ptr read_model(const std::string& model_path, const std::string& bin_path) const = 0; + virtual std::shared_ptr read_model(const std::string& model_path, + const std::string& bin_path, + const AnyMap& properties) const = 0; virtual ov::AnyMap create_compile_config(const std::string& device_name, const ov::AnyMap& origConfig) const = 0; diff --git a/src/inference/dev_api/openvino/runtime/internal_properties.hpp b/src/inference/dev_api/openvino/runtime/internal_properties.hpp index 60d6b66cfda897..bec304104581ac 100644 --- a/src/inference/dev_api/openvino/runtime/internal_properties.hpp +++ b/src/inference/dev_api/openvino/runtime/internal_properties.hpp @@ -9,6 +9,7 @@ #pragma once +#include "openvino/runtime/aligned_buffer.hpp" #include "openvino/runtime/properties.hpp" #include "openvino/runtime/threading/istreams_executor.hpp" @@ -36,6 +37,12 @@ static constexpr Property, PropertyMutability::RO> cac */ static constexpr Property caching_with_mmap{"CACHING_WITH_MMAP"}; +/** + * @brief Property to get a ov::AlignedBuffer with cached model + * @ingroup ov_dev_api_plugin_api + */ +static constexpr Property, PropertyMutability::RW> cached_model_buffer{"CACHED_MODEL_BUFFER"}; + /** * @brief Allow to create exclusive_async_requests with one executor * @ingroup ov_dev_api_plugin_api diff --git a/src/inference/dev_api/openvino/runtime/isync_infer_request.hpp b/src/inference/dev_api/openvino/runtime/isync_infer_request.hpp index bbe81ee1c9602d..b80bf32958e4ac 100644 --- a/src/inference/dev_api/openvino/runtime/isync_infer_request.hpp +++ b/src/inference/dev_api/openvino/runtime/isync_infer_request.hpp @@ -15,6 +15,7 @@ #include #include "openvino/core/descriptor/tensor.hpp" +#include "openvino/core/descriptor_tensor.hpp" #include "openvino/runtime/common.hpp" #include "openvino/runtime/iinfer_request.hpp" #include "openvino/runtime/profiling_info.hpp" @@ -162,7 +163,11 @@ class OPENVINO_RUNTIME_API ISyncInferRequest : public IInferRequest { private: std::shared_ptr m_compiled_model; // Mutable to return reference to ov::Tensor - mutable std::unordered_map, ov::SoPtr> m_tensors; + mutable std::unordered_map, + ov::SoPtr, + descriptor::TensorExtension::Hasher, + descriptor::TensorExtension::Equal> + m_tensors; // Cache ports mutable std::unordered_map m_cached_ports; mutable std::mutex m_cache_mutex; diff --git a/src/inference/dev_api/openvino/runtime/system_conf.hpp 
b/src/inference/dev_api/openvino/runtime/system_conf.hpp index 59d56dfdd49d73..bebc2014ab8028 100644 --- a/src/inference/dev_api/openvino/runtime/system_conf.hpp +++ b/src/inference/dev_api/openvino/runtime/system_conf.hpp @@ -83,6 +83,13 @@ OPENVINO_RUNTIME_API bool with_cpu_x86_sse42(); */ OPENVINO_RUNTIME_API bool with_cpu_neon_fp16(); +/** + * @brief Checks whether CPU supports ARM SVE capability + * @ingroup ov_dev_api_system_conf + * @return `True` if ARM SVE instructions are available, `false` otherwise + */ +OPENVINO_RUNTIME_API bool with_cpu_sve(); + /** * @brief Checks whether CPU supports AVX capability * @ingroup ov_dev_api_system_conf diff --git a/src/inference/include/openvino/runtime/core.hpp b/src/inference/include/openvino/runtime/core.hpp index f0ba27c1cf5daa..2ca6dc83bcf726 100644 --- a/src/inference/include/openvino/runtime/core.hpp +++ b/src/inference/include/openvino/runtime/core.hpp @@ -25,6 +25,10 @@ #include "openvino/runtime/remote_context.hpp" #include "openvino/runtime/tensor.hpp" +#ifdef OPENVINO_CPP_VER_17 +# include +#endif + namespace ov { /** @@ -75,11 +79,14 @@ class OPENVINO_RUNTIME_API Core { * For the following file formats the `bin_path` parameter is not used: * * ONNX format (*.onnx) * * PDPD (*.pdmodel) - * * TF (*.pb) + * * TF (*.pb, *.meta, SavedModel directory) * * TFLite (*.tflite) + * @param properties Optional map of pairs: (property name, property value) relevant only for this read operation. * @return A model. */ - std::shared_ptr read_model(const std::wstring& model_path, const std::wstring& bin_path = {}) const; + std::shared_ptr read_model(const std::wstring& model_path, + const std::wstring& bin_path = {}, + const ov::AnyMap& properties = {}) const; #endif /** @@ -92,11 +99,57 @@ class OPENVINO_RUNTIME_API Core { * For the following file formats the `bin_path` parameter is not used: * * ONNX format (*.onnx) * * PDPD (*.pdmodel) - * * TF (*.pb) + * * TF (*.pb, *.meta, SavedModel directory) * * TFLite (*.tflite) + * @param properties Optional map of pairs: (property name, property value) relevant only for this read operation. * @return A model. + * @{ + */ + std::shared_ptr read_model(const std::string& model_path, + const std::string& bin_path = {}, + const ov::AnyMap& properties = {}) const; + +#ifdef OPENVINO_CPP_VER_17 + template >* = nullptr> + auto read_model(const Path& model_path, const Path& bin_path = {}, const ov::AnyMap& properties = {}) const { + return read_model(model_path.string(), bin_path.string(), properties); + } +#endif + /// @} + + /** + * @brief Reads models from IR / ONNX / PDPD / TF / TFLite file formats. + * + * @param model_path Path to a model. + * @param bin_path Path to a data file. + * For IR format (*.bin): + * * if `bin_path` is empty, will try to read a bin file with the same name as xml and + * * if the bin file with the same name is not found, will load IR without weights. + * For the following file formats the `bin_path` parameter is not used: + * * ONNX format (*.onnx) + * * PDPD (*.pdmodel) + * * TF (*.pb, *.meta, SavedModel directory) + * * TFLite (*.tflite) + * @param properties Optional pack of pairs: (property name, property value) relevant only for this read operation. + * @return A model. + * @{ */ - std::shared_ptr read_model(const std::string& model_path, const std::string& bin_path = {}) const; + template + util::EnableIfAllStringAny read_model(const std::string& model_path, + const std::string& bin_path, + Properties&&... 
properties) const { + return read_model(model_path, bin_path, AnyMap{std::forward(properties)...}); + } + +#ifdef OPENVINO_CPP_VER_17 + template && (sizeof...(Properties) > 0)>* = nullptr> + auto read_model(const Path& model_path, const Path& bin_path, Properties&&... properties) const { + return read_model(model_path.string(), bin_path.string(), std::forward(properties)...); + } +#endif + /// @} /** * @brief Reads models from IR / ONNX / PDPD / TF / TFLite formats. @@ -197,6 +250,13 @@ class OPENVINO_RUNTIME_API Core { */ CompiledModel compile_model(const std::string& model_path, const AnyMap& properties = {}); +#ifdef OPENVINO_CPP_VER_17 + template >* = nullptr> + auto compile_model(const Path& model_path, const AnyMap& properties = {}) const { + return compile_model(model_path.string(), properties); + } +#endif + #ifdef OPENVINO_ENABLE_UNICODE_PATH_SUPPORT CompiledModel compile_model(const std::wstring& model_path, const AnyMap& properties = {}); #endif @@ -223,6 +283,13 @@ class OPENVINO_RUNTIME_API Core { return compile_model(model_path, AnyMap{std::forward(properties)...}); } +#ifdef OPENVINO_CPP_VER_17 + template >* = nullptr> + auto compile_model(const Path& model_path, Properties&&... properties) { + return compile_model(model_path.string(), std::forward(properties)...); + } +#endif + #ifdef OPENVINO_ENABLE_UNICODE_PATH_SUPPORT template util::EnableIfAllStringAny compile_model(const std::wstring& model_path, @@ -250,6 +317,13 @@ class OPENVINO_RUNTIME_API Core { const std::string& device_name, const AnyMap& properties = {}); +#ifdef OPENVINO_CPP_VER_17 + template >* = nullptr> + auto compile_model(const Path& model_path, const std::string& device_name, const AnyMap& properties = {}) { + return compile_model(model_path.string(), device_name, properties); + } +#endif + #ifdef OPENVINO_ENABLE_UNICODE_PATH_SUPPORT CompiledModel compile_model(const std::wstring& model_path, const std::string& device_name, @@ -279,6 +353,13 @@ class OPENVINO_RUNTIME_API Core { return compile_model(model_path, device_name, AnyMap{std::forward(properties)...}); } +#ifdef OPENVINO_CPP_VER_17 + template >* = nullptr> + auto compile_model(const Path& model_path, const std::string& device_name, Properties&&... properties) { + return compile_model(model_path.string(), device_name, std::forward(properties)...); + } +#endif + #ifdef OPENVINO_ENABLE_UNICODE_PATH_SUPPORT template util::EnableIfAllStringAny compile_model(const std::wstring& model_path, @@ -359,9 +440,18 @@ class OPENVINO_RUNTIME_API Core { /** * @brief Registers an extension to a Core object. * @param library_path Path to the library with ov::Extension. + * @{ */ void add_extension(const std::string& library_path); +#ifdef OPENVINO_CPP_VER_17 + template >* = nullptr> + void add_extension(const Path& model_path) { + add_extension(model_path.string()); + } +#endif + /// @} + #ifdef OPENVINO_ENABLE_UNICODE_PATH_SUPPORT /** * @brief Registers an extension to a Core object. 
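Taken together, the overloads above let callers pass std::filesystem::path objects and per-call read properties directly to Core. A usage sketch follows; the file names and the chosen properties are placeholders.

// Sketch of the C++17 path overloads and per-call read properties.
#include <filesystem>
#include "openvino/runtime/core.hpp"

int main() {
    ov::Core core;
    const std::filesystem::path xml = "model.xml";   // placeholder
    const std::filesystem::path bin = "model.bin";   // placeholder

    // Properties passed here apply only to this read operation.
    auto model = core.read_model(xml, bin, ov::enable_mmap(false));

    // Path overload of compile_model; properties are forwarded as usual.
    auto compiled = core.compile_model(xml, "CPU", ov::hint::performance_mode(ov::hint::PerformanceMode::LATENCY));
    return 0;
}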
diff --git a/src/inference/include/openvino/runtime/intel_npu/properties.hpp b/src/inference/include/openvino/runtime/intel_npu/properties.hpp index 49416f61b8b43b..8734757da1d53d 100644 --- a/src/inference/include/openvino/runtime/intel_npu/properties.hpp +++ b/src/inference/include/openvino/runtime/intel_npu/properties.hpp @@ -95,5 +95,12 @@ static constexpr ov::Property max_tiles{"NPU_MAX_TILES"}; */ static constexpr ov::Property bypass_umd_caching{"NPU_BYPASS_UMD_CACHING"}; +/** + * @brief [Only for NPU Plugin] + * Type: boolean, default is false + * This option allows to delay loading the weights until inference is created + */ +static constexpr ov::Property defer_weights_load{"NPU_DEFER_WEIGHTS_LOAD"}; + } // namespace intel_npu } // namespace ov diff --git a/src/inference/include/openvino/runtime/properties.hpp b/src/inference/include/openvino/runtime/properties.hpp index 5674c75dd546d7..8baea3ed408656 100644 --- a/src/inference/include/openvino/runtime/properties.hpp +++ b/src/inference/include/openvino/runtime/properties.hpp @@ -801,6 +801,8 @@ struct EncryptionCallbacks { * when loading from the cache. This property is set in core.compile_model only. * - First value of the struct is encryption function. * - Second value of the struct is decryption function. + * @note GPU Plugin: encrypts whole blob, not only model structure. Only used when ov::cache_mode property is set to + * "OPTIMIZE_SIZE". * @ingroup ov_runtime_cpp_prop_api */ static constexpr Property cache_encryption_callbacks{ diff --git a/src/inference/src/cache_manager.hpp b/src/inference/src/cache_manager.hpp index c441811c3cfd02..82813e5dd4788f 100644 --- a/src/inference/src/cache_manager.hpp +++ b/src/inference/src/cache_manager.hpp @@ -69,7 +69,7 @@ class ICacheManager { /** * @brief Function passing created input stream */ - using StreamReader = std::function; + using StreamReader = std::function)>; /** * @brief Callback when OpenVINO intends to read model from cache @@ -143,10 +143,10 @@ class FileStorageCacheManager final : public ICacheManager { std::make_shared>>(mmap->data(), mmap->size(), mmap); OwningSharedStreamBuffer buf(shared_buffer); std::istream stream(&buf); - reader(stream); + reader(stream, shared_buffer); } else { std::ifstream stream(blob_file_name, std::ios_base::binary); - reader(stream); + reader(stream, nullptr); } } } diff --git a/src/inference/src/cpp/compiled_model.cpp b/src/inference/src/cpp/compiled_model.cpp index d675cba4714887..c780bbee1e991d 100644 --- a/src/inference/src/cpp/compiled_model.cpp +++ b/src/inference/src/cpp/compiled_model.cpp @@ -8,10 +8,6 @@ #include "openvino/runtime/icompiled_model.hpp" #include "openvino/runtime/properties.hpp" -#if defined(OPENVINO_GNU_LIBC) && !defined(__ANDROID__) -# include -#endif - #define OV_COMPILED_MODEL_CALL_STATEMENT(...) \ if (_impl == nullptr) \ OPENVINO_THROW("CompiledModel was not initialized."); \ @@ -27,12 +23,6 @@ namespace ov { CompiledModel::~CompiledModel() { _impl = {}; -#if defined(OPENVINO_GNU_LIBC) && !defined(__ANDROID__) - // Linux memory margent doesn't return system memory immediate after release. - // It depends on memory chunk size and allocation history. - // Try return memory from a process to system now to reduce memory usage and not wait to the end of the process. 
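Regarding the cache_encryption_callbacks note above (GPU encrypts the whole blob and only with ov::CacheMode::OPTIMIZE_SIZE), a usage sketch; the cache directory is a placeholder and the codec is an identity stand-in, not real encryption.

// Sketch: enable model caching with encryption callbacks on GPU.
#include "openvino/runtime/core.hpp"

int main() {
    ov::Core core;
    core.set_property(ov::cache_dir("model_cache"));  // placeholder directory

    ov::EncryptionCallbacks callbacks;
    callbacks.encrypt = [](const std::string& s) { return s; };  // placeholder codec
    callbacks.decrypt = [](const std::string& s) { return s; };  // placeholder codec

    auto compiled = core.compile_model("model.xml",
                                       "GPU",
                                       ov::cache_encryption_callbacks(callbacks),
                                       ov::cache_mode(ov::CacheMode::OPTIMIZE_SIZE));
    return 0;
}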
- malloc_trim(0); -#endif } CompiledModel::CompiledModel(const std::shared_ptr& impl, const std::shared_ptr& so) diff --git a/src/inference/src/cpp/core.cpp b/src/inference/src/cpp/core.cpp index 2d6c204757bcf6..5d85fe81364a17 100644 --- a/src/inference/src/cpp/core.cpp +++ b/src/inference/src/cpp/core.cpp @@ -80,14 +80,19 @@ Core::Core(const std::string& xml_config_file) { std::map Core::get_versions(const std::string& device_name) const { OV_CORE_CALL_STATEMENT({ return _impl->get_versions(device_name); })} #ifdef OPENVINO_ENABLE_UNICODE_PATH_SUPPORT -std::shared_ptr Core::read_model(const std::wstring& model_path, const std::wstring& bin_path) const { - OV_CORE_CALL_STATEMENT( - return _impl->read_model(ov::util::wstring_to_string(model_path), ov::util::wstring_to_string(bin_path));); +std::shared_ptr Core::read_model(const std::wstring& model_path, + const std::wstring& bin_path, + const ov::AnyMap& properties) const { + OV_CORE_CALL_STATEMENT(return _impl->read_model(ov::util::wstring_to_string(model_path), + ov::util::wstring_to_string(bin_path), + properties);); } #endif -std::shared_ptr Core::read_model(const std::string& model_path, const std::string& bin_path) const { - OV_CORE_CALL_STATEMENT(return _impl->read_model(model_path, bin_path);); +std::shared_ptr Core::read_model(const std::string& model_path, + const std::string& bin_path, + const AnyMap& properties) const { + OV_CORE_CALL_STATEMENT(return _impl->read_model(model_path, bin_path, properties);); } std::shared_ptr Core::read_model(const std::string& model, const ov::Tensor& weights) const { diff --git a/src/inference/src/dev/core_impl.cpp b/src/inference/src/dev/core_impl.cpp index 244d27b5eebb67..e0e2fb109dc642 100644 --- a/src/inference/src/dev/core_impl.cpp +++ b/src/inference/src/dev/core_impl.cpp @@ -38,6 +38,18 @@ ov::ICore::~ICore() = default; +namespace ov { +namespace util { +template +constexpr std::array< + typename std::conditional::value, typename std::common_type::type, T>::type, + sizeof...(Args)> +make_array(Args&&... 
args) { + return {std::forward(args)...}; +} +} // namespace util +} // namespace ov + namespace { #ifdef PROXY_PLUGIN_ENABLED @@ -205,6 +217,12 @@ void clean_batch_properties(const std::string& deviceName, ov::AnyMap& config, c } } } + +static const auto core_properties_names = + ov::util::make_array(ov::cache_dir.name(), ov::enable_mmap.name(), ov::force_tbb_terminate.name()); + +static const auto auto_batch_properties_names = + ov::util::make_array(ov::auto_batch_timeout.name(), ov::hint::allow_auto_batching.name()); } // namespace bool ov::is_config_applicable(const std::string& user_device_name, const std::string& subprop_device_name) { @@ -239,22 +257,21 @@ bool ov::is_config_applicable(const std::string& user_device_name, const std::st return false; } -ov::Parsed ov::parseDeviceNameIntoConfig(const std::string& deviceName, - const AnyMap& config, - const bool keep_core_property) { +namespace { +ov::Parsed parse_device_config(const std::string& device_name, + const ov::CoreConfig& core_config, + const ov::AnyMap& properties, + const bool keep_auto_batch_property) { // check to the validity of device name - auto bracket_pos = deviceName.find(")"); + auto bracket_pos = device_name.find(")"); while (bracket_pos != std::string::npos) { - if (bracket_pos < deviceName.length() - 1 && - (deviceName[bracket_pos + 1] != ',' || bracket_pos + 1 == deviceName.length() - 1)) { - OPENVINO_THROW("Device with \"", deviceName, "\" name is illegal in the OpenVINO Runtime"); + if (bracket_pos < device_name.length() - 1 && + (device_name[bracket_pos + 1] != ',' || bracket_pos + 1 == device_name.length() - 1)) { + OPENVINO_THROW("Device with \"", device_name, "\" name is illegal in the OpenVINO Runtime"); } - bracket_pos = deviceName.find(")", bracket_pos + 1); + bracket_pos = device_name.find(")", bracket_pos + 1); } - auto updated_config = config; - auto updated_device_name = deviceName; - /** Note: auto-batching is already applied by this time, so the call: * core.compile_model("GPU", ov::device::properties("BATCH", ov::auto_batch_timeout(400))); * is transformed and we have here: @@ -268,17 +285,19 @@ ov::Parsed ov::parseDeviceNameIntoConfig(const std::string& deviceName, * So, if one day, we want to add more options in form of ov::allow_, we need to apply it before * 'flatten_sub_properties' call to have proper behavior */ + ov::Parsed parsed{device_name, flatten_sub_properties(device_name, properties), core_config}; + auto& updated_device_name = parsed._deviceName; + auto& updated_config = parsed._config; - updated_config = flatten_sub_properties(deviceName, updated_config); std::string parsed_device_priority; // try to find ':' to extract name of virtual device - auto pos = deviceName.find_first_of(':'); + auto pos = device_name.find_first_of(':'); if (pos != std::string::npos) { - updated_device_name = deviceName.substr(0, pos); - parsed_device_priority = deviceName.substr(pos + 1); + updated_device_name = device_name.substr(0, pos); + parsed_device_priority = device_name.substr(pos + 1); } else { - ov::DeviceIDParser parser(deviceName); + ov::DeviceIDParser parser(device_name); updated_device_name = parser.get_device_name(); parsed_device_priority = parser.get_device_id(); } @@ -295,20 +314,42 @@ ov::Parsed ov::parseDeviceNameIntoConfig(const std::string& deviceName, OPENVINO_THROW("Device priority / ID mismatch: ", parsed_device_priority, " (from ", - deviceName, + device_name, ") vs ", it->second.as(), " (from config)"); } }; + parsed._core_config.set(updated_config); // keep batch property 
only when called from query_supported_property - if (!keep_core_property) { - clean_batch_properties(updated_device_name, updated_config, ov::hint::allow_auto_batching); - clean_batch_properties(updated_device_name, updated_config, ov::auto_batch_timeout); + if (!keep_auto_batch_property) { + for (const auto& name : auto_batch_properties_names) { + clean_batch_properties(updated_device_name, updated_config, name); + } } + return parsed; +} +} // namespace - return {std::move(updated_device_name), std::move(updated_config)}; +ov::Parsed ov::parseDeviceNameIntoConfig(const std::string& deviceName, + const AnyMap& config, + const bool keep_auto_batch_property) { + return parseDeviceNameIntoConfig(deviceName, CoreConfig{}, config, keep_auto_batch_property); +} + +ov::Parsed ov::parseDeviceNameIntoConfig(const std::string& deviceName, + const CoreConfig& coreConfig, + const AnyMap& config, + const bool keep_auto_batch_property) { + auto parsed = parse_device_config(deviceName, coreConfig, config, keep_auto_batch_property); + + // remove core properties for HW devices + if (!is_virtual_device(parsed._deviceName)) { + // note: ov::cache_dir kept as plugin may require it + CoreConfig::remove_core_skip_cache_dir(parsed._config); + } + return parsed; } ov::CoreImpl::CoreImpl() { @@ -663,8 +704,7 @@ ov::Plugin ov::CoreImpl::get_plugin(const std::string& pluginName) const { { OPENVINO_SUPPRESS_DEPRECATED_START if (device_supports_cache_dir(plugin)) { - ov::AnyMap empty_map; - auto cacheConfig = coreConfig.get_cache_config_for_device(plugin, empty_map); + auto cacheConfig = coreConfig.get_cache_config_for_device(plugin); if (cacheConfig._cacheManager) { desc.defaultConfig[ov::cache_dir.name()] = cacheConfig._cacheDir; } @@ -737,13 +777,14 @@ ov::SoPtr ov::CoreImpl::compile_model(const std::shared_ptr< // if auto-batching is applicable, the below function will patch the device name and config accordingly: auto model = apply_auto_batching(model_, deviceName, config_with_batch); - auto parsed = parseDeviceNameIntoConfig(deviceName, config_with_batch, is_proxy_device(device_name)); + auto parsed = parseDeviceNameIntoConfig(deviceName, coreConfig, config_with_batch, is_proxy_device(deviceName)); auto plugin = get_plugin(parsed._deviceName); ov::SoPtr res; - auto cacheManager = coreConfig.get_cache_config_for_device(plugin, parsed._config)._cacheManager; + // will consume ov::cache_dir if plugin not support it + auto cacheManager = parsed._core_config.get_cache_config_for_device(plugin, parsed._config)._cacheManager; // Skip caching for proxy plugin. 
HW plugin will load network from the cache if (cacheManager && device_supports_model_caching(plugin) && !is_proxy_device(plugin)) { - CacheContent cacheContent{cacheManager}; + CacheContent cacheContent{cacheManager, parsed._core_config.get_enable_mmap()}; cacheContent.blobId = ov::ModelCache::compute_hash(model, create_compile_config(plugin, parsed._config)); std::unique_ptr lock = cacheGuard.get_hash_lock(cacheContent.blobId); res = load_model_from_cache(cacheContent, plugin, parsed._config, ov::SoPtr{}, [&]() { @@ -770,13 +811,14 @@ ov::SoPtr ov::CoreImpl::compile_model(const std::shared_ptr< // if auto-batching is applicable, the below function will patch the device name and config accordingly: auto model = apply_auto_batching(model_, deviceName, config_with_batch); - auto parsed = parseDeviceNameIntoConfig(deviceName, config_with_batch, is_proxy_device(deviceName)); + auto parsed = parseDeviceNameIntoConfig(deviceName, coreConfig, config_with_batch, is_proxy_device(deviceName)); auto plugin = get_plugin(parsed._deviceName); ov::SoPtr res; - auto cacheManager = coreConfig.get_cache_config_for_device(plugin, parsed._config)._cacheManager; + // will consume ov::cache_dir if plugin not support it + auto cacheManager = parsed._core_config.get_cache_config_for_device(plugin, parsed._config)._cacheManager; // Skip caching for proxy plugin. HW plugin will load network from the cache if (cacheManager && device_supports_model_caching(plugin) && !is_proxy_device(plugin)) { - CacheContent cacheContent{cacheManager}; + CacheContent cacheContent{cacheManager, parsed._core_config.get_enable_mmap()}; cacheContent.blobId = ov::ModelCache::compute_hash(model, create_compile_config(plugin, parsed._config)); std::unique_ptr lock = cacheGuard.get_hash_lock(cacheContent.blobId); res = load_model_from_cache(cacheContent, plugin, parsed._config, context, [&]() { @@ -792,21 +834,22 @@ ov::SoPtr ov::CoreImpl::compile_model(const std::string& mod const std::string& device_name, const ov::AnyMap& config) const { OV_ITT_SCOPE(FIRST_INFERENCE, ov::itt::domains::LoadTime, "Core::compile_model::Path"); - auto parsed = parseDeviceNameIntoConfig(device_name, config); + auto parsed = parse_device_config(device_name, coreConfig, config, false); // in case of compile_model(file_name), we need to clear-up core-level properties auto plugin = get_plugin(parsed._deviceName); ov::SoPtr compiled_model; - - auto cacheManager = coreConfig.get_cache_config_for_device(plugin, parsed._config)._cacheManager; + // will consume ov::cache_dir if plugin not support it + auto cacheManager = parsed._core_config.get_cache_config_for_device(plugin, parsed._config)._cacheManager; if (cacheManager && device_supports_model_caching(plugin) && !is_proxy_device(plugin)) { // Skip caching for proxy plugin. 
HW plugin will load network from the cache - CacheContent cacheContent{cacheManager, model_path}; + CoreConfig::remove_core_skip_cache_dir(parsed._config); + CacheContent cacheContent{cacheManager, parsed._core_config.get_enable_mmap(), model_path}; cacheContent.blobId = ov::ModelCache::compute_hash(model_path, create_compile_config(plugin, parsed._config)); std::unique_ptr lock = cacheGuard.get_hash_lock(cacheContent.blobId); compiled_model = load_model_from_cache(cacheContent, plugin, parsed._config, ov::SoPtr{}, [&]() { - auto model = read_model(model_path, std::string{}); + const auto model = util::read_model(model_path, "", extensions, parsed._core_config.get_enable_mmap()); return compile_model_and_cache(plugin, model, parsed._config, {}, cacheContent); }); } else { @@ -820,15 +863,14 @@ ov::SoPtr ov::CoreImpl::compile_model(const std::string& mod const std::string& device_name, const ov::AnyMap& config) const { OV_ITT_SCOPED_TASK(ov::itt::domains::OV, "Core::compile_model::from_memory"); - auto parsed = parseDeviceNameIntoConfig(device_name, config); - // in case of compile_model(file_name), we need to clear-up core-level properties + auto parsed = parseDeviceNameIntoConfig(device_name, coreConfig, config); auto plugin = get_plugin(parsed._deviceName); ov::SoPtr compiled_model; - - auto cacheManager = coreConfig.get_cache_config_for_device(plugin, parsed._config)._cacheManager; + // will consume ov::cache_dir if plugin not support it + auto cacheManager = parsed._core_config.get_cache_config_for_device(plugin, parsed._config)._cacheManager; // Skip caching for proxy plugin. HW plugin will load network from the cache if (cacheManager && device_supports_model_caching(plugin) && !is_proxy_device(plugin)) { - CacheContent cacheContent{cacheManager}; + CacheContent cacheContent{cacheManager, parsed._core_config.get_enable_mmap()}; cacheContent.blobId = ov::ModelCache::compute_hash(model_str, weights, create_compile_config(plugin, parsed._config)); std::unique_ptr lock = cacheGuard.get_hash_lock(cacheContent.blobId); @@ -948,7 +990,7 @@ ov::AnyMap ov::CoreImpl::get_supported_property(const std::string& full_device_n // ov::device::priority cannot be shared, because it's specific for current virtual // plugin. 
So, we need to remove ov::device::priorities from the list, because it's // supposed to be set for current virtual plugin and cannot be propagated down - ov::AnyMap return_properties = user_properties; + auto return_properties = user_properties; auto device_priorities_it = return_properties.find(ov::device::priorities.name()); if (device_priorities_it != return_properties.end()) { return_properties.erase(device_priorities_it); @@ -957,30 +999,24 @@ ov::AnyMap ov::CoreImpl::get_supported_property(const std::string& full_device_n return return_properties; } - static const std::vector core_level_properties = { - ov::cache_dir.name(), - ov::force_tbb_terminate.name(), - // auto-batch properties are also treated as core-level - ov::auto_batch_timeout.name(), - ov::hint::allow_auto_batching.name(), - }; - - const auto flattened = ov::parseDeviceNameIntoConfig(full_device_name, user_properties, true); - const std::string& device_name = flattened._deviceName; + const auto flattened = parse_device_config(full_device_name, {}, user_properties, keep_core_property); const auto& flattened_config = flattened._config; + const auto& device_name = flattened._deviceName; // virtual plugins should bypass core-level properties to HW plugins // so, we need to report them as supported std::vector supported_config_keys; + auto key_inserter = std::back_inserter(supported_config_keys); if (keep_core_property) { - supported_config_keys = core_level_properties; + key_inserter = std::copy(core_properties_names.begin(), core_properties_names.end(), key_inserter); + key_inserter = std::copy(auto_batch_properties_names.begin(), auto_batch_properties_names.end(), key_inserter); } // try to search against OV API 2.0' mutable supported_properties try { for (auto&& property : ICore::get_property(device_name, ov::supported_properties, {})) { if (property.is_mutable()) { - supported_config_keys.emplace_back(std::move(property)); + *key_inserter = std::move(property); } } } catch (ov::Exception&) { @@ -990,7 +1026,7 @@ ov::AnyMap ov::CoreImpl::get_supported_property(const std::string& full_device_n try { for (auto&& property : ICore::get_property(device_name, ov::internal::supported_properties, {})) { if (property.is_mutable()) { - supported_config_keys.emplace_back(std::move(property)); + *key_inserter = std::move(property); } } } catch (ov::Exception&) { @@ -1160,8 +1196,7 @@ ov::Any ov::CoreImpl::get_property(const std::string& device_name, if (parsed._deviceName.empty()) { return get_property_for_core(name); } else if (name == ov::cache_dir.name()) { - ov::AnyMap empty_map; - return coreConfig.get_cache_config_for_device(get_plugin(parsed._deviceName), empty_map)._cacheDir; + return coreConfig.get_cache_config_for_device(get_plugin(parsed._deviceName))._cacheDir; } return get_plugin(parsed._deviceName).get_property(name, parsed._config); } @@ -1299,9 +1334,7 @@ void ov::CoreImpl::set_property_for_device(const ov::AnyMap& configMap, const st { OPENVINO_SUPPRESS_DEPRECATED_START if (device_supports_cache_dir(plugin.second)) { - ov::AnyMap empty_map = {}; - configCopy[ov::cache_dir.name()] = - coreConfig.get_cache_config_for_device(plugin.second, empty_map)._cacheDir; + configCopy[ov::cache_dir.name()] = coreConfig.get_cache_config_for_device(plugin.second)._cacheDir; } else if (configCopy.count(ov::cache_dir.name()) > 0) { // Remove "CACHE_DIR" from config if it is not supported by plugin configCopy.erase(ov::cache_dir.name()); @@ -1411,9 +1444,9 @@ ov::SoPtr ov::CoreImpl::load_model_from_cache( try { 
cacheContent.cacheManager->read_cache_entry( cacheContent.blobId, - coreConfig.get_enable_mmap() && ov::util::contains(plugin.get_property(ov::internal::supported_properties), - ov::internal::caching_with_mmap), - [&](std::istream& networkStream) { + cacheContent.mmap_enabled && ov::util::contains(plugin.get_property(ov::internal::supported_properties), + ov::internal::caching_with_mmap), + [&](std::istream& networkStream, std::shared_ptr model_buffer) { OV_ITT_SCOPE(FIRST_INFERENCE, ov::itt::domains::LoadTime, "Core::load_model_from_cache::ReadStreamAndImport"); @@ -1459,6 +1492,9 @@ ov::SoPtr ov::CoreImpl::load_model_from_cache( update_config[ov::weights_path.name()] = weights_path; } } + if (model_buffer) { + update_config[ov::internal::cached_model_buffer.name()] = model_buffer; + } compiled_model = context ? plugin.import_model(networkStream, context, update_config) : plugin.import_model(networkStream, update_config); }); @@ -1513,7 +1549,16 @@ ov::AnyMap ov::CoreImpl::create_compile_config(const ov::Plugin& plugin, const o return compile_config; } -void ov::CoreImpl::CoreConfig::set_and_update(ov::AnyMap& config) { +ov::CoreConfig::CoreConfig(const CoreConfig& other) { + { + std::lock_guard lock(other._cacheConfigMutex); + _cacheConfig = other._cacheConfig; + _cacheConfigPerDevice = other._cacheConfigPerDevice; + } + _flag_enable_mmap = other._flag_enable_mmap; +} + +void ov::CoreConfig::set(const ov::AnyMap& config) { auto it = config.find(ov::cache_dir.name()); if (it != config.end()) { std::lock_guard lock(_cacheConfigMutex); @@ -1523,43 +1568,56 @@ void ov::CoreImpl::CoreConfig::set_and_update(ov::AnyMap& config) { for (auto& deviceCfg : _cacheConfigPerDevice) { deviceCfg.second = CoreConfig::CacheConfig::create(it->second.as()); } - config.erase(it); } it = config.find(ov::force_tbb_terminate.name()); if (it != config.end()) { auto flag = it->second.as(); ov::threading::executor_manager()->set_property({{it->first, flag}}); - config.erase(it); } it = config.find(ov::enable_mmap.name()); if (it != config.end()) { auto flag = it->second.as(); _flag_enable_mmap = flag; - config.erase(it); } } -void ov::CoreImpl::CoreConfig::set_cache_dir_for_device(const std::string& dir, const std::string& name) { +void ov::CoreConfig::set_and_update(ov::AnyMap& config) { + set(config); + remove_core(config); +} + +void ov::CoreConfig::remove_core(ov::AnyMap& config) { + for (const auto& name : core_properties_names) { + config.erase(name); + } +} + +void ov::CoreConfig::remove_core_skip_cache_dir(ov::AnyMap& config) { + for (const auto& name : {ov::enable_mmap.name(), ov::force_tbb_terminate.name()}) { + config.erase(name); + } +} + +void ov::CoreConfig::set_cache_dir_for_device(const std::string& dir, const std::string& name) { std::lock_guard lock(_cacheConfigMutex); _cacheConfigPerDevice[name] = CoreConfig::CacheConfig::create(dir); } -std::string ov::CoreImpl::CoreConfig::get_cache_dir() const { +std::string ov::CoreConfig::get_cache_dir() const { std::lock_guard lock(_cacheConfigMutex); return _cacheConfig._cacheDir; } -bool ov::CoreImpl::CoreConfig::get_enable_mmap() const { +bool ov::CoreConfig::get_enable_mmap() const { return _flag_enable_mmap; } // Creating thread-safe copy of config including shared_ptr to ICacheManager // Passing empty or not-existing name will return global cache config -ov::CoreImpl::CoreConfig::CacheConfig ov::CoreImpl::CoreConfig::get_cache_config_for_device( - const ov::Plugin& plugin, - ov::AnyMap& parsedConfig) const { +ov::CoreConfig::CacheConfig 
ov::CoreConfig::get_cache_config_for_device(const ov::Plugin& plugin, + ov::AnyMap& parsedConfig) const { // cache_dir is enabled locally in compile_model only if (parsedConfig.count(ov::cache_dir.name())) { const auto& cache_dir_val = parsedConfig.at(ov::cache_dir.name()).as(); @@ -1572,16 +1630,16 @@ ov::CoreImpl::CoreConfig::CacheConfig ov::CoreImpl::CoreConfig::get_cache_config } return tempConfig; } else { // cache_dir is set to Core globally or for the specific device - std::lock_guard lock(_cacheConfigMutex); - if (_cacheConfigPerDevice.count(plugin.get_name()) > 0) { - return _cacheConfigPerDevice.at(plugin.get_name()); - } else { - return _cacheConfig; - } + return get_cache_config_for_device(plugin); } } -ov::CoreImpl::CoreConfig::CacheConfig ov::CoreImpl::CoreConfig::CacheConfig::create(const std::string& dir) { +ov::CoreConfig::CacheConfig ov::CoreConfig::get_cache_config_for_device(const ov::Plugin& plugin) const { + std::lock_guard lock(_cacheConfigMutex); + return _cacheConfigPerDevice.count(plugin.get_name()) ? _cacheConfigPerDevice.at(plugin.get_name()) : _cacheConfig; +} + +ov::CoreConfig::CacheConfig ov::CoreConfig::CacheConfig::create(const std::string& dir) { std::shared_ptr cache_manager = nullptr; if (!dir.empty()) { @@ -1610,9 +1668,13 @@ void ov::CoreImpl::add_mutex(const std::string& dev_name) { dev_mutexes[dev_name]; } -std::shared_ptr ov::CoreImpl::read_model(const std::string& modelPath, const std::string& binPath) const { +std::shared_ptr ov::CoreImpl::read_model(const std::string& modelPath, + const std::string& binPath, + const AnyMap& properties) const { OV_ITT_SCOPE(FIRST_INFERENCE, ov::itt::domains::ReadTime, "CoreImpl::read_model from file"); - return ov::util::read_model(modelPath, binPath, extensions, coreConfig.get_enable_mmap()); + auto local_core_config = coreConfig; + local_core_config.set(properties); + return ov::util::read_model(modelPath, binPath, extensions, local_core_config.get_enable_mmap()); } std::shared_ptr ov::CoreImpl::read_model(const std::string& model, diff --git a/src/inference/src/dev/core_impl.hpp b/src/inference/src/dev/core_impl.hpp index 7cf12f3ba3280c..85417175c22556 100644 --- a/src/inference/src/dev/core_impl.hpp +++ b/src/inference/src/dev/core_impl.hpp @@ -22,14 +22,95 @@ using CreatePluginEngineFunc = void(std::shared_ptr<::ov::IPlugin>&); const std::string DEFAULT_DEVICE_NAME = "DEFAULT_DEVICE"; +class CoreConfig final { +public: + CoreConfig() = default; + CoreConfig(const CoreConfig& other); + CoreConfig& operator=(const CoreConfig&) = delete; + + struct CacheConfig { + std::string _cacheDir; + std::shared_ptr _cacheManager; + + static CacheConfig create(const std::string& dir); + }; + + void set(const ov::AnyMap& config); + + /** + * @brief Removes core-level properties from config and triggers new state for core config + * @param config - config to be updated + */ + void set_and_update(ov::AnyMap& config); + + OPENVINO_DEPRECATED("Don't use this method, it will be removed soon") + void set_cache_dir_for_device(const std::string& dir, const std::string& name); + + std::string get_cache_dir() const; + + bool get_enable_mmap() const; + + CacheConfig get_cache_config_for_device(const ov::Plugin& plugin, ov::AnyMap& parsedConfig) const; + + // Creating thread-safe copy of global config including shared_ptr to ICacheManager + CacheConfig get_cache_config_for_device(const ov::Plugin& plugin) const; + + // remove core properties + static void remove_core(ov::AnyMap& config); + static void 
remove_core_skip_cache_dir(ov::AnyMap& config); + +private: + mutable std::mutex _cacheConfigMutex; + CacheConfig _cacheConfig; + std::map _cacheConfigPerDevice; + bool _flag_enable_mmap = true; +}; + struct Parsed { std::string _deviceName; AnyMap _config; + CoreConfig _core_config; }; +/** + * @brief Provides Parsed device name and configuration. + * + * Uses default core configuration updated with user properties from config. + * The core properties are removed from user configuration for HW devices only. + * @note The `CACHE_DIR` is not removed from compiled configuration. + * + * @param deviceName Device name to be parsed + * @param config User configuration to be parsed. + * @param keep_auto_batch_property If set keep auto batch properties in compile properties. + * @return Parsed: + * - device name + * - compile properties + * - core configuration + */ +Parsed parseDeviceNameIntoConfig(const std::string& deviceName, + const AnyMap& config = {}, + const bool keep_auto_batch_property = false); + +/** + * @brief Provides Parsed device name and configuration. + * + * Uses user core configuration which is updated with user properties from config. + * The core properties are removed from user configuration for HW devices only. + * @note The `CACHE_DIR` is not removed from compiled configuration. + * + * @param deviceName Device name to be parsed + * @param coreConfig Core configuration used as base for parsed output. + * @param config User configuration to be parsed. + * @param keep_auto_batch_property If set keep auto batch properties in compile properties. + * @return Parsed: + * - device name + * - compile properties + * - core configuration + */ Parsed parseDeviceNameIntoConfig(const std::string& deviceName, + const CoreConfig& coreConfig, const AnyMap& config = {}, - const bool keep_core_property = false); + const bool keep_auto_batch_property = false); /** * @brief Checks whether config is applicable for device with 'device_name' @@ -61,47 +142,17 @@ class CoreImpl : public ov::ICore, public std::enable_shared_from_this _cacheManager; - - static CacheConfig create(const std::string& dir); - }; - - /** - * @brief Removes core-level properties from config and triggers new state for core config - * @param config - config to be updated - */ - void set_and_update(ov::AnyMap& config); - - OPENVINO_DEPRECATED("Don't use this method, it will be removed soon") - void set_cache_dir_for_device(const std::string& dir, const std::string& name); - - std::string get_cache_dir() const; - - bool get_enable_mmap() const; - - // Creating thread-safe copy of config including shared_ptr to ICacheManager - // Passing empty or not-existing name will return global cache config - CacheConfig get_cache_config_for_device(const ov::Plugin& plugin, ov::AnyMap& parsedConfig) const; - - private: - mutable std::mutex _cacheConfigMutex; - CacheConfig _cacheConfig; - std::map _cacheConfigPerDevice; - bool _flag_enable_mmap = true; - }; - struct CacheContent { explicit CacheContent(const std::shared_ptr& cache_manager, + bool mmap_enabled = false, const std::string model_path = {}) : cacheManager(cache_manager), - modelPath(model_path) {} + modelPath(model_path), + mmap_enabled{mmap_enabled} {} std::shared_ptr cacheManager; std::string blobId = {}; std::string modelPath = {}; + bool mmap_enabled = false; }; // Core settings (cache config, etc) @@ -256,7 +307,9 @@ class CoreImpl : public ov::ICore, public std::enable_shared_from_this read_model(const std::shared_ptr& model, const std::shared_ptr& weights) const 
override; - std::shared_ptr read_model(const std::string& model_path, const std::string& bin_path) const override; + std::shared_ptr read_model(const std::string& model_path, + const std::string& bin_path, + const AnyMap& properties) const override; ov::SoPtr compile_model(const std::shared_ptr& model, const std::string& device_name, @@ -291,7 +344,9 @@ class CoreImpl : public ov::ICore, public std::enable_shared_from_this create_context(const std::string& device_name, const AnyMap& args) const override; - ov::AnyMap get_supported_property(const std::string& device_name, const ov::AnyMap& config, const bool keep_core_property = true) const override; + ov::AnyMap get_supported_property(const std::string& device_name, + const ov::AnyMap& config, + const bool keep_core_property = true) const override; ov::SoPtr get_default_context(const std::string& device_name) const override; diff --git a/src/inference/src/dev/icompiled_model.cpp b/src/inference/src/dev/icompiled_model.cpp index b1cbedac1632ab..3f4a8d397ab4d9 100644 --- a/src/inference/src/dev/icompiled_model.cpp +++ b/src/inference/src/dev/icompiled_model.cpp @@ -10,6 +10,10 @@ #include "openvino/runtime/properties.hpp" #include "transformations/utils/utils.hpp" +#if defined(OPENVINO_GNU_LIBC) && !defined(__ANDROID__) +# include <malloc.h> +#endif + ov::ICompiledModel::ICompiledModel(const std::shared_ptr& model, const std::shared_ptr& plugin, const std::shared_ptr& task_executor, @@ -47,8 +51,11 @@ ov::ICompiledModel::ICompiledModel(const std::shared_ptr& model } } } - - std::unordered_map, std::shared_ptr> tensor_map; + std::unordered_map, + std::shared_ptr, + descriptor::TensorExtension::Hasher, + descriptor::TensorExtension::Equal> + tensor_map; for (const auto& param : model->get_parameters()) { const auto& param_name = param->get_friendly_name(); auto new_param = ov::as_type_ptr(param->copy_with_new_inputs({})); @@ -151,3 +158,12 @@ void ov::ICompiledModel::set_model_shared_object(ov::Model& model, const std::sh void ov::ICompiledModel::release_memory() { // nothing to do } + +ov::ICompiledModel::~ICompiledModel() { +#if defined(OPENVINO_GNU_LIBC) && !defined(__ANDROID__) + // Linux memory management doesn't return system memory immediately after release. + // It depends on memory chunk size and allocation history. + // Try to return memory from the process to the system now, to reduce memory usage instead of waiting until the end of the process.
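+    // Editorial note: malloc_trim(0) is a glibc-specific function declared in <malloc.h>, which is why the
+    // guard above skips this code for Android and non-GNU-libc builds. A minimal, illustrative sketch of the
+    // observable effect (names `core`, `model`, `compiled` are placeholders, not part of this change):
+    //     auto compiled = core.compile_model(model, "CPU");
+    //     compiled = {};  // this destructor runs; malloc_trim(0) asks glibc to hand freed heap pages back
+    //                     // to the OS, so the process RSS can drop right away rather than at process exit.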
+ malloc_trim(0); +#endif +} diff --git a/src/inference/src/dev/iplugin.cpp b/src/inference/src/dev/iplugin.cpp index 1049e39bee6f49..f8c49825ba435a 100644 --- a/src/inference/src/dev/iplugin.cpp +++ b/src/inference/src/dev/iplugin.cpp @@ -4,6 +4,7 @@ #include "openvino/runtime/iplugin.hpp" +#include "core_impl.hpp" #include "openvino/op/convert.hpp" #include "openvino/op/util/op_types.hpp" #include "openvino/op/util/shape_of_base.hpp" @@ -75,8 +76,10 @@ std::shared_ptr ov::IPlugin::compile_model(const std::string const ov::AnyMap& properties) const { auto core = get_core(); OPENVINO_ASSERT(core); - auto model = core->read_model(model_path, std::string()); - return compile_model(model, properties); + const auto model = core->read_model(model_path, {}, properties); + auto local_properties = properties; + CoreConfig::remove_core_skip_cache_dir(local_properties); + return compile_model(model, local_properties); } std::unordered_set ov::get_supported_nodes( diff --git a/src/inference/src/model_reader.cpp b/src/inference/src/model_reader.cpp index aaf620ea0f803a..7babef019b5802 100644 --- a/src/inference/src/model_reader.cpp +++ b/src/inference/src/model_reader.cpp @@ -86,7 +86,7 @@ void update_v10_model(std::shared_ptr& model, bool frontendMode = fal "Model operation names have collisions with tensor names.", " Please use MO to generate new IR version, it should allow to avoid the issue"); leaf_names.emplace(res_name, nullptr); - result->output(0).get_tensor().add_names({std::move(res_name)}); + result->input(0).get_tensor().add_names({std::move(res_name)}); } for (const auto& param : model->get_parameters()) { const auto& param_name = param->get_friendly_name(); diff --git a/src/inference/src/os/lin/lin_system_conf.cpp b/src/inference/src/os/lin/lin_system_conf.cpp index f8bd16173b8fce..48d486d2ed2d1b 100644 --- a/src/inference/src/os/lin/lin_system_conf.cpp +++ b/src/inference/src/os/lin/lin_system_conf.cpp @@ -23,76 +23,108 @@ CPU::CPU() { std::vector> system_info_table; std::vector node_info_table; - auto get_cache_info_linux = [&]() { + constexpr int cache_info_mode = 1; + constexpr int freq_info_mode = 2; + + auto get_info_linux = [&](int mode) { int cpu_index = 0; int cache_index = 0; int cache_files = 3; - std::vector one_info(cache_files); + std::string one_info; - while (1) { - for (int n = 0; n < cache_files; n++) { - cache_index = (n == 0) ? 
n : n + 1; - - std::ifstream cache_file("/sys/devices/system/cpu/cpu" + std::to_string(cpu_index) + "/cache/index" + - std::to_string(cache_index) + "/shared_cpu_list"); - if (!cache_file.is_open()) { - cache_index = -1; - break; - } - std::string cache_info; - std::getline(cache_file, cache_info); - one_info[n] = std::move(cache_info); - } + std::vector file_name = {"/topology/core_cpus_list", + "/topology/physical_package_id", + "/cpufreq/cpuinfo_max_freq"}; + int num_of_files = file_name.size(); - if (cache_index == -1) { - if (cpu_index == 0) { - return -1; - } else { - return 0; - } - } else { - system_info_table.push_back(one_info); - cpu_index++; - } + std::string::size_type pos = 0; + std::string::size_type endpos = 0; + std::string sub_str; + + int core_1; + int core_2; + + system_info_table.clear(); + + std::ifstream possible_file("/sys/devices/system/cpu/possible"); + std::string possible_info; + + if (possible_file.is_open()) { + std::getline(possible_file, possible_info); + } else { + return -1; } - return 0; - }; + if ((endpos = possible_info.find('-', pos)) != std::string::npos) { + sub_str = possible_info.substr(pos, endpos - pos); + core_1 = std::stoi(sub_str); + sub_str = possible_info.substr(endpos + 1); + core_2 = std::stoi(sub_str); + system_info_table.resize(core_2 + 1, std::vector(cache_files, "")); + } else { + return -1; + } - auto get_freq_info_linux = [&]() { - int cpu_index = 0; - int cache_index = 0; + std::ifstream online_file("/sys/devices/system/cpu/online"); + std::string online_info; - std::vector file_name = {"/topology/core_cpus_list", - "/topology/physical_package_id", - "/cpufreq/cpuinfo_max_freq"}; - int num_of_files = file_name.size(); - std::vector one_info(num_of_files); + if (online_file.is_open()) { + std::getline(online_file, online_info); + } else { + system_info_table.clear(); + return -1; + } while (1) { - for (int n = 0; n < num_of_files; n++) { - cache_index = n; + if ((endpos = online_info.find('-', pos)) != std::string::npos) { + sub_str = online_info.substr(pos, endpos - pos); + core_1 = std::stoi(sub_str); + sub_str = online_info.substr(endpos + 1); + core_2 = std::stoi(sub_str); - std::ifstream cache_file("/sys/devices/system/cpu/cpu" + std::to_string(cpu_index) + file_name[n]); - if (!cache_file.is_open()) { - cache_index = -1; - break; + for (cpu_index = core_1; cpu_index <= core_2; cpu_index++) { + if (mode == cache_info_mode) { + for (int n = 0; n < cache_files; n++) { + cache_index = (n == 0) ? 
n : n + 1; + one_info.clear(); + + std::ifstream cache_file("/sys/devices/system/cpu/cpu" + std::to_string(cpu_index) + + "/cache/index" + std::to_string(cache_index) + "/shared_cpu_list"); + if (cache_file.is_open()) { + std::getline(cache_file, one_info); + } else { + if ((cpu_index == core_1) && (n == 0)) { + system_info_table.clear(); + return -1; + } + } + system_info_table[cpu_index][n] = std::move(one_info); + } + } else { + for (int n = 0; n < num_of_files; n++) { + one_info.clear(); + + std::ifstream cache_file("/sys/devices/system/cpu/cpu" + std::to_string(cpu_index) + + file_name[n]); + if (cache_file.is_open()) { + std::getline(cache_file, one_info); + } else { + if ((cpu_index == core_1) && (n == 2)) { + system_info_table.clear(); + return -1; + } + } + system_info_table[cpu_index][n] = std::move(one_info); + } + } } - std::string cache_info; - std::getline(cache_file, cache_info); - one_info[n] = std::move(cache_info); } - if (cache_index == -1) { - if (cpu_index == 0) { - return -1; - } else { - return 0; - } + if ((pos = online_info.find(',', endpos)) != std::string::npos) { + pos++; } else { - system_info_table.push_back(one_info); - cpu_index++; + break; } } @@ -201,7 +233,7 @@ CPU::CPU() { get_node_info_linux(); - if (!get_cache_info_linux()) { + if (!get_info_linux(cache_info_mode)) { parse_cache_info_linux(system_info_table, node_info_table, _processors, @@ -215,7 +247,7 @@ CPU::CPU() { if ((_proc_type_table.size() == 0) || ((_proc_type_table[0][MAIN_CORE_PROC] == 0) && (_proc_type_table[0][ALL_PROC] > 0) && (_proc_type_table[0][ALL_PROC] != _proc_type_table[0][EFFICIENT_CORE_PROC]))) { - if (!get_freq_info_linux()) { + if (!get_info_linux(freq_info_mode)) { parse_freq_info_linux(system_info_table, node_info_table, _processors, @@ -471,56 +503,73 @@ void parse_cache_info_linux(const std::vector> system_i const std::vector line_value_0({0, 0, 0, 0, -1, -1}); - for (int n = 0; n < _processors; n++) { - if (-1 == _cpu_mapping_table[n][CPU_MAP_SOCKET_ID]) { - std::string::size_type pos = 0; - std::string::size_type endpos = 0; - std::string sub_str; - - int core_1; - int core_2; + std::vector offline_list; + int info_index = 0; - if (0 == _sockets) { - _proc_type_table.push_back(line_value_0); - } else { - _proc_type_table.push_back(_proc_type_table[0]); - _proc_type_table[0] = line_value_0; - } - - while (1) { - if ((endpos = system_info_table[n][2].find('-', pos)) != std::string::npos) { - sub_str = system_info_table[n][2].substr(pos, endpos - pos); - core_1 = std::stoi(sub_str); - sub_str = system_info_table[n][2].substr(endpos + 1); - core_2 = std::stoi(sub_str); + for (int n = 0; n < _processors; n++) { + if ((system_info_table[n][2].size() > 0) || (system_info_table[n][1].size() > 0)) { + info_index = system_info_table[n][2].size() > 0 ? 
2 : 1; + if (-1 == _cpu_mapping_table[n][CPU_MAP_SOCKET_ID]) { + std::string::size_type pos = 0; + std::string::size_type endpos = 0; + std::string sub_str; + + int core_1; + int core_2; + + if (0 == _sockets) { + _proc_type_table.push_back(line_value_0); + } else { + _proc_type_table.push_back(_proc_type_table[0]); + _proc_type_table[0] = line_value_0; + } - for (int m = core_1; m <= core_2; m++) { - _cpu_mapping_table[m][CPU_MAP_SOCKET_ID] = _sockets; - _cpu_mapping_table[m][CPU_MAP_NUMA_NODE_ID] = _cpu_mapping_table[m][CPU_MAP_SOCKET_ID]; - update_proc_map_info(m); + while (1) { + if ((endpos = system_info_table[n][info_index].find('-', pos)) != std::string::npos) { + sub_str = system_info_table[n][info_index].substr(pos, endpos - pos); + core_1 = std::stoi(sub_str); + sub_str = system_info_table[n][info_index].substr(endpos + 1); + core_2 = std::stoi(sub_str); + + if ((info_index == 1) && (core_2 - core_1 == 1)) { + offline_list.push_back(n); + break; + } + for (int m = core_1; m <= core_2; m++) { + _cpu_mapping_table[m][CPU_MAP_SOCKET_ID] = _sockets; + _cpu_mapping_table[m][CPU_MAP_NUMA_NODE_ID] = _cpu_mapping_table[m][CPU_MAP_SOCKET_ID]; + update_proc_map_info(m); + if (_processors == 0) { + return; + }; + } + } else if (pos != std::string::npos) { + sub_str = system_info_table[n][info_index].substr(pos); + core_1 = std::stoi(sub_str); + _cpu_mapping_table[core_1][CPU_MAP_SOCKET_ID] = _sockets; + _cpu_mapping_table[core_1][CPU_MAP_NUMA_NODE_ID] = + _cpu_mapping_table[core_1][CPU_MAP_SOCKET_ID]; + update_proc_map_info(core_1); if (_processors == 0) { return; }; + endpos = pos; } - } else if (pos != std::string::npos) { - sub_str = system_info_table[n][2].substr(pos); - core_1 = std::stoi(sub_str); - _cpu_mapping_table[core_1][CPU_MAP_SOCKET_ID] = _sockets; - _cpu_mapping_table[core_1][CPU_MAP_NUMA_NODE_ID] = _cpu_mapping_table[core_1][CPU_MAP_SOCKET_ID]; - update_proc_map_info(core_1); - if (_processors == 0) { - return; - }; - endpos = pos; - } - if ((pos = system_info_table[n][2].find(',', endpos)) != std::string::npos) { - pos++; - } else { - break; + if ((pos = system_info_table[n][2].find(',', endpos)) != std::string::npos) { + pos++; + } else { + break; + } + } + _sockets++; + if (_proc_type_table[0][ALL_PROC] == 0) { + _proc_type_table.erase(_proc_type_table.begin()); + _sockets--; } } - _sockets++; + } else { + offline_list.push_back(n); } } @@ -540,6 +589,11 @@ void parse_cache_info_linux(const std::vector> system_i _numa_nodes = node_info_table.size(); parse_node_info_linux(node_info_table, _numa_nodes, _sockets, _proc_type_table, _cpu_mapping_table); } + + for (size_t n = 0; n < offline_list.size(); n++) { + _cpu_mapping_table.erase(_cpu_mapping_table.begin() + offline_list[n] - n); + _processors--; + } }; void get_cpu_mapping_from_cores(const int _processors, @@ -615,7 +669,6 @@ void parse_freq_info_linux(const std::vector> system_in std::vector>& _cpu_mapping_table) { int freq_max = 0; bool ecore_enabled = false; - bool ht_enabled = false; _processors = system_info_table.size(); _numa_nodes = 0; @@ -625,6 +678,8 @@ void parse_freq_info_linux(const std::vector> system_in std::vector line_value_0(PROC_TYPE_TABLE_SIZE, 0); + std::vector offline_list; + auto clean_up_output = [&]() { _processors = 0; _cores = 0; @@ -636,65 +691,68 @@ void parse_freq_info_linux(const std::vector> system_in }; for (int n = 0; n < _processors; n++) { - if (-1 == _cpu_mapping_table[n][CPU_MAP_SOCKET_ID]) { - std::string::size_type pos = 0; - std::string::size_type endpos1 = 0; - 
std::string::size_type endpos2 = 0; - std::string sub_str; - - int core_1 = 0; - int core_2 = 0; - - if (((endpos1 = system_info_table[n][0].find(',', pos)) != std::string::npos) || - ((endpos2 = system_info_table[n][0].find('-', pos)) != std::string::npos)) { - endpos1 = (endpos1 != std::string::npos) ? endpos1 : endpos2; - sub_str = system_info_table[n][0].substr(pos, endpos1 - pos); - core_1 = std::stoi(sub_str); - sub_str = system_info_table[n][0].substr(endpos1 + 1); - core_2 = std::stoi(sub_str); - if ((core_1 != n) && (core_2 != n)) { - clean_up_output(); - return; - } - - _cpu_mapping_table[core_1][CPU_MAP_PROCESSOR_ID] = core_1; - _cpu_mapping_table[core_1][CPU_MAP_SOCKET_ID] = std::stoi(system_info_table[core_1][1]); - _cpu_mapping_table[core_1][CPU_MAP_NUMA_NODE_ID] = _cpu_mapping_table[core_1][CPU_MAP_SOCKET_ID]; - _cpu_mapping_table[core_1][CPU_MAP_CORE_ID] = _cores; - _cpu_mapping_table[core_1][CPU_MAP_CORE_TYPE] = HYPER_THREADING_PROC; - _cpu_mapping_table[core_1][CPU_MAP_GROUP_ID] = _cores; + if (system_info_table[n][2].size() > 0) { + if (-1 == _cpu_mapping_table[n][CPU_MAP_SOCKET_ID]) { + std::string::size_type pos = 0; + std::string::size_type endpos1 = 0; + std::string::size_type endpos2 = 0; + std::string sub_str; + + int core_1 = 0; + int core_2 = 0; + + if (((endpos1 = system_info_table[n][0].find(',', pos)) != std::string::npos) || + ((endpos2 = system_info_table[n][0].find('-', pos)) != std::string::npos)) { + endpos1 = (endpos1 != std::string::npos) ? endpos1 : endpos2; + sub_str = system_info_table[n][0].substr(pos, endpos1 - pos); + core_1 = std::stoi(sub_str); + sub_str = system_info_table[n][0].substr(endpos1 + 1); + core_2 = std::stoi(sub_str); + if ((core_1 != n) && (core_2 != n)) { + clean_up_output(); + return; + } - _cpu_mapping_table[core_2][CPU_MAP_PROCESSOR_ID] = core_2; - _cpu_mapping_table[core_2][CPU_MAP_SOCKET_ID] = _cpu_mapping_table[core_1][CPU_MAP_SOCKET_ID]; - _cpu_mapping_table[core_2][CPU_MAP_NUMA_NODE_ID] = _cpu_mapping_table[core_1][CPU_MAP_SOCKET_ID]; - _cpu_mapping_table[core_2][CPU_MAP_CORE_ID] = _cpu_mapping_table[core_1][CPU_MAP_CORE_ID]; - _cpu_mapping_table[core_2][CPU_MAP_CORE_TYPE] = MAIN_CORE_PROC; - _cpu_mapping_table[core_2][CPU_MAP_GROUP_ID] = _cpu_mapping_table[core_1][CPU_MAP_GROUP_ID]; + _cpu_mapping_table[core_1][CPU_MAP_PROCESSOR_ID] = core_1; + _cpu_mapping_table[core_1][CPU_MAP_SOCKET_ID] = std::stoi(system_info_table[core_1][1]); + _cpu_mapping_table[core_1][CPU_MAP_NUMA_NODE_ID] = _cpu_mapping_table[core_1][CPU_MAP_SOCKET_ID]; + _cpu_mapping_table[core_1][CPU_MAP_CORE_ID] = _cores; + _cpu_mapping_table[core_1][CPU_MAP_CORE_TYPE] = HYPER_THREADING_PROC; + _cpu_mapping_table[core_1][CPU_MAP_GROUP_ID] = _cores; + + _cpu_mapping_table[core_2][CPU_MAP_PROCESSOR_ID] = core_2; + _cpu_mapping_table[core_2][CPU_MAP_SOCKET_ID] = _cpu_mapping_table[core_1][CPU_MAP_SOCKET_ID]; + _cpu_mapping_table[core_2][CPU_MAP_NUMA_NODE_ID] = _cpu_mapping_table[core_1][CPU_MAP_SOCKET_ID]; + _cpu_mapping_table[core_2][CPU_MAP_CORE_ID] = _cpu_mapping_table[core_1][CPU_MAP_CORE_ID]; + _cpu_mapping_table[core_2][CPU_MAP_CORE_TYPE] = MAIN_CORE_PROC; + _cpu_mapping_table[core_2][CPU_MAP_GROUP_ID] = _cpu_mapping_table[core_1][CPU_MAP_GROUP_ID]; + + int core_freq = std::stoi(system_info_table[core_1][2]); + freq_max = std::max(core_freq, freq_max); + } else if (system_info_table[n][0].size() > 0) { + core_1 = std::stoi(system_info_table[n][0]); - ht_enabled = true; - int core_freq = std::stoi(system_info_table[core_1][2]); - freq_max = 
std::max(core_freq, freq_max); - } else if (system_info_table[n][0].size() > 0) { - core_1 = std::stoi(system_info_table[n][0]); + _cpu_mapping_table[core_1][CPU_MAP_PROCESSOR_ID] = core_1; + _cpu_mapping_table[core_1][CPU_MAP_SOCKET_ID] = std::stoi(system_info_table[core_1][1]); + _cpu_mapping_table[core_1][CPU_MAP_NUMA_NODE_ID] = _cpu_mapping_table[core_1][CPU_MAP_SOCKET_ID]; + _cpu_mapping_table[core_1][CPU_MAP_CORE_ID] = _cores; - _cpu_mapping_table[core_1][CPU_MAP_PROCESSOR_ID] = core_1; - _cpu_mapping_table[core_1][CPU_MAP_SOCKET_ID] = std::stoi(system_info_table[core_1][1]); - _cpu_mapping_table[core_1][CPU_MAP_NUMA_NODE_ID] = _cpu_mapping_table[core_1][CPU_MAP_SOCKET_ID]; - _cpu_mapping_table[core_1][CPU_MAP_CORE_ID] = _cores; + int core_freq = std::stoi(system_info_table[core_1][2]); + if ((0 == freq_max) || (core_freq >= freq_max * 0.97)) { + freq_max = std::max(core_freq, freq_max); + _cpu_mapping_table[core_1][CPU_MAP_CORE_TYPE] = MAIN_CORE_PROC; + } else { + _cpu_mapping_table[core_1][CPU_MAP_CORE_TYPE] = EFFICIENT_CORE_PROC; + ecore_enabled = true; + } - int core_freq = std::stoi(system_info_table[core_1][2]); - if (((0 == freq_max) || (core_freq >= freq_max * 0.95)) && (!ht_enabled)) { - freq_max = std::max(core_freq, freq_max); - _cpu_mapping_table[core_1][CPU_MAP_CORE_TYPE] = MAIN_CORE_PROC; - } else { - _cpu_mapping_table[core_1][CPU_MAP_CORE_TYPE] = EFFICIENT_CORE_PROC; - ecore_enabled = true; + _cpu_mapping_table[core_1][CPU_MAP_GROUP_ID] = _cores; } - - _cpu_mapping_table[core_1][CPU_MAP_GROUP_ID] = _cores; + _sockets = std::max(_sockets, _cpu_mapping_table[core_1][CPU_MAP_SOCKET_ID]); + _cores++; } - _sockets = std::max(_sockets, _cpu_mapping_table[core_1][CPU_MAP_SOCKET_ID]); - _cores++; + } else { + offline_list.push_back(n); } } @@ -733,6 +791,11 @@ void parse_freq_info_linux(const std::vector> system_in _numa_nodes = node_info_table.size(); parse_node_info_linux(node_info_table, _numa_nodes, _sockets, _proc_type_table, _cpu_mapping_table); } + + for (size_t n = 0; n < offline_list.size(); n++) { + _cpu_mapping_table.erase(_cpu_mapping_table.begin() + offline_list[n] - n); + _processors--; + } }; void update_valid_processor_linux(const std::vector phy_core_list, diff --git a/src/inference/src/os/win/win_system_conf.cpp b/src/inference/src/os/win/win_system_conf.cpp index a4129a80b599ba..97831207df73d5 100644 --- a/src/inference/src/os/win/win_system_conf.cpp +++ b/src/inference/src/os/win/win_system_conf.cpp @@ -92,6 +92,10 @@ void parse_processor_info_win(const char* base_ptr, _cores = 0; _blocked_cores = 0; + constexpr int initial_core_type = -1; + constexpr int group_with_2_cores = 2; + constexpr int group_with_4_cores = 4; + PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX info = NULL; auto MaskToList = [&](const KAFFINITY mask_input) { @@ -131,7 +135,7 @@ void parse_processor_info_win(const char* base_ptr, base_proc = _processors; } - if (2 == list_len) { + if (group_with_2_cores == list_len) { proc_info = cpu_init_line; proc_info[CPU_MAP_PROCESSOR_ID] = list[0] + base_proc; proc_info[CPU_MAP_NUMA_NODE_ID] = _sockets; @@ -162,7 +166,11 @@ void parse_processor_info_win(const char* base_ptr, proc_info[CPU_MAP_CORE_ID] = _cores; if ((_processors > group_start) && (_processors <= group_end)) { proc_info[CPU_MAP_CORE_TYPE] = group_type; - proc_info[CPU_MAP_GROUP_ID] = group_id; + if ((group_type == MAIN_CORE_PROC) && (group_end - group_start != 1)) { + proc_info[CPU_MAP_GROUP_ID] = group++; + } else { + proc_info[CPU_MAP_GROUP_ID] = group_id; + } if (group_id == 
CPU_BLOCKED) { proc_info[CPU_MAP_USED_FLAG] = CPU_BLOCKED; _blocked_cores++; @@ -178,7 +186,7 @@ void parse_processor_info_win(const char* base_ptr, } else if ((info->Relationship == RelationCache) && (info->Cache.Level == 2)) { MaskToList(info->Cache.GroupMask.Mask); - if (4 == list_len) { + if (group_with_4_cores == list_len) { if (_processors <= list[list_len - 1] + base_proc) { group_start = list[0]; group_end = list[list_len - 1]; @@ -190,38 +198,45 @@ void parse_processor_info_win(const char* base_ptr, _cpu_mapping_table[list[m] + base_proc][CPU_MAP_GROUP_ID] = group_id; _proc_type_table[0][EFFICIENT_CORE_PROC]++; } - } else if ((2 == list_len) && (-1 == _cpu_mapping_table[list[0] + base_proc][CPU_MAP_CORE_TYPE])) { + } else if (group_with_2_cores == list_len) { + if (initial_core_type == _cpu_mapping_table[list[0] + base_proc][CPU_MAP_CORE_TYPE]) { + if (_processors <= list[list_len - 1] + base_proc) { + group_start = list[0]; + group_end = list[list_len - 1]; + if (_proc_type_table[0][EFFICIENT_CORE_PROC] > 0) { + group_id = CPU_BLOCKED; + group_type = EFFICIENT_CORE_PROC; + _blocked_cores++; + } else { + group_id = group++; + group_type = MAIN_CORE_PROC; + } + } + for (int m = 0; m < _processors - list[0]; m++) { + _cpu_mapping_table[list[m] + base_proc][CPU_MAP_CORE_TYPE] = group_type; + _cpu_mapping_table[list[m] + base_proc][CPU_MAP_GROUP_ID] = group_id; + if (group_id == CPU_BLOCKED) { + _cpu_mapping_table[list[m] + base_proc][CPU_MAP_USED_FLAG] = CPU_BLOCKED; + } else { + _cpu_mapping_table[list[m] + base_proc][CPU_MAP_USED_FLAG] = NOT_USED; + _proc_type_table[0][MAIN_CORE_PROC]++; + } + } + } + } else { if (_processors <= list[list_len - 1] + base_proc) { group_start = list[0]; group_end = list[list_len - 1]; - if (_proc_type_table[0][EFFICIENT_CORE_PROC] > 0) { - group_id = CPU_BLOCKED; - group_type = EFFICIENT_CORE_PROC; - _blocked_cores++; - } else { - group_id = group++; - group_type = MAIN_CORE_PROC; - } + group_id = group; + group_type = MAIN_CORE_PROC; } for (int m = 0; m < _processors - list[0]; m++) { - _cpu_mapping_table[list[m] + base_proc][CPU_MAP_CORE_TYPE] = group_type; - _cpu_mapping_table[list[m] + base_proc][CPU_MAP_GROUP_ID] = group_id; - if (group_id == CPU_BLOCKED) { - _cpu_mapping_table[list[m] + base_proc][CPU_MAP_USED_FLAG] = CPU_BLOCKED; - } else { - _cpu_mapping_table[list[m] + base_proc][CPU_MAP_USED_FLAG] = NOT_USED; + if (_cpu_mapping_table[list[m] + base_proc][CPU_MAP_CORE_TYPE] == initial_core_type) { + _cpu_mapping_table[list[m] + base_proc][CPU_MAP_CORE_TYPE] = MAIN_CORE_PROC; + _cpu_mapping_table[list[m] + base_proc][CPU_MAP_GROUP_ID] = group++; _proc_type_table[0][MAIN_CORE_PROC]++; } } - - } else if (1 == list_len) { - if ((_cpu_mapping_table.size() > list[0]) && - (_cpu_mapping_table[list[0] + base_proc][CPU_MAP_CORE_TYPE] == -1)) { - group_id = group++; - _cpu_mapping_table[list[0] + base_proc][CPU_MAP_CORE_TYPE] = MAIN_CORE_PROC; - _cpu_mapping_table[list[0] + base_proc][CPU_MAP_GROUP_ID] = group_id; - _proc_type_table[0][MAIN_CORE_PROC]++; - } } } } diff --git a/src/inference/src/system_conf.cpp b/src/inference/src/system_conf.cpp index 27c671d07ad5c9..3227b1a3034903 100644 --- a/src/inference/src/system_conf.cpp +++ b/src/inference/src/system_conf.cpp @@ -22,6 +22,7 @@ # include # define ARM_COMPUTE_CPU_FEATURE_HWCAP_FPHP (1 << 9) # define ARM_COMPUTE_CPU_FEATURE_HWCAP_ASIMDHP (1 << 10) +# define ARM_COMPUTE_CPU_FEATURE_HWCAP_SVE (1 << 24) #elif defined(__APPLE__) && defined(__aarch64__) # include # include @@ -114,6 +115,10 @@ bool 
with_cpu_neon_fp16() { return false; } +bool with_cpu_sve() { + return false; +} + #else // OPENVINO_ARCH_X86 || OPENVINO_ARCH_X86_64 bool with_cpu_x86_sse42() { @@ -173,6 +178,20 @@ bool with_cpu_neon_fp16() { return false; # endif } +bool with_cpu_sve() { +# if !defined(_WIN64) && !defined(BARE_METAL) && !defined(__APPLE__) && !defined(__OpenBSD__) && \ + !defined(__arm__) && defined(__aarch64__) + const uint32_t hwcaps = getauxval(AT_HWCAP); + return hwcaps & ARM_COMPUTE_CPU_FEATURE_HWCAP_SVE; +# elif !defined(_WIN64) && !defined(BARE_METAL) && !defined(__APPLE__) && !defined(__OpenBSD__) && \ + !defined(__aarch64__) && defined(__arm__) + return false; +# elif defined(__aarch64__) && defined(__APPLE__) + return false; +# else + return false; +# endif +} #endif // OPENVINO_ARCH_X86 || OPENVINO_ARCH_X86_64 bool check_open_mp_env_vars(bool include_omp_num_threads) { diff --git a/src/inference/tests/functional/caching_test.cpp b/src/inference/tests/functional/caching_test.cpp index 5b01af9a22cde8..e3572dc98915b0 100644 --- a/src/inference/tests/functional/caching_test.cpp +++ b/src/inference/tests/functional/caching_test.cpp @@ -276,14 +276,14 @@ class CachingTest : public ::testing::TestWithParam model_buffer; + if (config.count(ov::internal::cached_model_buffer.name())) + model_buffer = config.at(ov::internal::cached_model_buffer.name()).as>(); + EXPECT_TRUE(model_buffer); + + std::string name; + istr >> name; + char space; + istr.read(&space, 1); + std::lock_guard lock(mock_creation_mutex); + return create_mock_compiled_model(m_models[name], mockPlugin); + })); + + ON_CALL(*mockPlugin, get_property(ov::internal::supported_properties.name(), _)) + .WillByDefault(Invoke([&](const std::string&, const ov::AnyMap&) { + return std::vector{ov::internal::caching_properties.name(), + ov::internal::caching_with_mmap.name()}; + })); + EXPECT_CALL(*mockPlugin, get_property(_, _)).Times(AnyNumber()); + EXPECT_CALL(*mockPlugin, query_model(_, _)).Times(AnyNumber()); + EXPECT_CALL(*mockPlugin, get_property(ov::device::architecture.name(), _)).Times(AnyNumber()); + EXPECT_CALL(*mockPlugin, get_property(ov::internal::caching_properties.name(), _)).Times(AnyNumber()); + if (m_remoteContext) { + return; // skip the remote Context test for Multi plugin + } + int index = 0; + m_post_mock_net_callbacks.emplace_back([&](MockICompiledModelImpl& net) { + EXPECT_CALL(net, export_model(_)).Times(1); + }); + MkDirGuard guard(m_cacheDir); + EXPECT_CALL(*mockPlugin, compile_model(_, _, _)).Times(0); + EXPECT_CALL(*mockPlugin, compile_model(A&>(), _)).Times(1); + EXPECT_CALL(*mockPlugin, import_model(_, _, _)).Times(0); + EXPECT_CALL(*mockPlugin, import_model(_, _)).Times(1); + testLoad([&](ov::Core& core) { + core.set_property({{ov::cache_dir.name(), m_cacheDir}}); + m_testFunction(core); + m_testFunction(core); + }); + std::cout << "Caching Load multiple threads test completed. 
Tried " << index << " times" << std::endl; +} + +TEST_P(CachingTest, Load_mmap_is_disabled) { + ON_CALL(*mockPlugin, import_model(_, _)).WillByDefault(Invoke([&](std::istream& istr, const ov::AnyMap& config) { + if (m_checkConfigCb) { + m_checkConfigCb(config); + } + std::shared_ptr model_buffer; + if (config.count(ov::internal::cached_model_buffer.name())) + model_buffer = config.at(ov::internal::cached_model_buffer.name()).as>(); + EXPECT_FALSE(model_buffer); + + std::string name; + istr >> name; + char space; + istr.read(&space, 1); + std::lock_guard lock(mock_creation_mutex); + return create_mock_compiled_model(m_models[name], mockPlugin); + })); + ON_CALL(*mockPlugin, get_property(ov::internal::supported_properties.name(), _)) + .WillByDefault(Invoke([&](const std::string&, const ov::AnyMap&) { + return std::vector{ov::internal::caching_properties.name(), + ov::internal::caching_with_mmap.name()}; + })); + EXPECT_CALL(*mockPlugin, get_property(_, _)).Times(AnyNumber()); + EXPECT_CALL(*mockPlugin, query_model(_, _)).Times(AnyNumber()); + EXPECT_CALL(*mockPlugin, get_property(ov::device::architecture.name(), _)).Times(AnyNumber()); + EXPECT_CALL(*mockPlugin, get_property(ov::internal::caching_properties.name(), _)).Times(AnyNumber()); + if (m_remoteContext) { + return; // skip the remote Context test for Multi plugin + } + int index = 0; + m_post_mock_net_callbacks.emplace_back([&](MockICompiledModelImpl& net) { + EXPECT_CALL(net, export_model(_)).Times(1); + }); + MkDirGuard guard(m_cacheDir); + EXPECT_CALL(*mockPlugin, compile_model(_, _, _)).Times(0); + EXPECT_CALL(*mockPlugin, compile_model(A&>(), _)).Times(1); + EXPECT_CALL(*mockPlugin, import_model(_, _, _)).Times(0); + EXPECT_CALL(*mockPlugin, import_model(_, _)).Times(1); + testLoad([&](ov::Core& core) { + core.set_property({{ov::cache_dir.name(), m_cacheDir}}); + core.set_property({ov::enable_mmap(false)}); + m_testFunction(core); + m_testFunction(core); + }); + std::cout << "Caching Load multiple threads test completed. 
Tried " << index << " times" << std::endl; +} + +TEST_P(CachingTest, Load_mmap_is_not_supported_by_plugin) { + ON_CALL(*mockPlugin, import_model(_, _)).WillByDefault(Invoke([&](std::istream& istr, const ov::AnyMap& config) { + if (m_checkConfigCb) { + m_checkConfigCb(config); + } + std::shared_ptr model_buffer; + if (config.count(ov::internal::cached_model_buffer.name())) + model_buffer = config.at(ov::internal::cached_model_buffer.name()).as>(); + EXPECT_FALSE(model_buffer); + + std::string name; + istr >> name; + char space; + istr.read(&space, 1); + std::lock_guard lock(mock_creation_mutex); + return create_mock_compiled_model(m_models[name], mockPlugin); + })); + EXPECT_CALL(*mockPlugin, get_property(_, _)).Times(AnyNumber()); + EXPECT_CALL(*mockPlugin, query_model(_, _)).Times(AnyNumber()); + EXPECT_CALL(*mockPlugin, get_property(ov::device::architecture.name(), _)).Times(AnyNumber()); + EXPECT_CALL(*mockPlugin, get_property(ov::internal::caching_properties.name(), _)).Times(AnyNumber()); + if (m_remoteContext) { + return; // skip the remote Context test for Multi plugin + } + int index = 0; + m_post_mock_net_callbacks.emplace_back([&](MockICompiledModelImpl& net) { + EXPECT_CALL(net, export_model(_)).Times(1); + }); + MkDirGuard guard(m_cacheDir); + EXPECT_CALL(*mockPlugin, compile_model(_, _, _)).Times(0); + EXPECT_CALL(*mockPlugin, compile_model(A&>(), _)).Times(1); + EXPECT_CALL(*mockPlugin, import_model(_, _, _)).Times(0); + EXPECT_CALL(*mockPlugin, import_model(_, _)).Times(1); + testLoad([&](ov::Core& core) { + core.set_property({{ov::cache_dir.name(), m_cacheDir}}); + core.set_property({ov::enable_mmap(true)}); + m_testFunction(core); + m_testFunction(core); + }); + std::cout << "Caching Load multiple threads test completed. Tried " << index << " times" << std::endl; +} + +TEST_P(CachingTest, Load_mmap_is_disabled_local_cfg) { + ON_CALL(*mockPlugin, import_model(_, _)).WillByDefault(Invoke([&](std::istream& istr, const ov::AnyMap& config) { + if (m_checkConfigCb) { + m_checkConfigCb(config); + } + std::shared_ptr model_buffer; + if (config.count(ov::internal::cached_model_buffer.name())) + model_buffer = config.at(ov::internal::cached_model_buffer.name()).as>(); + EXPECT_FALSE(model_buffer); + + std::string name; + istr >> name; + char space; + istr.read(&space, 1); + std::lock_guard lock(mock_creation_mutex); + return create_mock_compiled_model(m_models[name], mockPlugin); + })); + ON_CALL(*mockPlugin, get_property(ov::internal::supported_properties.name(), _)) + .WillByDefault(Invoke([&](const std::string&, const ov::AnyMap&) { + return std::vector{ov::internal::caching_properties.name(), + ov::internal::caching_with_mmap.name()}; + })); + EXPECT_CALL(*mockPlugin, get_property(_, _)).Times(AnyNumber()); + EXPECT_CALL(*mockPlugin, query_model(_, _)).Times(AnyNumber()); + EXPECT_CALL(*mockPlugin, get_property(ov::device::architecture.name(), _)).Times(AnyNumber()); + EXPECT_CALL(*mockPlugin, get_property(ov::internal::caching_properties.name(), _)).Times(AnyNumber()); + if (m_remoteContext) { + return; // skip the remote Context test for Multi plugin + } + int index = 0; + m_post_mock_net_callbacks.emplace_back([&](MockICompiledModelImpl& net) { + EXPECT_CALL(net, export_model(_)).Times(1); + }); + MkDirGuard guard(m_cacheDir); + EXPECT_CALL(*mockPlugin, compile_model(_, _, _)).Times(0); + EXPECT_CALL(*mockPlugin, compile_model(A&>(), _)).Times(1); + EXPECT_CALL(*mockPlugin, import_model(_, _, _)).Times(0); + EXPECT_CALL(*mockPlugin, import_model(_, _)).Times(1); + 
testLoad([&](ov::Core& core) { + const auto config = ov::AnyMap{{ov::cache_dir(m_cacheDir)}, {ov::enable_mmap(false)}}; + m_testFunctionWithCfg(core, config); + m_testFunctionWithCfg(core, config); + }); + std::cout << "Caching Load multiple threads test completed. Tried " << index << " times" << std::endl; +} + +TEST_P(CachingTest, Load_mmap_is_not_supported_by_plugin_local_cfg) { + ON_CALL(*mockPlugin, import_model(_, _)).WillByDefault(Invoke([&](std::istream& istr, const ov::AnyMap& config) { + if (m_checkConfigCb) { + m_checkConfigCb(config); + } + std::shared_ptr model_buffer; + if (config.count(ov::internal::cached_model_buffer.name())) + model_buffer = config.at(ov::internal::cached_model_buffer.name()).as>(); + EXPECT_FALSE(model_buffer); + + std::string name; + istr >> name; + char space; + istr.read(&space, 1); + std::lock_guard lock(mock_creation_mutex); + return create_mock_compiled_model(m_models[name], mockPlugin); + })); + EXPECT_CALL(*mockPlugin, get_property(_, _)).Times(AnyNumber()); + EXPECT_CALL(*mockPlugin, query_model(_, _)).Times(AnyNumber()); + EXPECT_CALL(*mockPlugin, get_property(ov::device::architecture.name(), _)).Times(AnyNumber()); + EXPECT_CALL(*mockPlugin, get_property(ov::internal::caching_properties.name(), _)).Times(AnyNumber()); + if (m_remoteContext) { + return; // skip the remote Context test for Multi plugin + } + int index = 0; + m_post_mock_net_callbacks.emplace_back([&](MockICompiledModelImpl& net) { + EXPECT_CALL(net, export_model(_)).Times(1); + }); + MkDirGuard guard(m_cacheDir); + EXPECT_CALL(*mockPlugin, compile_model(_, _, _)).Times(0); + EXPECT_CALL(*mockPlugin, compile_model(A&>(), _)).Times(1); + EXPECT_CALL(*mockPlugin, import_model(_, _, _)).Times(0); + EXPECT_CALL(*mockPlugin, import_model(_, _)).Times(1); + testLoad([&](ov::Core& core) { + const auto config = ov::AnyMap{{ov::cache_dir(m_cacheDir)}, {ov::enable_mmap(false)}}; + m_testFunctionWithCfg(core, config); + m_testFunctionWithCfg(core, config); + }); + std::cout << "Caching Load multiple threads test completed. 
Tried " << index << " times" << std::endl; +} + #if defined(ENABLE_OV_IR_FRONTEND) static std::string getTestCaseName(const testing::TestParamInfo>& obj) { diff --git a/src/inference/tests/functional/ov_core_test.cpp b/src/inference/tests/functional/ov_core_test.cpp index 26eb38e3fd13e5..60f91b85b3338a 100644 --- a/src/inference/tests/functional/ov_core_test.cpp +++ b/src/inference/tests/functional/ov_core_test.cpp @@ -8,9 +8,26 @@ #include "common_test_utils/common_utils.hpp" #include "common_test_utils/file_utils.hpp" +#include "functional_test_utils/test_model/test_model.hpp" #include "openvino/runtime/core.hpp" #include "openvino/util/file_util.hpp" +class CoreBaseTest : public testing::Test { +protected: + void generate_test_model_files(const std::string& name) { + auto prefix = ov::test::utils::generateTestFilePrefix(); + model_file_name = prefix + name + ".xml"; + weight_file_name = prefix + name + ".bin"; + ov::test::utils::generate_test_model(model_file_name, weight_file_name); + } + + void TearDown() override { + ov::test::utils::removeIRFiles(model_file_name, weight_file_name); + } + + std::string model_file_name, weight_file_name; +}; + #ifndef OPENVINO_STATIC_LIBRARY static void create_plugin_xml(const std::string& file_name, const std::string& plugin_name = "1") { @@ -33,7 +50,7 @@ static void remove_plugin_xml(const std::string& file_name) { ov::test::utils::removeFile(file_name); } -TEST(CoreBaseTest, LoadPluginXML) { +TEST_F(CoreBaseTest, LoadPluginXML) { std::string xml_file_name = "test_plugin.xml"; std::string xml_file_path = ov::test::utils::getOpenvinoLibDirectory() + ov::util::FileTraits::file_separator + xml_file_name; @@ -42,7 +59,7 @@ TEST(CoreBaseTest, LoadPluginXML) { remove_plugin_xml(xml_file_path); } -TEST(CoreBaseTest, LoadPluginDifferentXMLExtension) { +TEST_F(CoreBaseTest, LoadPluginDifferentXMLExtension) { std::string xml_file_name = "test_plugin.test"; std::string xml_file_path = ov::test::utils::getOpenvinoLibDirectory() + ov::util::FileTraits::file_separator + xml_file_name; @@ -51,7 +68,7 @@ TEST(CoreBaseTest, LoadPluginDifferentXMLExtension) { remove_plugin_xml(xml_file_path); } -TEST(CoreBaseTest, LoadAbsoluteOVPathPluginXML) { +TEST_F(CoreBaseTest, LoadAbsoluteOVPathPluginXML) { std::string xml_file_name = "test_plugin.xml"; std::string xml_file_path = ov::test::utils::getOpenvinoLibDirectory() + ov::util::FileTraits::file_separator + xml_file_name; @@ -60,7 +77,7 @@ TEST(CoreBaseTest, LoadAbsoluteOVPathPluginXML) { remove_plugin_xml(xml_file_path); } -TEST(CoreBaseTest, LoadAbsoluteCWPathPluginXML) { +TEST_F(CoreBaseTest, LoadAbsoluteCWPathPluginXML) { std::string xml_file_name = "test_plugin.xml"; std::string xml_file_path = ov::test::utils::getCurrentWorkingDir() + ov::util::FileTraits::file_separator + xml_file_name; @@ -69,7 +86,7 @@ TEST(CoreBaseTest, LoadAbsoluteCWPathPluginXML) { remove_plugin_xml(xml_file_path); } -TEST(CoreBaseTest, LoadRelativeCWPathPluginXML) { +TEST_F(CoreBaseTest, LoadRelativeCWPathPluginXML) { std::string xml_file_name = "test_plugin.xml"; std::string xml_file_path = ov::test::utils::getCurrentWorkingDir() + ov::util::FileTraits::file_separator + xml_file_name; @@ -78,7 +95,7 @@ TEST(CoreBaseTest, LoadRelativeCWPathPluginXML) { remove_plugin_xml(xml_file_path); } -TEST(CoreBaseTest, LoadOVFolderOverCWPathPluginXML) { +TEST_F(CoreBaseTest, LoadOVFolderOverCWPathPluginXML) { std::string xml_file_name = "test_plugin.xml"; std::string cwd_file_path = ov::test::utils::getCurrentWorkingDir() + 
ov::util::FileTraits::file_separator + xml_file_name; @@ -96,3 +113,43 @@ TEST(CoreBaseTest, LoadOVFolderOverCWPathPluginXML) { } #endif + +#if defined(OPENVINO_CPP_VER_17) && defined(ENABLE_OV_IR_FRONTEND) +namespace ov::test { +TEST_F(CoreBaseTest, read_model_with_std_fs_path) { + generate_test_model_files("test-model"); + + const auto model_path = std::filesystem::path(model_file_name); + const auto weight_path = std::filesystem::path(weight_file_name); + + ov::Core core; + { + const auto model = core.read_model(model_path); + EXPECT_NE(model, nullptr); + } + { + const auto model = core.read_model(model_path, weight_path); + EXPECT_NE(model, nullptr); + } +} + +TEST_F(CoreBaseTest, compile_model_with_std_fs_path) { + generate_test_model_files("model2"); + + const auto model_path = std::filesystem::path(model_file_name); + const auto weight_path = std::filesystem::path(weight_file_name); + + ov::Core core; + { + const auto model = core.compile_model(model_path); + EXPECT_TRUE(model); + } + { + const auto devices = core.get_available_devices(); + + const auto model = core.compile_model(model_path, devices.at(0), ov::AnyMap{}); + EXPECT_TRUE(model); + } +} +} // namespace ov::test +#endif diff --git a/src/inference/tests/functional/ov_extension_test.cpp b/src/inference/tests/functional/ov_extension_test.cpp index 6f93a8acdaf2fa..b840c430d092e9 100644 --- a/src/inference/tests/functional/ov_extension_test.cpp +++ b/src/inference/tests/functional/ov_extension_test.cpp @@ -82,6 +82,12 @@ class CustomReLU : public ov::op::Op { }; #if defined(ENABLE_OV_IR_FRONTEND) +# ifdef OPENVINO_CPP_VER_17 +TEST_F(OVExtensionTests, ReshapeIRWithNewExtensionsPathLib) { + core.add_extension(std::filesystem::path(getOVExtensionPath())); + test(); +} +# endif TEST_F(OVExtensionTests, ReshapeIRWithNewExtensionsLib) { core.add_extension(getOVExtensionPath()); diff --git a/src/inference/tests/unit/cpu_map_parser/cache_parser_linux.cpp b/src/inference/tests/unit/cpu_map_parser/cache_parser_linux.cpp index 8679090b9ae491..9ea43bd0604296 100644 --- a/src/inference/tests/unit/cpu_map_parser/cache_parser_linux.cpp +++ b/src/inference/tests/unit/cpu_map_parser/cache_parser_linux.cpp @@ -385,6 +385,188 @@ LinuxCpuMapTestCase cache_1sockets_96cores = { {"0-95"}, }, }; +LinuxCpuMapTestCase cache_2sockets_56cores_hyperthreading = { + 110, + 2, + 2, + 56, + {{110, 56, 0, 54, -1, -1}, {54, 28, 0, 26, 0, 0}, {56, 28, 0, 28, 1, 1}}, + { + {0, 0, 0, 0, HYPER_THREADING_PROC, 0, -1}, {1, 0, 0, 1, HYPER_THREADING_PROC, 1, -1}, + {2, 0, 0, 2, HYPER_THREADING_PROC, 2, -1}, {3, 0, 0, 3, HYPER_THREADING_PROC, 3, -1}, + {4, 0, 0, 4, HYPER_THREADING_PROC, 4, -1}, {5, 0, 0, 5, HYPER_THREADING_PROC, 5, -1}, + {6, 0, 0, 6, HYPER_THREADING_PROC, 6, -1}, {7, 0, 0, 7, HYPER_THREADING_PROC, 7, -1}, + {8, 0, 0, 8, HYPER_THREADING_PROC, 8, -1}, {9, 0, 0, 9, HYPER_THREADING_PROC, 9, -1}, + {11, 0, 0, 10, HYPER_THREADING_PROC, 10, -1}, {12, 0, 0, 11, HYPER_THREADING_PROC, 11, -1}, + {13, 0, 0, 12, HYPER_THREADING_PROC, 12, -1}, {14, 0, 0, 13, HYPER_THREADING_PROC, 13, -1}, + {15, 0, 0, 14, HYPER_THREADING_PROC, 14, -1}, {16, 0, 0, 15, HYPER_THREADING_PROC, 15, -1}, + {17, 0, 0, 16, HYPER_THREADING_PROC, 16, -1}, {18, 0, 0, 17, HYPER_THREADING_PROC, 17, -1}, + {19, 0, 0, 18, HYPER_THREADING_PROC, 18, -1}, {21, 0, 0, 19, HYPER_THREADING_PROC, 19, -1}, + {22, 0, 0, 20, HYPER_THREADING_PROC, 20, -1}, {23, 0, 0, 21, HYPER_THREADING_PROC, 21, -1}, + {24, 0, 0, 22, HYPER_THREADING_PROC, 22, -1}, {25, 0, 0, 23, HYPER_THREADING_PROC, 23, -1}, + {26, 0, 0, 
24, HYPER_THREADING_PROC, 24, -1}, {27, 0, 0, 25, HYPER_THREADING_PROC, 25, -1}, + {28, 1, 1, 28, HYPER_THREADING_PROC, 28, -1}, {29, 1, 1, 29, HYPER_THREADING_PROC, 29, -1}, + {30, 1, 1, 30, HYPER_THREADING_PROC, 30, -1}, {31, 1, 1, 31, HYPER_THREADING_PROC, 31, -1}, + {32, 1, 1, 32, HYPER_THREADING_PROC, 32, -1}, {33, 1, 1, 33, HYPER_THREADING_PROC, 33, -1}, + {34, 1, 1, 34, HYPER_THREADING_PROC, 34, -1}, {35, 1, 1, 35, HYPER_THREADING_PROC, 35, -1}, + {36, 1, 1, 36, HYPER_THREADING_PROC, 36, -1}, {37, 1, 1, 37, HYPER_THREADING_PROC, 37, -1}, + {38, 1, 1, 38, HYPER_THREADING_PROC, 38, -1}, {39, 1, 1, 39, HYPER_THREADING_PROC, 39, -1}, + {40, 1, 1, 40, HYPER_THREADING_PROC, 40, -1}, {41, 1, 1, 41, HYPER_THREADING_PROC, 41, -1}, + {42, 1, 1, 42, HYPER_THREADING_PROC, 42, -1}, {43, 1, 1, 43, HYPER_THREADING_PROC, 43, -1}, + {44, 1, 1, 44, HYPER_THREADING_PROC, 44, -1}, {45, 1, 1, 45, HYPER_THREADING_PROC, 45, -1}, + {46, 1, 1, 46, HYPER_THREADING_PROC, 46, -1}, {47, 1, 1, 47, HYPER_THREADING_PROC, 47, -1}, + {48, 1, 1, 48, HYPER_THREADING_PROC, 48, -1}, {49, 1, 1, 49, HYPER_THREADING_PROC, 49, -1}, + {50, 1, 1, 50, HYPER_THREADING_PROC, 50, -1}, {51, 1, 1, 51, HYPER_THREADING_PROC, 51, -1}, + {52, 1, 1, 52, HYPER_THREADING_PROC, 52, -1}, {53, 1, 1, 53, HYPER_THREADING_PROC, 53, -1}, + {54, 1, 1, 54, HYPER_THREADING_PROC, 54, -1}, {55, 1, 1, 55, HYPER_THREADING_PROC, 55, -1}, + {56, 0, 0, 0, MAIN_CORE_PROC, 0, -1}, {57, 0, 0, 1, MAIN_CORE_PROC, 1, -1}, + {58, 0, 0, 2, MAIN_CORE_PROC, 2, -1}, {59, 0, 0, 3, MAIN_CORE_PROC, 3, -1}, + {60, 0, 0, 4, MAIN_CORE_PROC, 4, -1}, {61, 0, 0, 5, MAIN_CORE_PROC, 5, -1}, + {62, 0, 0, 6, MAIN_CORE_PROC, 6, -1}, {63, 0, 0, 7, MAIN_CORE_PROC, 7, -1}, + {64, 0, 0, 8, MAIN_CORE_PROC, 8, -1}, {65, 0, 0, 9, MAIN_CORE_PROC, 9, -1}, + {66, 0, 0, 26, MAIN_CORE_PROC, 26, -1}, {67, 0, 0, 10, MAIN_CORE_PROC, 10, -1}, + {68, 0, 0, 11, MAIN_CORE_PROC, 11, -1}, {69, 0, 0, 12, MAIN_CORE_PROC, 12, -1}, + {70, 0, 0, 13, MAIN_CORE_PROC, 13, -1}, {71, 0, 0, 14, MAIN_CORE_PROC, 14, -1}, + {72, 0, 0, 15, MAIN_CORE_PROC, 15, -1}, {73, 0, 0, 16, MAIN_CORE_PROC, 16, -1}, + {74, 0, 0, 17, MAIN_CORE_PROC, 17, -1}, {75, 0, 0, 18, MAIN_CORE_PROC, 18, -1}, + {76, 0, 0, 27, MAIN_CORE_PROC, 27, -1}, {77, 0, 0, 19, MAIN_CORE_PROC, 19, -1}, + {78, 0, 0, 20, MAIN_CORE_PROC, 20, -1}, {79, 0, 0, 21, MAIN_CORE_PROC, 21, -1}, + {80, 0, 0, 22, MAIN_CORE_PROC, 22, -1}, {81, 0, 0, 23, MAIN_CORE_PROC, 23, -1}, + {82, 0, 0, 24, MAIN_CORE_PROC, 24, -1}, {83, 0, 0, 25, MAIN_CORE_PROC, 25, -1}, + {84, 1, 1, 28, MAIN_CORE_PROC, 28, -1}, {85, 1, 1, 29, MAIN_CORE_PROC, 29, -1}, + {86, 1, 1, 30, MAIN_CORE_PROC, 30, -1}, {87, 1, 1, 31, MAIN_CORE_PROC, 31, -1}, + {88, 1, 1, 32, MAIN_CORE_PROC, 32, -1}, {89, 1, 1, 33, MAIN_CORE_PROC, 33, -1}, + {90, 1, 1, 34, MAIN_CORE_PROC, 34, -1}, {91, 1, 1, 35, MAIN_CORE_PROC, 35, -1}, + {92, 1, 1, 36, MAIN_CORE_PROC, 36, -1}, {93, 1, 1, 37, MAIN_CORE_PROC, 37, -1}, + {94, 1, 1, 38, MAIN_CORE_PROC, 38, -1}, {95, 1, 1, 39, MAIN_CORE_PROC, 39, -1}, + {96, 1, 1, 40, MAIN_CORE_PROC, 40, -1}, {97, 1, 1, 41, MAIN_CORE_PROC, 41, -1}, + {98, 1, 1, 42, MAIN_CORE_PROC, 42, -1}, {99, 1, 1, 43, MAIN_CORE_PROC, 43, -1}, + {100, 1, 1, 44, MAIN_CORE_PROC, 44, -1}, {101, 1, 1, 45, MAIN_CORE_PROC, 45, -1}, + {102, 1, 1, 46, MAIN_CORE_PROC, 46, -1}, {103, 1, 1, 47, MAIN_CORE_PROC, 47, -1}, + {104, 1, 1, 48, MAIN_CORE_PROC, 48, -1}, {105, 1, 1, 49, MAIN_CORE_PROC, 49, -1}, + {106, 1, 1, 50, MAIN_CORE_PROC, 50, -1}, {107, 1, 1, 51, MAIN_CORE_PROC, 51, -1}, + {108, 1, 1, 52, MAIN_CORE_PROC, 52, 
-1}, {109, 1, 1, 53, MAIN_CORE_PROC, 53, -1}, + {110, 1, 1, 54, MAIN_CORE_PROC, 54, -1}, {111, 1, 1, 55, MAIN_CORE_PROC, 55, -1}, + }, + { + {"0,56", "0,56", "0-9,11-19,21-27,56-83"}, + {"1,57", "1,57", "0-9,11-19,21-27,56-83"}, + {"2,58", "2,58", "0-9,11-19,21-27,56-83"}, + {"3,59", "3,59", "0-9,11-19,21-27,56-83"}, + {"4,60", "4,60", "0-9,11-19,21-27,56-83"}, + {"5,61", "5,61", "0-9,11-19,21-27,56-83"}, + {"6,62", "6,62", "0-9,11-19,21-27,56-83"}, + {"7,63", "7,63", "0-9,11-19,21-27,56-83"}, + {"8,64", "8,64", "0-9,11-19,21-27,56-83"}, + {"9,65", "9,65", "0-9,11-19,21-27,56-83"}, + {"", "", ""}, + {"11,67", "11,67", "0-9,11-19,21-27,56-83"}, + {"12,68", "12,68", "0-9,11-19,21-27,56-83"}, + {"13,69", "13,69", "0-9,11-19,21-27,56-83"}, + {"14,70", "14,70", "0-9,11-19,21-27,56-83"}, + {"15,71", "15,71", "0-9,11-19,21-27,56-83"}, + {"16,72", "16,72", "0-9,11-19,21-27,56-83"}, + {"17,73", "17,73", "0-9,11-19,21-27,56-83"}, + {"18,74", "18,74", "0-9,11-19,21-27,56-83"}, + {"19,75", "19,75", "0-9,11-19,21-27,56-83"}, + {"", "", ""}, + {"21,77", "21,77", "0-9,11-19,21-27,56-83"}, + {"22,78", "22,78", "0-9,11-19,21-27,56-83"}, + {"23,79", "23,79", "0-9,11-19,21-27,56-83"}, + {"24,80", "24,80", "0-9,11-19,21-27,56-83"}, + {"25,81", "25,81", "0-9,11-19,21-27,56-83"}, + {"26,82", "26,82", "0-9,11-19,21-27,56-83"}, + {"27,83", "27,83", "0-9,11-19,21-27,56-83"}, + {"28,84", "28,84", "28-55,84-111"}, + {"29,85", "29,85", "28-55,84-111"}, + {"30,86", "30,86", "28-55,84-111"}, + {"31,87", "31,87", "28-55,84-111"}, + {"32,88", "32,88", "28-55,84-111"}, + {"33,89", "33,89", "28-55,84-111"}, + {"34,90", "34,90", "28-55,84-111"}, + {"35,91", "35,91", "28-55,84-111"}, + {"36,92", "36,92", "28-55,84-111"}, + {"37,93", "37,93", "28-55,84-111"}, + {"38,94", "38,94", "28-55,84-111"}, + {"39,95", "39,95", "28-55,84-111"}, + {"40,96", "40,96", "28-55,84-111"}, + {"41,97", "41,97", "28-55,84-111"}, + {"42,98", "42,98", "28-55,84-111"}, + {"43,99", "43,99", "28-55,84-111"}, + {"44,100", "44,100", "28-55,84-111"}, + {"45,101", "45,101", "28-55,84-111"}, + {"46,102", "46,102", "28-55,84-111"}, + {"47,103", "47,103", "28-55,84-111"}, + {"48,104", "48,104", "28-55,84-111"}, + {"49,105", "49,105", "28-55,84-111"}, + {"50,106", "50,106", "28-55,84-111"}, + {"51,107", "51,107", "28-55,84-111"}, + {"52,108", "52,108", "28-55,84-111"}, + {"53,109", "53,109", "28-55,84-111"}, + {"54,110", "54,110", "28-55,84-111"}, + {"55,111", "55,111", "28-55,84-111"}, + {"0,56", "0,56", "0-9,11-19,21-27,56-83"}, + {"1,57", "1,57", "0-9,11-19,21-27,56-83"}, + {"2,58", "2,58", "0-9,11-19,21-27,56-83"}, + {"3,59", "3,59", "0-9,11-19,21-27,56-83"}, + {"4,60", "4,60", "0-9,11-19,21-27,56-83"}, + {"5,61", "5,61", "0-9,11-19,21-27,56-83"}, + {"6,62", "6,62", "0-9,11-19,21-27,56-83"}, + {"7,63", "7,63", "0-9,11-19,21-27,56-83"}, + {"8,64", "8,64", "0-9,11-19,21-27,56-83"}, + {"9,65", "9,65", "0-9,11-19,21-27,56-83"}, + {"66", "66", "0-9,11-19,21-27,56-83"}, + {"11,67", "11,67", "0-9,11-19,21-27,56-83"}, + {"12,68", "12,68", "0-9,11-19,21-27,56-83"}, + {"13,69", "13,69", "0-9,11-19,21-27,56-83"}, + {"14,70", "14,70", "0-9,11-19,21-27,56-83"}, + {"15,71", "15,71", "0-9,11-19,21-27,56-83"}, + {"16,72", "16,72", "0-9,11-19,21-27,56-83"}, + {"17,73", "17,73", "0-9,11-19,21-27,56-83"}, + {"18,74", "18,74", "0-9,11-19,21-27,56-83"}, + {"19,75", "19,75", "0-9,11-19,21-27,56-83"}, + {"76", "76", "0-9,11-19,21-27,56-83"}, + {"21,77", "21,77", "0-9,11-19,21-27,56-83"}, + {"22,78", "22,78", "0-9,11-19,21-27,56-83"}, + {"23,79", "23,79", 
"0-9,11-19,21-27,56-83"}, + {"24,80", "24,80", "0-9,11-19,21-27,56-83"}, + {"25,81", "25,81", "0-9,11-19,21-27,56-83"}, + {"26,82", "26,82", "0-9,11-19,21-27,56-83"}, + {"27,83", "27,83", "0-9,11-19,21-27,56-83"}, + {"28,84", "28,84", "28-55,84-111"}, + {"29,85", "29,85", "28-55,84-111"}, + {"30,86", "30,86", "28-55,84-111"}, + {"31,87", "31,87", "28-55,84-111"}, + {"32,88", "32,88", "28-55,84-111"}, + {"33,89", "33,89", "28-55,84-111"}, + {"34,90", "34,90", "28-55,84-111"}, + {"35,91", "35,91", "28-55,84-111"}, + {"36,92", "36,92", "28-55,84-111"}, + {"37,93", "37,93", "28-55,84-111"}, + {"38,94", "38,94", "28-55,84-111"}, + {"39,95", "39,95", "28-55,84-111"}, + {"40,96", "40,96", "28-55,84-111"}, + {"41,97", "41,97", "28-55,84-111"}, + {"42,98", "42,98", "28-55,84-111"}, + {"43,99", "43,99", "28-55,84-111"}, + {"44,100", "44,100", "28-55,84-111"}, + {"45,101", "45,101", "28-55,84-111"}, + {"46,102", "46,102", "28-55,84-111"}, + {"47,103", "47,103", "28-55,84-111"}, + {"48,104", "48,104", "28-55,84-111"}, + {"49,105", "49,105", "28-55,84-111"}, + {"50,106", "50,106", "28-55,84-111"}, + {"51,107", "51,107", "28-55,84-111"}, + {"52,108", "52,108", "28-55,84-111"}, + {"53,109", "53,109", "28-55,84-111"}, + {"54,110", "54,110", "28-55,84-111"}, + {"55,111", "55,111", "28-55,84-111"}, + }, + { + {"0-9,11-19,21-27,56-83"}, + {"28-55,84-111"}, + }, +}; LinuxCpuMapTestCase cache_2sockets_48cores_hyperthreading = { 96, 2, @@ -1005,6 +1187,36 @@ LinuxCpuMapTestCase cache_2sockets_20cores_hyperthreading_1 = { }, {}, }; +LinuxCpuMapTestCase cache_1sockets_16cores_hyperthreading = { + 20, + 1, + 1, + 14, + {{20, 6, 8, 6, 0, 0}}, + { + {0, 0, 0, 0, HYPER_THREADING_PROC, 0, -1}, {1, 0, 0, 1, HYPER_THREADING_PROC, 1, -1}, + {2, 0, 0, 1, MAIN_CORE_PROC, 1, -1}, {3, 0, 0, 2, HYPER_THREADING_PROC, 2, -1}, + {4, 0, 0, 2, MAIN_CORE_PROC, 2, -1}, {5, 0, 0, 0, MAIN_CORE_PROC, 0, -1}, + {6, 0, 0, 3, HYPER_THREADING_PROC, 3, -1}, {7, 0, 0, 3, MAIN_CORE_PROC, 3, -1}, + {8, 0, 0, 4, HYPER_THREADING_PROC, 4, -1}, {9, 0, 0, 4, MAIN_CORE_PROC, 4, -1}, + {10, 0, 0, 5, HYPER_THREADING_PROC, 5, -1}, {11, 0, 0, 5, MAIN_CORE_PROC, 5, -1}, + {12, 0, 0, 6, EFFICIENT_CORE_PROC, 6, -1}, {13, 0, 0, 7, EFFICIENT_CORE_PROC, 6, -1}, + {14, 0, 0, 8, EFFICIENT_CORE_PROC, 6, -1}, {15, 0, 0, 9, EFFICIENT_CORE_PROC, 6, -1}, + {16, 0, 0, 10, EFFICIENT_CORE_PROC, 7, -1}, {17, 0, 0, 11, EFFICIENT_CORE_PROC, 7, -1}, + {18, 0, 0, 12, EFFICIENT_CORE_PROC, 7, -1}, {19, 0, 0, 13, EFFICIENT_CORE_PROC, 7, -1}, + }, + { + {"0,5", "0,5", "0-19"}, {"1-2", "1-2", "0-19"}, {"1-2", "1-2", "0-19"}, {"3-4", "3-4", "0-19"}, + {"3-4", "3-4", "0-19"}, {"0,5", "0,5", "0-19"}, {"6-7", "6-7", "0-19"}, {"6-7", "6-7", "0-19"}, + {"8-9", "8-9", "0-19"}, {"8-9", "8-9", "0-19"}, {"10-11", "10-11", "0-19"}, {"10-11", "10-11", "0-19"}, + {"12", "12-15", "0-19"}, {"13", "12-15", "0-19"}, {"14", "12-15", "0-19"}, {"15", "12-15", "0-19"}, + {"16", "16-19", "0-19"}, {"17", "16-19", "0-19"}, {"18", "16-19", "0-19"}, {"19", "16-19", "0-19"}, + {"20", "20-21", ""}, {"21", "20-21", ""}, + }, + { + {"0-21"}, + }, +}; LinuxCpuMapTestCase cache_1sockets_14cores_hyperthreading = { 20, 1, @@ -1135,6 +1347,36 @@ LinuxCpuMapTestCase cache_1sockets_8cores_hyperthreading = { }, {{"0-11"}}, }; +LinuxCpuMapTestCase cache_1sockets_8cores_hyperthreading_1 = { + 8, + 1, + 1, + 8, + {{8, 4, 4, 0, 0, 0}}, + { + {0, 0, 0, 0, MAIN_CORE_PROC, 0, -1}, + {1, 0, 0, 1, MAIN_CORE_PROC, 1, -1}, + {2, 0, 0, 2, MAIN_CORE_PROC, 2, -1}, + {3, 0, 0, 3, MAIN_CORE_PROC, 3, -1}, + {4, 0, 0, 4, 
EFFICIENT_CORE_PROC, 4, -1}, + {5, 0, 0, 5, EFFICIENT_CORE_PROC, 4, -1}, + {6, 0, 0, 6, EFFICIENT_CORE_PROC, 4, -1}, + {7, 0, 0, 7, EFFICIENT_CORE_PROC, 4, -1}, + }, + { + {"0", "0", "0-3"}, + {"1", "1", "0-3"}, + {"2", "2", "0-3"}, + {"3", "3", "0-3"}, + {"4", "4-7", ""}, + {"5", "4-7", ""}, + {"6", "4-7", ""}, + {"7", "4-7", ""}, + }, + { + {"0-7"}, + }, +}; LinuxCpuMapTestCase cache_1sockets_6cores_hyperthreading = { 12, 1, @@ -1220,6 +1462,7 @@ INSTANTIATE_TEST_SUITE_P(CPUMap, LinuxCpuMapCacheParserTests, testing::Values(cache_2sockets_104cores_hyperthreading, cache_1sockets_96cores, + cache_2sockets_56cores_hyperthreading, cache_2sockets_48cores_hyperthreading, cache_2sockets_48cores_hyperthreading_1, cache_2sockets_24cores_hyperthreading, @@ -1229,10 +1472,12 @@ INSTANTIATE_TEST_SUITE_P(CPUMap, cache_2sockets_48cores_2, cache_2sockets_20cores_hyperthreading, cache_2sockets_20cores_hyperthreading_1, + cache_1sockets_16cores_hyperthreading, cache_1sockets_14cores_hyperthreading, cache_1sockets_14cores_hyperthreading_1, cache_1sockets_10cores_hyperthreading, cache_1sockets_8cores_hyperthreading, + cache_1sockets_8cores_hyperthreading_1, cache_1sockets_6cores_hyperthreading, cache_1sockets_4cores, cache_VM_cache_0)); diff --git a/src/inference/tests/unit/cpu_map_parser/freq_parser_linux.cpp b/src/inference/tests/unit/cpu_map_parser/freq_parser_linux.cpp index 04ab617961b953..8ccdfad011d19c 100644 --- a/src/inference/tests/unit/cpu_map_parser/freq_parser_linux.cpp +++ b/src/inference/tests/unit/cpu_map_parser/freq_parser_linux.cpp @@ -258,6 +258,188 @@ LinuxCpuMapTestCase freq_2sockets_112cores_hyperthreading = { }, // param[in]: The CPU frequency information table of this simulated platform {{"0-55,112-167"}, {"56-111,168-223"}}, // param[in]: The numa node information table of this simulated platform }; +LinuxCpuMapTestCase freq_2sockets_56cores_hyperthreading = { + 110, + 2, + 2, + 56, + {{110, 56, 0, 54, -1, -1}, {54, 28, 0, 26, 0, 0}, {56, 28, 0, 28, 1, 1}}, + { + {0, 0, 0, 0, HYPER_THREADING_PROC, 0, -1}, {1, 0, 0, 1, HYPER_THREADING_PROC, 1, -1}, + {2, 0, 0, 2, HYPER_THREADING_PROC, 2, -1}, {3, 0, 0, 3, HYPER_THREADING_PROC, 3, -1}, + {4, 0, 0, 4, HYPER_THREADING_PROC, 4, -1}, {5, 0, 0, 5, HYPER_THREADING_PROC, 5, -1}, + {6, 0, 0, 6, HYPER_THREADING_PROC, 6, -1}, {7, 0, 0, 7, HYPER_THREADING_PROC, 7, -1}, + {8, 0, 0, 8, HYPER_THREADING_PROC, 8, -1}, {9, 0, 0, 9, HYPER_THREADING_PROC, 9, -1}, + {11, 0, 0, 10, HYPER_THREADING_PROC, 10, -1}, {12, 0, 0, 11, HYPER_THREADING_PROC, 11, -1}, + {13, 0, 0, 12, HYPER_THREADING_PROC, 12, -1}, {14, 0, 0, 13, HYPER_THREADING_PROC, 13, -1}, + {15, 0, 0, 14, HYPER_THREADING_PROC, 14, -1}, {16, 0, 0, 15, HYPER_THREADING_PROC, 15, -1}, + {17, 0, 0, 16, HYPER_THREADING_PROC, 16, -1}, {18, 0, 0, 17, HYPER_THREADING_PROC, 17, -1}, + {19, 0, 0, 18, HYPER_THREADING_PROC, 18, -1}, {21, 0, 0, 19, HYPER_THREADING_PROC, 19, -1}, + {22, 0, 0, 20, HYPER_THREADING_PROC, 20, -1}, {23, 0, 0, 21, HYPER_THREADING_PROC, 21, -1}, + {24, 0, 0, 22, HYPER_THREADING_PROC, 22, -1}, {25, 0, 0, 23, HYPER_THREADING_PROC, 23, -1}, + {26, 0, 0, 24, HYPER_THREADING_PROC, 24, -1}, {27, 0, 0, 25, HYPER_THREADING_PROC, 25, -1}, + {28, 1, 1, 26, HYPER_THREADING_PROC, 26, -1}, {29, 1, 1, 27, HYPER_THREADING_PROC, 27, -1}, + {30, 1, 1, 28, HYPER_THREADING_PROC, 28, -1}, {31, 1, 1, 29, HYPER_THREADING_PROC, 29, -1}, + {32, 1, 1, 30, HYPER_THREADING_PROC, 30, -1}, {33, 1, 1, 31, HYPER_THREADING_PROC, 31, -1}, + {34, 1, 1, 32, HYPER_THREADING_PROC, 32, -1}, {35, 1, 1, 33, 
HYPER_THREADING_PROC, 33, -1}, + {36, 1, 1, 34, HYPER_THREADING_PROC, 34, -1}, {37, 1, 1, 35, HYPER_THREADING_PROC, 35, -1}, + {38, 1, 1, 36, HYPER_THREADING_PROC, 36, -1}, {39, 1, 1, 37, HYPER_THREADING_PROC, 37, -1}, + {40, 1, 1, 38, HYPER_THREADING_PROC, 38, -1}, {41, 1, 1, 39, HYPER_THREADING_PROC, 39, -1}, + {42, 1, 1, 40, HYPER_THREADING_PROC, 40, -1}, {43, 1, 1, 41, HYPER_THREADING_PROC, 41, -1}, + {44, 1, 1, 42, HYPER_THREADING_PROC, 42, -1}, {45, 1, 1, 43, HYPER_THREADING_PROC, 43, -1}, + {46, 1, 1, 44, HYPER_THREADING_PROC, 44, -1}, {47, 1, 1, 45, HYPER_THREADING_PROC, 45, -1}, + {48, 1, 1, 46, HYPER_THREADING_PROC, 46, -1}, {49, 1, 1, 47, HYPER_THREADING_PROC, 47, -1}, + {50, 1, 1, 48, HYPER_THREADING_PROC, 48, -1}, {51, 1, 1, 49, HYPER_THREADING_PROC, 49, -1}, + {52, 1, 1, 50, HYPER_THREADING_PROC, 50, -1}, {53, 1, 1, 51, HYPER_THREADING_PROC, 51, -1}, + {54, 1, 1, 52, HYPER_THREADING_PROC, 52, -1}, {55, 1, 1, 53, HYPER_THREADING_PROC, 53, -1}, + {56, 0, 0, 0, MAIN_CORE_PROC, 0, -1}, {57, 0, 0, 1, MAIN_CORE_PROC, 1, -1}, + {58, 0, 0, 2, MAIN_CORE_PROC, 2, -1}, {59, 0, 0, 3, MAIN_CORE_PROC, 3, -1}, + {60, 0, 0, 4, MAIN_CORE_PROC, 4, -1}, {61, 0, 0, 5, MAIN_CORE_PROC, 5, -1}, + {62, 0, 0, 6, MAIN_CORE_PROC, 6, -1}, {63, 0, 0, 7, MAIN_CORE_PROC, 7, -1}, + {64, 0, 0, 8, MAIN_CORE_PROC, 8, -1}, {65, 0, 0, 9, MAIN_CORE_PROC, 9, -1}, + {66, 0, 0, 54, MAIN_CORE_PROC, 54, -1}, {67, 0, 0, 10, MAIN_CORE_PROC, 10, -1}, + {68, 0, 0, 11, MAIN_CORE_PROC, 11, -1}, {69, 0, 0, 12, MAIN_CORE_PROC, 12, -1}, + {70, 0, 0, 13, MAIN_CORE_PROC, 13, -1}, {71, 0, 0, 14, MAIN_CORE_PROC, 14, -1}, + {72, 0, 0, 15, MAIN_CORE_PROC, 15, -1}, {73, 0, 0, 16, MAIN_CORE_PROC, 16, -1}, + {74, 0, 0, 17, MAIN_CORE_PROC, 17, -1}, {75, 0, 0, 18, MAIN_CORE_PROC, 18, -1}, + {76, 0, 0, 55, MAIN_CORE_PROC, 55, -1}, {77, 0, 0, 19, MAIN_CORE_PROC, 19, -1}, + {78, 0, 0, 20, MAIN_CORE_PROC, 20, -1}, {79, 0, 0, 21, MAIN_CORE_PROC, 21, -1}, + {80, 0, 0, 22, MAIN_CORE_PROC, 22, -1}, {81, 0, 0, 23, MAIN_CORE_PROC, 23, -1}, + {82, 0, 0, 24, MAIN_CORE_PROC, 24, -1}, {83, 0, 0, 25, MAIN_CORE_PROC, 25, -1}, + {84, 1, 1, 26, MAIN_CORE_PROC, 26, -1}, {85, 1, 1, 27, MAIN_CORE_PROC, 27, -1}, + {86, 1, 1, 28, MAIN_CORE_PROC, 28, -1}, {87, 1, 1, 29, MAIN_CORE_PROC, 29, -1}, + {88, 1, 1, 30, MAIN_CORE_PROC, 30, -1}, {89, 1, 1, 31, MAIN_CORE_PROC, 31, -1}, + {90, 1, 1, 32, MAIN_CORE_PROC, 32, -1}, {91, 1, 1, 33, MAIN_CORE_PROC, 33, -1}, + {92, 1, 1, 34, MAIN_CORE_PROC, 34, -1}, {93, 1, 1, 35, MAIN_CORE_PROC, 35, -1}, + {94, 1, 1, 36, MAIN_CORE_PROC, 36, -1}, {95, 1, 1, 37, MAIN_CORE_PROC, 37, -1}, + {96, 1, 1, 38, MAIN_CORE_PROC, 38, -1}, {97, 1, 1, 39, MAIN_CORE_PROC, 39, -1}, + {98, 1, 1, 40, MAIN_CORE_PROC, 40, -1}, {99, 1, 1, 41, MAIN_CORE_PROC, 41, -1}, + {100, 1, 1, 42, MAIN_CORE_PROC, 42, -1}, {101, 1, 1, 43, MAIN_CORE_PROC, 43, -1}, + {102, 1, 1, 44, MAIN_CORE_PROC, 44, -1}, {103, 1, 1, 45, MAIN_CORE_PROC, 45, -1}, + {104, 1, 1, 46, MAIN_CORE_PROC, 46, -1}, {105, 1, 1, 47, MAIN_CORE_PROC, 47, -1}, + {106, 1, 1, 48, MAIN_CORE_PROC, 48, -1}, {107, 1, 1, 49, MAIN_CORE_PROC, 49, -1}, + {108, 1, 1, 50, MAIN_CORE_PROC, 50, -1}, {109, 1, 1, 51, MAIN_CORE_PROC, 51, -1}, + {110, 1, 1, 52, MAIN_CORE_PROC, 52, -1}, {111, 1, 1, 53, MAIN_CORE_PROC, 53, -1}, + }, + { + {"0,56", "0", "3500000"}, + {"1,57", "0", "3500000"}, + {"2,58", "0", "3500000"}, + {"3,59", "0", "3500000"}, + {"4,60", "0", "3500000"}, + {"5,61", "0", "3500000"}, + {"6,62", "0", "3500000"}, + {"7,63", "0", "3500000"}, + {"8,64", "0", "3500000"}, + {"9,65", "0", "3500000"}, + {"", 
"", ""}, + {"11,67", "0", "3500000"}, + {"12,68", "0", "3500000"}, + {"13,69", "0", "3500000"}, + {"14,70", "0", "3500000"}, + {"15,71", "0", "3500000"}, + {"16,72", "0", "3500000"}, + {"17,73", "0", "3500000"}, + {"18,74", "0", "3500000"}, + {"19,75", "0", "3500000"}, + {"", "", ""}, + {"21,77", "0", "3500000"}, + {"22,78", "0", "3500000"}, + {"23,79", "0", "3500000"}, + {"24,80", "0", "3500000"}, + {"25,81", "0", "3500000"}, + {"26,82", "0", "3500000"}, + {"27,83", "0", "3500000"}, + {"28,84", "1", "3500000"}, + {"29,85", "1", "3500000"}, + {"30,86", "1", "3500000"}, + {"31,87", "1", "3500000"}, + {"32,88", "1", "3500000"}, + {"33,89", "1", "3500000"}, + {"34,90", "1", "3500000"}, + {"35,91", "1", "3500000"}, + {"36,92", "1", "3500000"}, + {"37,93", "1", "3500000"}, + {"38,94", "1", "3500000"}, + {"39,95", "1", "3500000"}, + {"40,96", "1", "3500000"}, + {"41,97", "1", "3500000"}, + {"42,98", "1", "3500000"}, + {"43,99", "1", "3500000"}, + {"44,100", "1", "3500000"}, + {"45,101", "1", "3500000"}, + {"46,102", "1", "3500000"}, + {"47,103", "1", "3500000"}, + {"48,104", "1", "3500000"}, + {"49,105", "1", "3500000"}, + {"50,106", "1", "3500000"}, + {"51,107", "1", "3500000"}, + {"52,108", "1", "3500000"}, + {"53,109", "1", "3500000"}, + {"54,110", "1", "3500000"}, + {"55,111", "1", "3500000"}, + {"0,56", "0", "3500000"}, + {"1,57", "0", "3500000"}, + {"2,58", "0", "3500000"}, + {"3,59", "0", "3500000"}, + {"4,60", "0", "3500000"}, + {"5,61", "0", "3500000"}, + {"6,62", "0", "3500000"}, + {"7,63", "0", "3500000"}, + {"8,64", "0", "3500000"}, + {"9,65", "0", "3500000"}, + {"66", "0", "3500000"}, + {"11,67", "0", "3500000"}, + {"12,68", "0", "3500000"}, + {"13,69", "0", "3500000"}, + {"14,70", "0", "3500000"}, + {"15,71", "0", "3500000"}, + {"16,72", "0", "3500000"}, + {"17,73", "0", "3500000"}, + {"18,74", "0", "3500000"}, + {"19,75", "0", "3500000"}, + {"76", "0", "3500000"}, + {"21,77", "0", "3500000"}, + {"22,78", "0", "3500000"}, + {"23,79", "0", "3500000"}, + {"24,80", "0", "3500000"}, + {"25,81", "0", "3500000"}, + {"26,82", "0", "3500000"}, + {"27,83", "0", "3500000"}, + {"28,84", "1", "3500000"}, + {"29,85", "1", "3500000"}, + {"30,86", "1", "3500000"}, + {"31,87", "1", "3500000"}, + {"32,88", "1", "3500000"}, + {"33,89", "1", "3500000"}, + {"34,90", "1", "3500000"}, + {"35,91", "1", "3500000"}, + {"36,92", "1", "3500000"}, + {"37,93", "1", "3500000"}, + {"38,94", "1", "3500000"}, + {"39,95", "1", "3500000"}, + {"40,96", "1", "3500000"}, + {"41,97", "1", "3500000"}, + {"42,98", "1", "3500000"}, + {"43,99", "1", "3500000"}, + {"44,100", "1", "3500000"}, + {"45,101", "1", "3500000"}, + {"46,102", "1", "3500000"}, + {"47,103", "1", "3500000"}, + {"48,104", "1", "3500000"}, + {"49,105", "1", "3500000"}, + {"50,106", "1", "3500000"}, + {"51,107", "1", "3500000"}, + {"52,108", "1", "3500000"}, + {"53,109", "1", "3500000"}, + {"54,110", "1", "3500000"}, + {"55,111", "1", "3500000"}, + }, + { + {"0-9,11-19,21-27,56-83"}, + {"28-55,84-111"}, + }, +}; LinuxCpuMapTestCase freq_2sockets_48cores_hyperthreading = { 96, 2, @@ -987,6 +1169,7 @@ TEST_P(LinuxCpuMapFreqParserTests, LinuxFreq) {} INSTANTIATE_TEST_SUITE_P(CPUMap, LinuxCpuMapFreqParserTests, testing::Values(freq_2sockets_112cores_hyperthreading, + freq_2sockets_56cores_hyperthreading, freq_2sockets_48cores_hyperthreading, freq_2sockets_48cores_hyperthreading_1, freq_2sockets_24cores_hyperthreading, diff --git a/src/inference/tests/unit/cpu_map_parser/parser_windows.cpp b/src/inference/tests/unit/cpu_map_parser/parser_windows.cpp index 
2c87b97562e975..280594b8ec897f 100644 --- a/src/inference/tests/unit/cpu_map_parser/parser_windows.cpp +++ b/src/inference/tests/unit/cpu_map_parser/parser_windows.cpp @@ -1448,6 +1448,59 @@ WinCpuMapTestCase _2sockets_48cores = { "fffff0000"}, }; +WinCpuMapTestCase _2sockets_16cores = { + 16, + 2, + 2, + 16, + 0, + {{16, 16, 0, 0, -1, -1}, {8, 8, 0, 0, 0, 0}, {8, 8, 0, 0, 1, 1}}, + { + {0, 0, 0, 0, MAIN_CORE_PROC, 0, -1}, + {1, 0, 0, 1, MAIN_CORE_PROC, 1, -1}, + {2, 0, 0, 2, MAIN_CORE_PROC, 2, -1}, + {3, 0, 0, 3, MAIN_CORE_PROC, 3, -1}, + {4, 0, 0, 4, MAIN_CORE_PROC, 4, -1}, + {5, 0, 0, 5, MAIN_CORE_PROC, 5, -1}, + {6, 0, 0, 6, MAIN_CORE_PROC, 6, -1}, + {7, 0, 0, 7, MAIN_CORE_PROC, 7, -1}, + {8, 1, 1, 8, MAIN_CORE_PROC, 8, -1}, + {9, 1, 1, 9, MAIN_CORE_PROC, 9, -1}, + {10, 1, 1, 10, MAIN_CORE_PROC, 10, -1}, + {11, 1, 1, 11, MAIN_CORE_PROC, 11, -1}, + {12, 1, 1, 12, MAIN_CORE_PROC, 12, -1}, + {13, 1, 1, 13, MAIN_CORE_PROC, 13, -1}, + {14, 1, 1, 14, MAIN_CORE_PROC, 14, -1}, + {15, 1, 1, 15, MAIN_CORE_PROC, 15, -1}, + }, + {"0300000030000000000000000000000000000000000000000000000000000100ff00000000000000000000000000000000000000300000000" + "00000000000000000000000000000000000000000000100010000000000000000000000000000000200000038000000010840000080000002" + "0000000000000000000000000000000000000000000000ff00000000000000000000000000000002000000380000000108400000800000010" + "000000000000000000000000000000000000000000000ff000000000000000000000000000000020000003800000002044000000004000000" + "00000000000000000000000000000000000000000000ff0000000000000000000000000000000200000038000000031040000000000100000" + "0000000000000000000000000000000000000000000ff00000000000000000000000000000000000000300000000000000000000000000000" + "00000000000000000000000100020000000000000000000000000000000000000030000000000000000000000000000000000000000000000" + "00000010004000000000000000000000000000000000000003000000000000000000000000000000000000000000000000000010008000000" + "00000000000000000000000000000000300000000000000000000000000000000000000000000000000001001000000000000000000000000" + "00000000000000030000000000000000000000000000000000000000000000000000100200000000000000000000000000000000000000030" + "00000000000000000000000000000000000000000000000000010040000000000000000000000000000000000000003000000000000000000" + "00000000000000000000000000000000001008000000000000000000000000000000003000000300000000000000000000000000000000000" + "0000000000000000010000ff00000000000000000000000000000000000030000000000000000000000000000000000000000000000000000" + "10000010000000000000000000000000000020000003800000001084000008000000200000000000000000000000000000000000000000000" + "0000ff00000000000000000000000000000200000038000000010840000080000001000000000000000000000000000000000000000000000" + "000ff000000000000000000000000000002000000380000000204400000000400000000000000000000000000000000000000000000000000" + "00ff0000000000000000000000000000020000003800000003104000000000010000000000000000000000000000000000000000000000000" + "0ff00000000000000000000000000000000000030000000000000000000000000000000000000000000000000000100000200000000000000" + "00000000000000000000003000000000000000000000000000000000000000000000000000010000040000000000000000000000000000000" + "00000300000000000000000000000000000000000000000000000000001000008000000000000000000000000000000000000300000000000" + "00000000000000000000000000000000000000000100001000000000000000000000000000000000000030000000000000000000000000000" + 
"00000000000000000000000010000200000000000000000000000000000000000003000000000000000000000000000000000000000000000" + "00000001000040000000000000000000000000000000000000300000000000000000000000000000000000000000000000000001000080000" + "00000000000000000000000000100000030000000000000000000000000000000000000000000000000000000ffff00000000000000000000" + "00000000040000005000000001000100000000000000000000000000000000000000000010100000000000000000000000000000000000000" + "000000000000000000000000000000000000000ffff000000000000"}, +}; + WinCpuMapTestCase _1sockets_24cores_hyperthreading_set1 = { 32, 1, @@ -2328,6 +2381,7 @@ INSTANTIATE_TEST_SUITE_P(CPUMap, _2sockets_48cores_hyperthreading, _2sockets_36cores_hyperthreading, _2sockets_48cores, + _2sockets_16cores, _1sockets_24cores_hyperthreading_set1, _1sockets_24cores_hyperthreading_set2, _1sockets_22cores_hyperthreading, diff --git a/src/plugins/auto/tests/functional/behavior/caching_test.cpp b/src/plugins/auto/tests/functional/behavior/caching_test.cpp index 1ef107cd59991f..196d2519250a5d 100644 --- a/src/plugins/auto/tests/functional/behavior/caching_test.cpp +++ b/src/plugins/auto/tests/functional/behavior/caching_test.cpp @@ -190,4 +190,4 @@ TEST_F(AutoFuncTests, compiled_with_cache_enabled_batch_enabled) { ASSERT_EQ(ov::test::utils::listFilesWithExt(cache_path, "blob").size(), 5); core.set_property(ov::cache_dir("")); #endif -} \ No newline at end of file +} diff --git a/src/plugins/auto_batch/src/plugin.hpp b/src/plugins/auto_batch/src/plugin.hpp index 37a777cc970b6a..563ba4487ee3ec 100644 --- a/src/plugins/auto_batch/src/plugin.hpp +++ b/src/plugins/auto_batch/src/plugin.hpp @@ -68,4 +68,4 @@ class Plugin : public ov::IPlugin { mutable ov::AnyMap m_plugin_config; }; } // namespace autobatch_plugin -} // namespace ov \ No newline at end of file +} // namespace ov diff --git a/src/plugins/intel_cpu/CMakeLists.txt b/src/plugins/intel_cpu/CMakeLists.txt index 04909c7d8f5a5a..aa6ce49a051e00 100644 --- a/src/plugins/intel_cpu/CMakeLists.txt +++ b/src/plugins/intel_cpu/CMakeLists.txt @@ -242,7 +242,8 @@ ov_add_plugin(NAME ${TARGET_NAME} DEVICE_NAME "CPU" AS_EXTENSION VERSION_DEFINES_FOR src/plugin.cpp - SOURCES ${SOURCES} ${HEADERS}) + SOURCES ${SOURCES} ${HEADERS} + ADD_CLANG_FORMAT) # give a different file name depending on target platform architecture if(ARM OR AARCH64) @@ -277,6 +278,24 @@ target_include_directories(${TARGET_NAME} SYSTEM PRIVATE $) +# ARCH lists for softmax.cpp and mha_single_token.cpp +# Based on result of above calls, decide whether to add SVE +set(SOFTMAX_ARCH_LIST AVX512F AVX2) +set(MHA_SINGLE_TOKEN_ARCH_LIST AVX512F AVX2) + +if(ENABLE_NEON_FP16) + list(APPEND SOFTMAX_ARCH_LIST NEON_FP16) + list(APPEND MHA_SINGLE_TOKEN_ARCH_LIST NEON_FP16) +endif() + +if(ENABLE_SVE) + list(APPEND SOFTMAX_ARCH_LIST SVE) + list(APPEND MHA_SINGLE_TOKEN_ARCH_LIST SVE) +endif() + +list(APPEND SOFTMAX_ARCH_LIST ANY) +list(APPEND MHA_SINGLE_TOKEN_ARCH_LIST ANY) + # Cross compiled function # TODO: The same for proposal, proposalONNX, topk cross_compiled_file(${TARGET_NAME} @@ -287,14 +306,14 @@ cross_compiled_file(${TARGET_NAME} NAMESPACE ov::Extensions::Cpu::XARCH ) cross_compiled_file(${TARGET_NAME} - ARCH AVX512F AVX2 NEON_FP16 ANY + ARCH ${SOFTMAX_ARCH_LIST} src/nodes/kernels/scaled_attn/softmax.cpp API src/nodes/kernels/scaled_attn/softmax.hpp NAME attn_softmax NAMESPACE ov::Extensions::Cpu::XARCH ) cross_compiled_file(${TARGET_NAME} - ARCH AVX512F AVX2 NEON_FP16 ANY + ARCH ${MHA_SINGLE_TOKEN_ARCH_LIST} 
src/nodes/kernels/scaled_attn/mha_single_token.cpp API src/nodes/kernels/scaled_attn/mha_single_token.hpp NAME mha_single_token diff --git a/src/plugins/intel_cpu/src/cache/cache_entry.h b/src/plugins/intel_cpu/src/cache/cache_entry.h index 135a1090a60045..6e71e207b0a71c 100644 --- a/src/plugins/intel_cpu/src/cache/cache_entry.h +++ b/src/plugins/intel_cpu/src/cache/cache_entry.h @@ -4,8 +4,9 @@ #pragma once -#include #include +#include + #include "lru_cache.h" namespace ov { @@ -13,27 +14,24 @@ namespace intel_cpu { class CacheEntryBase { public: - enum class LookUpStatus : int8_t { - Hit, - Miss - }; + enum class LookUpStatus : int8_t { Hit, Miss }; + public: virtual ~CacheEntryBase() = default; }; /** * @brief Class represents a templated record in multi cache - * @tparam KeyType is a key type that must define hash() const method with return type convertible to size_t and define comparison operator. + * @tparam KeyType is a key type that must define hash() const method with return type convertible to size_t and define + * comparison operator. * @tparam ValType is a type that must meet all the requirements to the std::unordered_map mapped type - * @tparam ImplType is a type for the internal storage. It must provide put(KeyType, ValueType) and ValueType get(const KeyType&) - * interface and must have constructor of type ImplType(size_t). + * @tparam ImplType is a type for the internal storage. It must provide put(KeyType, ValueType) and ValueType get(const + * KeyType&) interface and must have constructor of type ImplType(size_t). * * @note In this implementation default constructed value objects are treated as empty objects. */ -template> +template > class CacheEntry : public CacheEntryBase { public: using ResultType = std::pair; @@ -42,11 +40,12 @@ class CacheEntry : public CacheEntryBase { explicit CacheEntry(size_t capacity) : _impl(capacity) {} /** - * @brief Searches the key in the underlying storage and returns value if it exists, or creates a value using the builder functor and adds it to - * the underlying storage. + * @brief Searches the key in the underlying storage and returns value if it exists, or creates a value using the + * builder functor and adds it to the underlying storage. * @param key is the search key * @param builder is a callable object that creates the ValType object from the KeyType lval reference - * @return result of the operation which is a pair of the requested object of ValType and the status of whether the cache hit or miss occurred + * @return result of the operation which is a pair of the requested object of ValType and the status of whether the + * cache hit or miss occurred */ ResultType getOrCreate(const KeyType& key, std::function builder) { @@ -70,5 +69,5 @@ class CacheEntry : public CacheEntryBase { ImplType _impl; }; -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/cache/lru_cache.h b/src/plugins/intel_cpu/src/cache/lru_cache.h index 792451da16c484..c3a4d47aa9de9f 100644 --- a/src/plugins/intel_cpu/src/cache/lru_cache.h +++ b/src/plugins/intel_cpu/src/cache/lru_cache.h @@ -10,7 +10,8 @@ /** * @brief This is yet another implementation of a preemptive cache with LRU eviction policy. - * @tparam Key is a key type that must define hash() const method with return type convertible to size_t and define comparison operator. + * @tparam Key is a key type that must define hash() const method with return type convertible to size_t and define + * comparison operator. 
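// Editor's note -- an illustrative sketch, not part of this diff: a minimal key type that
// satisfies the LruCache requirements documented above (a hash() method whose result is
// convertible to size_t plus an equality operator), together with a put()/get() round trip.
// The ShapeKey name, its field, the hash-combining scheme, and the capacity-taking
// constructor are assumptions made for the example.
#include <cstddef>
#include <functional>
#include <string>
#include <vector>
#include "lru_cache.h"  // assumes the header shown above is on the include path

struct ShapeKey {
    std::vector<size_t> dims;
    size_t hash() const {
        size_t seed = dims.size();
        for (auto d : dims)
            seed = seed * 31 + std::hash<size_t>{}(d);  // simple combine, sufficient for a demo
        return seed;
    }
    bool operator==(const ShapeKey& rhs) const {
        return dims == rhs.dims;
    }
};

void lru_cache_demo() {
    ov::intel_cpu::LruCache<ShapeKey, std::string> cache(2);  // room for two records
    cache.put(ShapeKey{{1, 3, 224, 224}}, "plan_a");
    cache.put(ShapeKey{{1, 3, 299, 299}}, "plan_b");
    auto hit = cache.get(ShapeKey{{1, 3, 224, 224}});  // "plan_a", promoted to most recently used
    cache.put(ShapeKey{{8, 3, 224, 224}}, "plan_c");   // evicts the least recently used ("plan_b")
    auto miss = cache.get(ShapeKey{{1, 3, 299, 299}}); // default-constructed std::string
    (void)hit;
    (void)miss;
}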
* @tparam Value is a type that must meet all the requirements to the std::unordered_map mapped type * * @attention This cache implementation IS NOT THREAD SAFE! @@ -19,7 +20,7 @@ namespace ov { namespace intel_cpu { -template +template class LruCache { public: using value_type = std::pair; @@ -33,7 +34,7 @@ class LruCache { * @param value */ - void put(const Key &key, const Value &val) { + void put(const Key& key, const Value& val) { if (0 == _capacity) { return; } @@ -56,7 +57,7 @@ class LruCache { * @return Value associated with the key or default constructed instance of the Value type. */ - Value get(const Key &key) { + Value get(const Key& key) { auto itr = _cacheMapper.find(key); if (itr == _cacheMapper.end()) { return Value(); @@ -82,13 +83,13 @@ class LruCache { * @brief Returns the current capacity value * @return the current capacity value */ - size_t getCapacity() const noexcept { - return _capacity; - } + size_t getCapacity() const noexcept { + return _capacity; + } private: struct key_hasher { - std::size_t operator()(const Key &k) const { + std::size_t operator()(const Key& k) const { return k.hash(); } }; @@ -105,5 +106,5 @@ class LruCache { size_t _capacity; }; -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/cache/multi_cache.cpp b/src/plugins/intel_cpu/src/cache/multi_cache.cpp index 29dad18a41c770..325dfb517831b5 100644 --- a/src/plugins/intel_cpu/src/cache/multi_cache.cpp +++ b/src/plugins/intel_cpu/src/cache/multi_cache.cpp @@ -9,5 +9,5 @@ namespace intel_cpu { std::atomic_size_t MultiCache::_typeIdCounter{0}; -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/cache/multi_cache.h b/src/plugins/intel_cpu/src/cache/multi_cache.h index d9b6e5f8bfe19a..e216efe6fea801 100644 --- a/src/plugins/intel_cpu/src/cache/multi_cache.h +++ b/src/plugins/intel_cpu/src/cache/multi_cache.h @@ -4,9 +4,10 @@ #pragma once +#include #include #include -#include + #include "cache_entry.h" namespace ov { @@ -20,27 +21,28 @@ namespace intel_cpu { class MultiCache { public: - template + template using EntryTypeT = CacheEntry; using EntryBasePtr = std::shared_ptr; - template + template using EntryPtr = std::shared_ptr>; public: /** - * @param capacity here means maximum records limit FOR EACH entry specified by a pair of Key/Value types. - * @note zero capacity means empty cache so no records are stored and no entries are created - */ + * @param capacity here means maximum records limit FOR EACH entry specified by a pair of Key/Value types. + * @note zero capacity means empty cache so no records are stored and no entries are created + */ explicit MultiCache(size_t capacity) : _capacity(capacity) {} /** - * @brief Searches a value of ValueType in the cache using the provided key or creates a new ValueType instance (if nothing was found) - * using the key and the builder functor and adds the new record to the cache - * @param key is the search key - * @param builder is a callable object that creates the ValType object from the KeyType lval reference. 
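// Editor's note -- an illustrative sketch, not part of this diff: the MultiCache hunks above
// hand out a stable id per (Key, Value) instantiation via a function-local static that is
// initialized from an atomic counter; the id then selects the matching CacheEntry inside the
// type-erased storage map. A stripped-down, standalone model of just that registration trick:
#include <atomic>
#include <cstddef>

static std::atomic_size_t g_type_id_counter{0};  // counterpart of MultiCache::_typeIdCounter

template <typename T>
size_t type_id() {
    // Initialized exactly once per distinct T (thread-safe since C++11), on the first call.
    static size_t id = g_type_id_counter.fetch_add(1);
    return id;
}

// type_id<int>() always returns the same value, and differs from type_id<float>(), so the
// result can index a per-type slot in an unordered_map -- which is what MultiCache does with
// its _storage member when looking up or creating the entry for a given Key/Value pair.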
- * Also the builder type is used for the ValueType deduction - * @return result of the operation which is a pair of the requested object of ValType and the status of whether the cache hit or miss occurred - */ + * @brief Searches a value of ValueType in the cache using the provided key or creates a new ValueType instance (if + * nothing was found) using the key and the builder functor and adds the new record to the cache + * @param key is the search key + * @param builder is a callable object that creates the ValType object from the KeyType lval reference. + * Also the builder type is used for the ValueType deduction + * @return result of the operation which is a pair of the requested object of ValType and the status of whether the + * cache hit or miss occurred + */ template 201703L)) || (defined(__cplusplus) && (__cplusplus > 201703L)) @@ -54,9 +56,9 @@ class MultiCache { } private: - template + template size_t getTypeId(); - template + template EntryPtr getEntry(); private: @@ -65,13 +67,13 @@ class MultiCache { std::unordered_map _storage; }; -template +template size_t MultiCache::getTypeId() { static size_t id = _typeIdCounter.fetch_add(1); return id; } -template +template MultiCache::EntryPtr MultiCache::getEntry() { using EntryType = EntryTypeT; size_t id = getTypeId(); @@ -88,5 +90,5 @@ using MultiCacheWeakCPtr = std::weak_ptr; using MultiCachePtr = std::shared_ptr; using MultiCacheCPtr = std::shared_ptr; -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/compiled_model.cpp b/src/plugins/intel_cpu/src/compiled_model.cpp index bbee5d937be5d5..f81c7dbbced99d 100644 --- a/src/plugins/intel_cpu/src/compiled_model.cpp +++ b/src/plugins/intel_cpu/src/compiled_model.cpp @@ -3,29 +3,30 @@ // #include "compiled_model.h" + +#include +#include + #include "async_infer_request.h" +#include "cpu/x64/cpu_isa_traits.hpp" #include "infer_request.h" #include "itt.h" #include "low_precision/low_precision.hpp" #include "memory_state.h" #include "openvino/core/type/element_type.hpp" #include "openvino/runtime/intel_cpu/properties.hpp" -#include "openvino/runtime/threading/executor_manager.hpp" -#include "transformations/transformation_pipeline.h" #include "openvino/runtime/properties.hpp" -#include "openvino/util/common_util.hpp" +#include "openvino/runtime/threading/cpu_message.hpp" #include "openvino/runtime/threading/cpu_streams_executor.hpp" -#include "transformations/utils/utils.hpp" #include "openvino/runtime/threading/cpu_streams_info.hpp" -#include "openvino/runtime/threading/cpu_message.hpp" +#include "openvino/runtime/threading/executor_manager.hpp" +#include "openvino/util/common_util.hpp" +#include "transformations/transformation_pipeline.h" +#include "transformations/utils/utils.hpp" #include "utils/serialize.hpp" -#include "cpu/x64/cpu_isa_traits.hpp" -#include -#include - #if defined(OV_CPU_WITH_ACL) -#include "nodes/executors/acl/acl_ie_scheduler.hpp" +# include "nodes/executors/acl/acl_ie_scheduler.hpp" #endif using namespace ov::threading; @@ -183,7 +184,6 @@ CompiledModel::GraphGuard::Lock CompiledModel::get_graph() const { } std::shared_ptr CompiledModel::create_sync_infer_request() const { - m_numRequests++; return std::make_shared(std::static_pointer_cast(shared_from_this())); } @@ -329,8 +329,7 @@ ov::Any CompiledModel::get_property(const std::string& name) const { return decltype(ov::intel_cpu::sparse_weights_decompression_rate)::value_type( config.fcSparseWeiDecompressionRate); } else if (name == 
ov::hint::dynamic_quantization_group_size) { - return decltype(ov::hint::dynamic_quantization_group_size)::value_type( - config.fcDynamicQuantizationGroupSize); + return decltype(ov::hint::dynamic_quantization_group_size)::value_type(config.fcDynamicQuantizationGroupSize); } else if (name == ov::hint::kv_cache_precision) { return decltype(ov::hint::kv_cache_precision)::value_type(config.kvCachePrecision); } @@ -344,8 +343,12 @@ void CompiledModel::export_model(std::ostream& modelStream) const { void CompiledModel::release_memory() { for (auto&& graph : m_graphs) { - GraphGuard::Lock graph_lock{graph}; - auto ctx = graph_lock._graph.getGraphContext(); + // try to lock mutex, since it may be already locked (e.g by an infer request) + std::unique_lock lock(graph._mutex, std::try_to_lock); + OPENVINO_ASSERT(lock.owns_lock(), + "Attempt to call release_memory() on a compiled model in a busy state. Please ensure that all " + "infer requests are completed before releasing memory."); + auto ctx = graph.getGraphContext(); ctx->getNetworkMemoryControl()->releaseMemory(); } } diff --git a/src/plugins/intel_cpu/src/compiled_model.h b/src/plugins/intel_cpu/src/compiled_model.h index faedf1ae5a744c..f7d2903b0526cf 100644 --- a/src/plugins/intel_cpu/src/compiled_model.h +++ b/src/plugins/intel_cpu/src/compiled_model.h @@ -20,6 +20,15 @@ namespace ov { namespace intel_cpu { class CompiledModel : public ov::ICompiledModel { +public: + struct GraphGuard : public Graph { + std::mutex _mutex; + struct Lock : public std::unique_lock { + explicit Lock(GraphGuard& graph) : std::unique_lock(graph._mutex), _graph(graph) {} + GraphGuard& _graph; + }; + }; + public: typedef std::shared_ptr Ptr; @@ -51,9 +60,13 @@ class CompiledModel : public ov::ICompiledModel { void release_memory() override; + std::string name() const { + return m_name; + } + private: std::shared_ptr create_sync_infer_request() const override; - friend class SyncInferRequest; + friend class CompiledModelHolder; const std::shared_ptr m_model; const std::shared_ptr m_plugin; @@ -66,13 +79,6 @@ class CompiledModel : public ov::ICompiledModel { Config m_cfg; mutable std::atomic_int m_numRequests = {0}; std::string m_name; - struct GraphGuard : public Graph { - std::mutex _mutex; - struct Lock : public std::unique_lock { - explicit Lock(GraphGuard& graph) : std::unique_lock(graph._mutex), _graph(graph) {} - GraphGuard& _graph; - }; - }; const bool m_loaded_from_cache; // WARNING: Do not use m_graphs directly. 
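// Editor's note -- an illustrative sketch, not part of this diff: release_memory() above now
// refuses to block behind a running infer request; it tries the graph mutex and asserts if it
// is already held. A minimal model of that try-lock pattern, with hypothetical stand-in names
// for GraphGuard and the network memory control:
#include <mutex>
#include <stdexcept>

struct GuardedGraph {
    std::mutex mutex;             // counterpart of GraphGuard::_mutex
    void release_all_memory() {}  // stand-in for ctx->getNetworkMemoryControl()->releaseMemory()
};

void try_release(GuardedGraph& graph) {
    std::unique_lock<std::mutex> lock(graph.mutex, std::try_to_lock);
    if (!lock.owns_lock()) {
        // Mirrors the intent of the OPENVINO_ASSERT: the model is busy, so releasing memory
        // now would race with an in-flight request instead of silently waiting for it.
        throw std::runtime_error("release_memory() called while infer requests are still running");
    }
    graph.release_all_memory();
}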
@@ -94,5 +100,59 @@ class CompiledModel : public ov::ICompiledModel { bool m_has_sub_compiled_models = false; }; -} // namespace intel_cpu -} // namespace ov +// This class provides safe access to the internal CompiledModel structures and helps to decouple SyncInferRequest and +// the CompiledModel internal structures +class CompiledModelHolder { +public: + CompiledModelHolder(std::shared_ptr compiled_model) + : m_compiled_model(std::move(compiled_model)) { + OPENVINO_ASSERT(!m_compiled_model->m_graphs.empty(), + "No graph was found in the compiled model: ", + m_compiled_model->name()); + m_graph = &(m_compiled_model->get_graph()._graph); + m_id = (m_compiled_model->m_numRequests)++; + } + + ~CompiledModelHolder() { + if (m_compiled_model) { + --(m_compiled_model->m_numRequests); + } + } + + CompiledModelHolder(const CompiledModelHolder&) = delete; + CompiledModelHolder& operator=(const CompiledModelHolder&) = delete; + + CompiledModelHolder(CompiledModelHolder&&) = default; + CompiledModelHolder& operator=(CompiledModelHolder&&) = default; + + const Graph& graph() const { + return *m_graph; + } + + CompiledModel::GraphGuard::Lock lock() { + auto lock = m_compiled_model->get_graph(); + m_graph = &(lock._graph); + OPENVINO_ASSERT(m_graph, "Graph ptr null check failed"); + return lock; + } + + std::string name() const { + return m_compiled_model->name(); + } + + std::shared_ptr compiled_model() const { + return m_compiled_model; + } + + int id() const { + return m_id; + } + +private: + std::shared_ptr m_compiled_model; + const Graph* m_graph; + int m_id; +}; + +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/config.cpp b/src/plugins/intel_cpu/src/config.cpp index 7ce4c1069e695d..7d1ee05897e81d 100644 --- a/src/plugins/intel_cpu/src/config.cpp +++ b/src/plugins/intel_cpu/src/config.cpp @@ -4,19 +4,19 @@ #include "config.h" +#include +#include +#include + #include "cpu/x64/cpu_isa_traits.hpp" #include "openvino/core/parallel.hpp" #include "openvino/core/type/element_type_traits.hpp" #include "openvino/runtime/intel_cpu/properties.hpp" #include "openvino/runtime/internal_properties.hpp" #include "openvino/runtime/properties.hpp" +#include "utils/cpu_utils.hpp" #include "utils/debug_capabilities.h" #include "utils/precision_support.h" -#include "utils/cpu_utils.hpp" - -#include -#include -#include namespace ov { namespace intel_cpu { @@ -61,9 +61,7 @@ Config::Config() { */ void Config::applyDebugCapsProperties() { // always enable perf counters for verbose, performance summary and average counters - if (!debugCaps.verbose.empty() || - !debugCaps.summaryPerf.empty() || - !debugCaps.averageCountersPath.empty()) { + if (!debugCaps.verbose.empty() || !debugCaps.summaryPerf.empty() || !debugCaps.averageCountersPath.empty()) { collectPerfCounters = true; } } @@ -151,10 +149,10 @@ void Config::readProperties(const ov::AnyMap& prop, const ModelType modelType) { logLevel = val.as(); } catch (const ov::Exception&) { OPENVINO_THROW("Wrong value ", - val.as(), - " for property key ", - key, - ". Expected only ov::log::Level::NO/ERR/WARNING/INFO/DEBUG/TRACE."); + val.as(), + " for property key ", + key, + ". 
Expected only ov::log::Level::NO/ERR/WARNING/INFO/DEBUG/TRACE."); } } else if (key == ov::hint::num_requests.name()) { try { @@ -243,8 +241,8 @@ void Config::readProperties(const ov::AnyMap& prop, const ModelType modelType) { fcDynamicQuantizationGroupSize = val.as(); } catch (const ov::Exception&) { OPENVINO_THROW("Wrong value for property key ", - ov::hint::dynamic_quantization_group_size.name(), - ". Expected only unsinged integer numbers"); + ov::hint::dynamic_quantization_group_size.name(), + ". Expected only unsinged integer numbers"); } } else if (key == ov::enable_profiling.name()) { try { @@ -366,7 +364,7 @@ void Config::readProperties(const ov::AnyMap& prop, const ModelType modelType) { if (one_of(prec, ov::element::f32, ov::element::f16, ov::element::bf16, ov::element::u8)) { kvCachePrecision = prec; } else { - OPENVINO_THROW("invalid value"); + OPENVINO_THROW("invalid value"); } } catch (ov::Exception&) { OPENVINO_THROW("Wrong value ", @@ -460,5 +458,19 @@ void Config::updateProperties() { _config.insert({ov::hint::num_requests.name(), std::to_string(hintNumRequests)}); } +void Config::applyRtInfo(const std::shared_ptr& model) { + // if user sets explicitly, it will be higher priority than rt_info + if (!kvCachePrecisionSetExplicitly && + model->has_rt_info({"runtime_options", ov::hint::kv_cache_precision.name()})) { + this->kvCachePrecision = + model->get_rt_info({"runtime_options", ov::hint::kv_cache_precision.name()}); + } + if (!fcDynamicQuantizationGroupSizeSetExplicitly && + model->has_rt_info({"runtime_options", ov::hint::dynamic_quantization_group_size.name()})) { + this->fcDynamicQuantizationGroupSize = + model->get_rt_info({"runtime_options", ov::hint::dynamic_quantization_group_size.name()}); + } +} + } // namespace intel_cpu } // namespace ov diff --git a/src/plugins/intel_cpu/src/config.h b/src/plugins/intel_cpu/src/config.h index 5f4bb25ede350e..1aa08f4412f0b3 100644 --- a/src/plugins/intel_cpu/src/config.h +++ b/src/plugins/intel_cpu/src/config.h @@ -4,18 +4,17 @@ #pragma once +#include +#include +#include + +#include "internal_properties.hpp" #include "openvino/core/type/element_type.hpp" #include "openvino/runtime/properties.hpp" #include "openvino/runtime/threading/istreams_executor.hpp" #include "openvino/util/common_util.hpp" - -#include "internal_properties.hpp" #include "utils/debug_caps_config.h" -#include -#include -#include - namespace ov { namespace intel_cpu { struct Config { @@ -38,11 +37,7 @@ struct Config { Disable, }; - enum class ModelType { - CNN, - LLM, - Unknown - }; + enum class ModelType { CNN, LLM, Unknown }; bool collectPerfCounters = false; bool exclusiveAsyncRequests = false; @@ -69,7 +64,8 @@ struct Config { bool streamsChanged = false; int threads = 0; int threadsPerStream = 0; - ov::threading::IStreamsExecutor::ThreadBindingType threadBindingType = ov::threading::IStreamsExecutor::ThreadBindingType::NONE; + ov::threading::IStreamsExecutor::ThreadBindingType threadBindingType = + ov::threading::IStreamsExecutor::ThreadBindingType::NONE; ov::hint::PerformanceMode hintPerfMode = ov::hint::PerformanceMode::LATENCY; std::vector> streamsRankTable; bool changedHintPerfMode = false; @@ -106,6 +102,8 @@ struct Config { void updateProperties(); + void applyRtInfo(const std::shared_ptr& model); + std::map _config; int modelPreferThreads = -1; @@ -120,4 +118,4 @@ struct Config { }; } // namespace intel_cpu -} // namespace ov +} // namespace ov diff --git a/src/plugins/intel_cpu/src/cpu_memory.cpp b/src/plugins/intel_cpu/src/cpu_memory.cpp index 
8e5fe8d72fd1f2..7cb4abc2161f14 100644 --- a/src/plugins/intel_cpu/src/cpu_memory.cpp +++ b/src/plugins/intel_cpu/src/cpu_memory.cpp @@ -3,14 +3,17 @@ // #include "cpu_memory.h" -#include "memory_desc/cpu_memory_desc_utils.h" + #include -#include "nodes/reorder.h" + +#include "memory_desc/cpu_memory_desc_utils.h" #include "nodes/common/cpu_memcpy.h" +#include "nodes/reorder.h" #include "utils/debug_capabilities.h" #if defined(__linux__) # include /* Definition of SYS_* constants */ # include + # include /* strerror(errno) */ #endif @@ -27,69 +30,72 @@ BlockedMemoryDescPtr IMemory::getDescWithType() const { } namespace { - inline void setSubnormalsToZero(float *data, size_t size) { - uint32_t *u32data = reinterpret_cast(data); - for (size_t i = 0; i < size; ++i) { - if ((u32data[i] & (0xFF << 23)) == 0) { - u32data[i] = 0; - } +inline void setSubnormalsToZero(float* data, size_t size) { + uint32_t* u32data = reinterpret_cast(data); + for (size_t i = 0; i < size; ++i) { + if ((u32data[i] & (0xFF << 23)) == 0) { + u32data[i] = 0; } } +} - void transferData(const IMemory& src, const IMemory& dst, bool ftz) { - node::Reorder::reorderData(src, dst); +void transferData(const IMemory& src, const IMemory& dst, bool ftz) { + node::Reorder::reorderData(src, dst); - if (!ftz) { - return; - } - if (src.getDesc().getPrecision() != ov::element::f32 || dst.getDesc().getPrecision() == ov::element::bf16) { + if (!ftz) { + return; + } + if (src.getDesc().getPrecision() != ov::element::f32 || dst.getDesc().getPrecision() == ov::element::bf16) { + return; + } + size_t offset = 0; + if (dst.getDesc().getType() & MemoryDescType::Dnnl) { + // here we can safely cast to DnnlMemoryDesc + auto dnnl_desc = dst.getDescWithType(); + auto desc = dnnl_desc->getDnnlDesc(); + dnnl::impl::memory_desc_wrapper wrapper(desc.get()); + offset = wrapper.offset0(); + if (wrapper.is_wino_desc() || wrapper.is_rnn_packed_desc()) { return; } - size_t offset = 0; - if (dst.getDesc().getType() & MemoryDescType::Dnnl) { - // here we can safely cast to DnnlMemoryDesc - auto dnnl_desc = dst.getDescWithType(); - auto desc = dnnl_desc->getDnnlDesc(); - dnnl::impl::memory_desc_wrapper wrapper(desc.get()); - offset = wrapper.offset0(); - if (wrapper.is_wino_desc() || wrapper.is_rnn_packed_desc()) { - return; - } - } - // actual FTZ - auto* memData = static_cast(dst.getData()); - memData += offset; - setSubnormalsToZero(memData, dst.getSize() / sizeof(float)); } + // actual FTZ + auto* memData = static_cast(dst.getData()); + memData += offset; + setSubnormalsToZero(memData, dst.getSize() / sizeof(float)); +} -} // namespace +} // namespace -Memory::Memory(const dnnl::engine& eng, MemoryDescPtr desc, const void* data, bool pads_zeroing) : - m_eng(eng), - m_pMemDesc(desc), - m_blockHandle(std::make_shared(make_unique()), this), - dnnlMemHandle(this) { - if (desc->getPrecision() == element::string) { - OPENVINO_THROW("[CPU] Memory object cannot be created for string data."); - } - create(m_pMemDesc, data, pads_zeroing); +Memory::Memory(const dnnl::engine& eng, MemoryDescPtr desc, const void* data, bool pads_zeroing) + : m_eng(eng), + m_pMemDesc(desc), + m_blockHandle(std::make_shared(make_unique()), this), + dnnlMemHandle(this) { + if (desc->getPrecision() == element::string) { + OPENVINO_THROW("[CPU] Memory object cannot be created for string data."); } + create(m_pMemDesc, data, pads_zeroing); +} -Memory::Memory(const dnnl::engine& eng, const MemoryDesc& desc, const void* data, bool pads_zeroing) : - Memory::Memory(eng, desc.clone(), data, 
pads_zeroing) {} - -Memory::Memory(const dnnl::engine& eng, MemoryDescPtr desc, MemoryBlockPtr block) : - m_eng(eng), m_pMemDesc(desc), m_blockHandle(block, this), dnnlMemHandle(this) { - if (desc->getPrecision() == element::string) { - OPENVINO_THROW("[CPU] Memory object can't be created for string data."); - } - bool memAllocated = m_blockHandle->getRawPtr(); +Memory::Memory(const dnnl::engine& eng, const MemoryDesc& desc, const void* data, bool pads_zeroing) + : Memory::Memory(eng, desc.clone(), data, pads_zeroing) {} - create(desc, nullptr, !memAllocated); +Memory::Memory(const dnnl::engine& eng, MemoryDescPtr desc, MemoryBlockPtr block) + : m_eng(eng), + m_pMemDesc(desc), + m_blockHandle(block, this), + dnnlMemHandle(this) { + if (desc->getPrecision() == element::string) { + OPENVINO_THROW("[CPU] Memory object can't be created for string data."); } + bool memAllocated = m_blockHandle->getRawPtr(); -Memory::Memory(const dnnl::engine& eng, const MemoryDesc& desc, MemoryBlockPtr block) : - Memory::Memory(eng, desc.clone(), block) {} + create(desc, nullptr, !memAllocated); +} + +Memory::Memory(const dnnl::engine& eng, const MemoryDesc& desc, MemoryBlockPtr block) + : Memory::Memory(eng, desc.clone(), block) {} size_t Memory::getSize() const { auto size = getDesc().getCurrentMemSize(); @@ -99,7 +105,7 @@ size_t Memory::getSize() const { return size; } -void Memory::create(const MemoryDesc &desc, const void *data, bool pads_zeroing) { +void Memory::create(const MemoryDesc& desc, const void* data, bool pads_zeroing) { create(desc.clone(), data, pads_zeroing); } @@ -187,9 +193,7 @@ dnnl::memory Memory::DnnlMemPrimHandle::getPrim() const { void* Memory::getData() const { void* data = getDataNoThrow(); - if (data == nullptr && - m_pMemDesc->getShape().isStatic() && - m_pMemDesc->getShape().getElementsCount() != 0) + if (data == nullptr && m_pMemDesc->getShape().isStatic() && m_pMemDesc->getShape().getElementsCount() != 0) OPENVINO_THROW("Memory has not been allocated"); return data; } @@ -198,7 +202,7 @@ void* MemoryBlockWithReuse::getRawPtr() const noexcept { return m_data.get(); } -void MemoryBlockWithReuse::setExtBuff(void *ptr, size_t size) { +void MemoryBlockWithReuse::setExtBuff(void* ptr, size_t size) { m_useExternalStorage = true; m_memUpperBound = size; m_data = decltype(m_data)(ptr, release); @@ -208,7 +212,7 @@ bool MemoryBlockWithReuse::resize(size_t size) { constexpr int cacheLineSize = 64; bool sizeChanged = false; if (size > m_memUpperBound) { - void *ptr = dnnl::impl::malloc(size, cacheLineSize); + void* ptr = dnnl::impl::malloc(size, cacheLineSize); if (!ptr) { OPENVINO_THROW("Failed to allocate ", size, " bytes of memory"); } @@ -236,15 +240,17 @@ void MemoryBlockWithReuse::free() { m_useExternalStorage = false; } -void MemoryBlockWithReuse::release(void *ptr) {} +void MemoryBlockWithReuse::release(void* ptr) {} -void MemoryBlockWithReuse::destroy(void *ptr) { +void MemoryBlockWithReuse::destroy(void* ptr) { dnnl::impl::free(ptr); } /////////////// StringMemory /////////////// -StringMemory::StringMemory(const dnnl::engine& engine, const MemoryDescPtr& desc, const void* data) : m_engine(engine), m_mem_desc(desc) { +StringMemory::StringMemory(const dnnl::engine& engine, const MemoryDescPtr& desc, const void* data) + : m_engine(engine), + m_mem_desc(desc) { if (m_mem_desc->getPrecision() != element::string) { OPENVINO_THROW("[CPU] StringMemory supports String type only."); } @@ -258,8 +264,8 @@ StringMemory::StringMemory(const dnnl::engine& engine, const MemoryDescPtr& desc 
const auto string_size = m_mem_desc->getShape().getElementsCount(); if (data != nullptr) { - auto not_const_data = const_cast(data); - m_memoryBlock->setExtBuff(reinterpret_cast(not_const_data), string_size); + auto not_const_data = const_cast(data); + m_memoryBlock->setExtBuff(reinterpret_cast(not_const_data), string_size); } else { m_memoryBlock->resize(string_size); } @@ -273,7 +279,7 @@ void StringMemory::load(const IMemory& src, bool ftz) const { transferData(src, *this, false); } -void* StringMemory::getData() const { +void* StringMemory::getData() const { return m_memoryBlock->getRawPtr(); } @@ -297,7 +303,7 @@ void StringMemory::nullify() { } } -size_t StringMemory::getSize() const { // In bytes +size_t StringMemory::getSize() const { // In bytes auto size = getDesc().getCurrentMemSize(); if (size == MemoryDesc::UNDEFINED_SIZE) { OPENVINO_THROW("Can't get memory size for undefined shape."); @@ -329,7 +335,7 @@ bool StringMemory::StringMemoryBlock::resize(size_t size) { if (size > PTRDIFF_MAX) { OPENVINO_THROW("Requested allocation size { ", size, " } exceeds PTRDIFF_MAX."); } - auto ptr_size = static_cast(size); // WA for warning alloc-size-larger-than + auto ptr_size = static_cast(size); // WA for warning alloc-size-larger-than auto ptr = new OvString[ptr_size]; if (!ptr) { OPENVINO_THROW("Failed to allocate ", size, " bytes of memory"); @@ -355,7 +361,7 @@ void StringMemory::StringMemoryBlock::destroy(OvString* ptr) { } void* StringMemory::StringMemoryBlock::getRawPtr() const noexcept { - return reinterpret_cast(m_data.get()); + return reinterpret_cast(m_data.get()); } /////////////// DnnlMemoryBlock /////////////// @@ -364,7 +370,7 @@ void* DnnlMemoryBlock::getRawPtr() const noexcept { return m_pMemBlock->getRawPtr(); } -void DnnlMemoryBlock::setExtBuff(void *ptr, size_t size) { +void DnnlMemoryBlock::setExtBuff(void* ptr, size_t size) { m_pMemBlock->setExtBuff(ptr, size); notifyUpdate(); } @@ -401,8 +407,9 @@ void DnnlMemoryBlock::notifyUpdate() { } } -StaticMemory::StaticMemory(const dnnl::engine& eng, MemoryDescPtr desc, const void* data, bool pads_zeroing) : - m_eng(eng), m_pMemDesc(desc) { +StaticMemory::StaticMemory(const dnnl::engine& eng, MemoryDescPtr desc, const void* data, bool pads_zeroing) + : m_eng(eng), + m_pMemDesc(desc) { if (desc->getPrecision() == element::string) { OPENVINO_THROW("[CPU] StaticMemory object cannot be created for string data."); } @@ -427,14 +434,13 @@ StaticMemory::StaticMemory(const dnnl::engine& eng, MemoryDescPtr desc, const vo // // ======================== m_prim.set_data_handle(m_pMemBlock->getRawPtr()); - } - catch (const std::exception& exc) { + } catch (const std::exception& exc) { dnnlErrorCtx = exc.what(); } } -StaticMemory::StaticMemory(const dnnl::engine& eng, const MemoryDesc& desc, const void* data, bool pads_zeroing) : - StaticMemory::StaticMemory(eng, desc.clone(), data, pads_zeroing) {} +StaticMemory::StaticMemory(const dnnl::engine& eng, const MemoryDesc& desc, const void* data, bool pads_zeroing) + : StaticMemory::StaticMemory(eng, desc.clone(), data, pads_zeroing) {} const MemoryDesc& StaticMemory::getDesc() const { return *m_pMemDesc; @@ -475,7 +481,7 @@ MemoryBlockPtr StaticMemory::getMemoryBlock() const { return m_pMemBlock; } -//oneDNN specifics for backward compatibility +// oneDNN specifics for backward compatibility dnnl::memory StaticMemory::getPrimitive() const { if (!m_prim) { OPENVINO_THROW("Couldn't create dnnl::memory object: ", dnnlErrorCtx); @@ -517,11 +523,11 @@ bool 
StaticMemory::StaticMemoryBlock::hasExtBuffer() const noexcept { } void StaticMemory::StaticMemoryBlock::registerMemory(Memory* memPtr) { - //do nothing + // do nothing } void StaticMemory::StaticMemoryBlock::unregisterMemory(Memory* memPtr) { - //do nothing + // do nothing } #if defined(__linux__) @@ -529,9 +535,9 @@ void StaticMemory::StaticMemoryBlock::unregisterMemory(Memory* memPtr) { # define MPOL_BIND 2 # define MPOL_MF_STRICT (1 << 0) # define MPOL_MF_MOVE (1 << 1) -#if !defined(__NR_mbind) && defined(__x86_64__) -# define __NR_mbind 237 -#endif +# if !defined(__NR_mbind) && defined(__x86_64__) +# define __NR_mbind 237 +# endif static long mbind(void* start, unsigned long len, int mode, @@ -585,7 +591,12 @@ bool mbind_move(const dnnl::memory mem, int numaNodeID) { return mbind_move(data, size, numaNodeID); } -MemoryPtr split_horizontal(const dnnl::engine& eng, const MemoryPtr src, int dim, int w_rank, int w_size, bool need_fill) { +MemoryPtr split_horizontal(const dnnl::engine& eng, + const MemoryPtr src, + int dim, + int w_rank, + int w_size, + bool need_fill) { auto desc = src->getDescPtr(); auto shape = src->getShape(); auto dims = shape.getDims(); @@ -620,7 +631,9 @@ MemoryPtr split_horizontal(const dnnl::engine& eng, const MemoryPtr src, int dim // reference stride VectorDims stride_dims = dims; stride_dims[dim] = splited_dim_vec[0]; - size_t stride = std::accumulate(stride_dims.begin(), stride_dims.end(), static_cast(1), std::multiplies()) * prec.size(); + size_t stride = + std::accumulate(stride_dims.begin(), stride_dims.end(), static_cast(1), std::multiplies()) * + prec.size(); // create new shape for target memory VectorDims new_dims = dims; @@ -641,7 +654,12 @@ MemoryPtr split_horizontal(const dnnl::engine& eng, const MemoryPtr src, int dim return ptr; } -MemoryPtr split_vertical(const dnnl::engine& eng, const MemoryPtr src, int dim, int w_rank, int w_size, bool need_fill) { +MemoryPtr split_vertical(const dnnl::engine& eng, + const MemoryPtr src, + int dim, + int w_rank, + int w_size, + bool need_fill) { auto desc = src->getDescPtr(); auto shape = src->getShape(); auto dims = shape.getDims(); @@ -697,7 +715,7 @@ MemoryPtr split_vertical(const dnnl::engine& eng, const MemoryPtr src, int dim, strideSize /= 2; copySize /= 2; } - parallel_for(step, [&](int i){ + parallel_for(step, [&](int i) { int dst_offset = i * copySize; int src_offset = i * splited_size + w_rank * strideSize; cpu_parallel_memcpy(dstPtr + dst_offset, srcPtr + src_offset, copySize); @@ -705,5 +723,5 @@ MemoryPtr split_vertical(const dnnl::engine& eng, const MemoryPtr src, int dim, return ptr; } -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/cpu_memory.h b/src/plugins/intel_cpu/src/cpu_memory.h index 70e6713e36b886..f6837064babfa6 100644 --- a/src/plugins/intel_cpu/src/cpu_memory.h +++ b/src/plugins/intel_cpu/src/cpu_memory.h @@ -4,18 +4,18 @@ #pragma once -#include "memory_desc/cpu_memory_desc.h" -#include "dnnl_extension_utils.h" -#include #include - -#include "openvino/core/type/element_type.hpp" -#include "openvino/core/type/element_type_traits.hpp" +#include #include #include #include +#include "dnnl_extension_utils.h" +#include "memory_desc/cpu_memory_desc.h" +#include "openvino/core/type/element_type.hpp" +#include "openvino/core/type/element_type_traits.hpp" + /** * @file contains a concept classes to work with memory/tensor/blob abstractions on plugin level. 
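// Editor's note -- an illustrative sketch, not part of this diff: a simplified, standalone
// model of the offset arithmetic used by split_vertical() in the cpu_memory.cpp hunk above.
// It assumes an even split, fp32 data, and a row-major [rows, cols] layout; each rank copies
// its own column slice out of the shared source buffer (a plain loop replaces parallel_for).
#include <cstddef>
#include <cstring>
#include <vector>

std::vector<float> split_columns(const std::vector<float>& src,
                                 size_t rows,
                                 size_t cols,
                                 int w_rank,
                                 int w_size) {
    const size_t cols_per_rank = cols / static_cast<size_t>(w_size);  // even split assumed
    std::vector<float> dst(rows * cols_per_rank);
    for (size_t row = 0; row < rows; ++row) {
        const size_t src_offset = row * cols + static_cast<size_t>(w_rank) * cols_per_rank;
        const size_t dst_offset = row * cols_per_rank;
        std::memcpy(dst.data() + dst_offset, src.data() + src_offset, cols_per_rank * sizeof(float));
    }
    return dst;
}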
* @@ -47,7 +47,8 @@ class IMemoryBlock { virtual void* getRawPtr() const noexcept = 0; /** - * @brief Allows to set externally allocated memory buffer. In that case, the object has no control over the provided memory. + * @brief Allows to set externally allocated memory buffer. In that case, the object has no control over the + * provided memory. * @param ptr - pointer to the memory * @param size - size of the memory buffer */ @@ -82,11 +83,11 @@ class MemoryBlockWithReuse : public IMemoryBlock { private: bool m_useExternalStorage = false; size_t m_memUpperBound = 0ul; - std::unique_ptr m_data; + std::unique_ptr m_data; int numa_node; - static void release(void *ptr); - static void destroy(void *ptr); + static void release(void* ptr); + static void destroy(void* ptr); }; class IMemoryBlockObserver : public IMemoryBlock { @@ -128,13 +129,13 @@ class DnnlMemBlockHandle { } DnnlMemBlockHandle(const DnnlMemBlockHandle&) = delete; - DnnlMemBlockHandle& operator= (const DnnlMemBlockHandle&) = delete; + DnnlMemBlockHandle& operator=(const DnnlMemBlockHandle&) = delete; DnnlMemBlockHandle(DnnlMemBlockHandle&& source) { std::swap(m_pMemBlock, source.m_pMemBlock); std::swap(m_pMem, source.m_pMem); } - DnnlMemBlockHandle& operator= (DnnlMemBlockHandle&& rhs) { + DnnlMemBlockHandle& operator=(DnnlMemBlockHandle&& rhs) { std::swap(m_pMemBlock, rhs.m_pMemBlock); std::swap(m_pMem, rhs.m_pMem); return *this; @@ -166,7 +167,7 @@ class IMemory { virtual const MemoryDesc& getDesc() const = 0; virtual MemoryDescPtr getDescPtr() const = 0; - virtual void* getData() const = 0; // pointer to the actual memory + virtual void* getData() const = 0; // pointer to the actual memory template ::type> T* getDataAs() const { @@ -177,7 +178,7 @@ class IMemory { return static_cast(getData()); } - virtual size_t getSize() const = 0; // in bytes + virtual size_t getSize() const = 0; // in bytes virtual const Shape& getShape() const = 0; virtual const VectorDims& getStaticDims() const = 0; @@ -199,7 +200,7 @@ class IMemory { return false; } - //oneDNN specifics for backward compatibility + // oneDNN specifics for backward compatibility virtual dnnl::memory getPrimitive() const = 0; ov::element::Type getPrecision() const { @@ -211,8 +212,8 @@ class IMemory { } template ::value && !std::is_reference::value, int>::type = 0, - typename std::enable_if::value, int>::type = 0> + typename std::enable_if::value && !std::is_reference::value, int>::type = 0, + typename std::enable_if::value, int>::type = 0> std::shared_ptr getDescWithType() const; }; @@ -241,17 +242,17 @@ class StaticMemory final : public IMemory { StaticMemory(const dnnl::engine& eng, const MemoryDesc& desc, const void* data = nullptr, bool pads_zeroing = true); StaticMemory(const StaticMemory&) = delete; - StaticMemory& operator= (const StaticMemory&) = delete; + StaticMemory& operator=(const StaticMemory&) = delete; StaticMemory(Memory&&) = delete; - StaticMemory& operator= (StaticMemory&&) = delete; + StaticMemory& operator=(StaticMemory&&) = delete; const MemoryDesc& getDesc() const override; MemoryDescPtr getDescPtr() const override; - void* getData() const override; // pointer to the actual memory + void* getData() const override; // pointer to the actual memory - size_t getSize() const override; // in bytes + size_t getSize() const override; // in bytes const Shape& getShape() const override; const VectorDims& getStaticDims() const override; @@ -262,7 +263,7 @@ class StaticMemory final : public IMemory { MemoryBlockPtr getMemoryBlock() const override; - //oneDNN 
specifics for backward compatibility + // oneDNN specifics for backward compatibility dnnl::memory getPrimitive() const override; void nullify() override; @@ -284,10 +285,10 @@ class Memory : public IMemory { Memory(const dnnl::engine& eng, const MemoryDesc& desc, MemoryBlockPtr block); Memory(const Memory&) = delete; - Memory& operator= (const Memory&) = delete; + Memory& operator=(const Memory&) = delete; Memory(Memory&&) = delete; - Memory& operator= (Memory&&) = delete; + Memory& operator=(Memory&&) = delete; dnnl::memory getPrimitive() const override; @@ -341,7 +342,7 @@ class Memory : public IMemory { bool m_padsZeroing = true; class DnnlMemPrimHandle { public: - explicit DnnlMemPrimHandle(const Memory* memObjPtr): m_memObjPtr(memObjPtr) {} + explicit DnnlMemPrimHandle(const Memory* memObjPtr) : m_memObjPtr(memObjPtr) {} bool isInit() const; dnnl::memory getPrim() const; void resetDnnlPrim(); @@ -376,7 +377,7 @@ class StringMemory : public IMemory { private: bool m_use_external_storage = false; size_t m_str_upper_bound = 0lu; - std::unique_ptr m_data; + std::unique_ptr m_data; static void release(OvString* ptr) {} static void destroy(OvString* ptr); @@ -390,7 +391,9 @@ class StringMemory : public IMemory { : StringMemory(engine, desc.clone(), data) {} StringMemory(const dnnl::engine& engine, const MemoryDescPtr& desc, const StringMemoryBlockPtr& block) - : m_engine(engine), m_mem_desc(desc), m_memoryBlock(block) {} + : m_engine(engine), + m_mem_desc(desc), + m_memoryBlock(block) {} StringMemory(const dnnl::engine& engine, const MemoryDesc& desc, const StringMemoryBlockPtr& block) : StringMemory(engine, desc.clone(), block) {} @@ -405,7 +408,7 @@ class StringMemory : public IMemory { void* getData() const override; - size_t getSize() const override; // In bytes + size_t getSize() const override; // In bytes const Shape& getShape() const override { return m_mem_desc->getShape(); @@ -443,8 +446,18 @@ bool mbind_move(void* data, size_t size, int numaNodeID); bool mbind_move(const MemoryCPtr mem, int numaNodeID); bool mbind_move(const dnnl::memory mem, int numaNodeID); -MemoryPtr split_horizontal(const dnnl::engine& eng, const MemoryPtr src, int dim, int w_rank, int w_size, bool need_fill = true); -MemoryPtr split_vertical(const dnnl::engine& eng, const MemoryPtr src, int dim, int w_rank, int w_size, bool need_fill = true); - -} // namespace intel_cpu -} // namespace ov +MemoryPtr split_horizontal(const dnnl::engine& eng, + const MemoryPtr src, + int dim, + int w_rank, + int w_size, + bool need_fill = true); +MemoryPtr split_vertical(const dnnl::engine& eng, + const MemoryPtr src, + int dim, + int w_rank, + int w_size, + bool need_fill = true); + +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/cpu_shape.cpp b/src/plugins/intel_cpu/src/cpu_shape.cpp index 4c6b5793d9f2ef..2b7011af1a1f5e 100644 --- a/src/plugins/intel_cpu/src/cpu_shape.cpp +++ b/src/plugins/intel_cpu/src/cpu_shape.cpp @@ -3,12 +3,13 @@ // #include "cpu_shape.h" + #include "utils/general_utils.h" namespace ov { namespace intel_cpu { -bool Shape::isCompatible(const VectorDims &vecDims) const { +bool Shape::isCompatible(const VectorDims& vecDims) const { if (getRank() != vecDims.size()) { return false; } @@ -21,17 +22,21 @@ bool Shape::isCompatible(const VectorDims &vecDims) const { return false; } - if (!std::equal(getMaxDims().begin(), getMaxDims().end(), vecDims.begin(), [](Dim lhs, Dim rhs) { return lhs >= rhs; })) { + if (!std::equal(getMaxDims().begin(), getMaxDims().end(), 
vecDims.begin(), [](Dim lhs, Dim rhs) { + return lhs >= rhs; + })) { return false; } - if (!std::equal(getMinDims().begin(), getMinDims().end(), vecDims.begin(), [](Dim lhs, Dim rhs) { return lhs <= rhs; })) { + if (!std::equal(getMinDims().begin(), getMinDims().end(), vecDims.begin(), [](Dim lhs, Dim rhs) { + return lhs <= rhs; + })) { return false; } return true; } -std::string Shape::toString() const { +std::string Shape::toString() const { std::stringstream output; output << "{"; @@ -50,10 +55,10 @@ std::string Shape::toString() const { Shape mergeShapes(const Shape& lhs, const Shape& rhs) { OPENVINO_ASSERT(lhs.getRank() == rhs.getRank(), - "Couldn't merge shapes of different ranks: shape 1:", - lhs.toString(), - " shape 2: ", - rhs.toString()); + "Couldn't merge shapes of different ranks: shape 1:", + lhs.toString(), + " shape 2: ", + rhs.toString()); const auto& lhsMinDims = lhs.getMinDims(); const auto& lhsMaxDims = lhs.getMaxDims(); @@ -66,10 +71,11 @@ Shape mergeShapes(const Shape& lhs, const Shape& rhs) { for (size_t i = 0; i < resultMinDims.size(); ++i) { resultMinDims[i] = std::max(lhsMinDims[i], rhsMinDims[i]); resultMaxDims[i] = std::min(lhsMaxDims[i], rhsMaxDims[i]); - OPENVINO_ASSERT(resultMinDims[i] <= resultMaxDims[i], "Couldn't merge shapes as the dims intervals are not overlapping."); + OPENVINO_ASSERT(resultMinDims[i] <= resultMaxDims[i], + "Couldn't merge shapes as the dims intervals are not overlapping."); } return Shape{resultMinDims, resultMaxDims}; } -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/cpu_shape.h b/src/plugins/intel_cpu/src/cpu_shape.h index a04b043689e520..f2895287e2f8fe 100644 --- a/src/plugins/intel_cpu/src/cpu_shape.h +++ b/src/plugins/intel_cpu/src/cpu_shape.h @@ -31,13 +31,17 @@ class Shape { type = shape.is_static() ? 
ShapeType::Static : ShapeType::Dynamic; initDims(); - hasZeroDimensions = std::any_of(dims.begin(), dims.end(), [](size_t dim) { return dim == 0; } ); + hasZeroDimensions = std::any_of(dims.begin(), dims.end(), [](size_t dim) { + return dim == 0; + }); } explicit Shape(const VectorDims& shape) { dims = minDims = maxDims = shape; type = ShapeType::Static; - hasZeroDimensions = std::any_of(dims.begin(), dims.end(), [](size_t dim) { return dim == 0; } ); + hasZeroDimensions = std::any_of(dims.begin(), dims.end(), [](size_t dim) { + return dim == 0; + }); } Shape(const VectorDims& minDims, const VectorDims& maxDims) { @@ -49,13 +53,17 @@ class Shape { initDims(); - if (std::any_of(dims.begin(), dims.end(), [](size_t dim) { return dim == Shape::UNDEFINED_DIM; } )) { + if (std::any_of(dims.begin(), dims.end(), [](size_t dim) { + return dim == Shape::UNDEFINED_DIM; + })) { type = ShapeType::Dynamic; } else { type = ShapeType::Static; } - hasZeroDimensions = std::any_of(dims.begin(), dims.end(), [](size_t dim) { return dim == 0; } ); + hasZeroDimensions = std::any_of(dims.begin(), dims.end(), [](size_t dim) { + return dim == 0; + }); } Shape(const std::initializer_list& shape) { @@ -69,7 +77,9 @@ class Shape { initDims(); - hasZeroDimensions = std::any_of(dims.begin(), dims.end(), [](size_t dim) { return dim == 0; } ); + hasZeroDimensions = std::any_of(dims.begin(), dims.end(), [](size_t dim) { + return dim == 0; + }); } /** @@ -181,21 +191,21 @@ class Shape { std::string toString() const; - bool operator == (const Shape& rhs) const { + bool operator==(const Shape& rhs) const { return minDims == rhs.minDims && maxDims == rhs.maxDims; } - bool operator != (const Shape& rhs) const { + bool operator!=(const Shape& rhs) const { return !(*this == rhs); } bool hasDefinedUpperBounds() const { - return std::all_of(maxDims.begin(), maxDims.end(), [](Dim dim){ return dim != UNDEFINED_DIM; }); + return std::all_of(maxDims.begin(), maxDims.end(), [](Dim dim) { + return dim != UNDEFINED_DIM; + }); } - enum : Dim { - UNDEFINED_DIM = std::numeric_limits::max() - }; + enum : Dim { UNDEFINED_DIM = std::numeric_limits::max() }; private: void initDims() { @@ -205,10 +215,7 @@ class Shape { } } - enum class ShapeType { - Static, - Dynamic - } type {ShapeType::Static}; + enum class ShapeType { Static, Dynamic } type{ShapeType::Static}; bool hasZeroDimensions = false; @@ -229,5 +236,5 @@ class Shape { Shape mergeShapes(const Shape& lhs, const Shape& rhs); -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/cpu_streams_calculation.cpp b/src/plugins/intel_cpu/src/cpu_streams_calculation.cpp index 0ed64d49ea68dd..3af6a52d5f3342 100644 --- a/src/plugins/intel_cpu/src/cpu_streams_calculation.cpp +++ b/src/plugins/intel_cpu/src/cpu_streams_calculation.cpp @@ -4,6 +4,11 @@ #include "cpu_streams_calculation.hpp" +#include +#include +#include +#include + #include "cpu_map_scheduling.hpp" #include "graph.h" #include "openvino/op/fake_quantize.hpp" @@ -13,29 +18,25 @@ #include "transformations/utils.hpp" #include "transformations/utils/utils.hpp" -#include -#include -#include -#include - using namespace ov; using namespace ov::threading; -#define INIT_VAL -100 +#define INIT_VAL -100 #define TP_CPU_LIMIT 32 namespace ov { namespace intel_cpu { -std::vector> get_streams_info_table(const int input_streams, - const bool input_streams_changed, - const int input_threads, - const int input_infer_requests, - const int model_prefer_threads, - const int 
input_current_socket_id, - const std::string input_perf_hint, - const std::set hint_model_distribution_policy, - const std::vector>& proc_type_table) { +std::vector> get_streams_info_table( + const int input_streams, + const bool input_streams_changed, + const int input_threads, + const int input_infer_requests, + const int model_prefer_threads, + const int input_current_socket_id, + const std::string input_perf_hint, + const std::set hint_model_distribution_policy, + const std::vector>& proc_type_table) { std::vector stream_info(CPU_STREAMS_TABLE_SIZE, INIT_VAL); std::vector> streams_info_table; std::vector> proc_socket_table; @@ -204,13 +205,13 @@ std::vector> get_streams_info_table(const int input_streams, current_socket_id = input_current_socket_id == -1 ? get_current_socket_id() : input_current_socket_id; if (input_threads > 0) { if (hint_model_distribution_policy.size() == 0) { + n_threads_per_stream = std::min(input_threads, proc_type_table[0][ALL_PROC]); + } else { for (auto& row : proc_socket_table) { if (current_socket_id == row[PROC_SOCKET_ID]) { n_threads_per_stream = std::min(input_threads, row[ALL_PROC]); } } - } else { - n_threads_per_stream = std::min(input_threads, proc_type_table[0][ALL_PROC]); } if (proc_type_table.size() == 1) { if ((n_threads_per_stream > proc_type_table[0][MAIN_CORE_PROC]) && @@ -242,26 +243,32 @@ std::vector> get_streams_info_table(const int input_streams, n_threads_per_stream = proc_type_table[0][ALL_PROC]; } } else { - int numa_index = 1; + size_t socket_index = 0; + for (socket_index = 0; socket_index < proc_socket_table.size(); socket_index++) { + if (proc_socket_table[socket_index][PROC_SOCKET_ID] == current_socket_id) { + break; + } + } + const std::vector& current_socket_info = proc_socket_table[socket_index]; n_threads_per_stream = model_prefer_threads == 0 - ? proc_type_table[numa_index][ALL_PROC] - : std::min(proc_type_table[numa_index][ALL_PROC], model_prefer_threads); + ? current_socket_info[ALL_PROC] + : std::min(current_socket_info[ALL_PROC], model_prefer_threads); stream_info[THREADS_PER_STREAM] = n_threads_per_stream; - if (proc_type_table[numa_index][ALL_PROC] == proc_type_table[numa_index][MAIN_CORE_PROC]) { + if (current_socket_info[ALL_PROC] == current_socket_info[MAIN_CORE_PROC]) { stream_info[PROC_TYPE] = MAIN_CORE_PROC; - update_streams_per_node(MAIN_CORE_PROC, proc_type_table[numa_index]); - } else if (proc_type_table[numa_index][ALL_PROC] == proc_type_table[numa_index][EFFICIENT_CORE_PROC]) { + update_streams_per_node(MAIN_CORE_PROC, current_socket_info); + } else if (current_socket_info[ALL_PROC] == current_socket_info[EFFICIENT_CORE_PROC]) { stream_info[PROC_TYPE] = EFFICIENT_CORE_PROC; - update_streams_per_node(EFFICIENT_CORE_PROC, proc_type_table[numa_index]); + update_streams_per_node(EFFICIENT_CORE_PROC, current_socket_info); } else { stream_info[PROC_TYPE] = ALL_PROC; - update_mix_stream_info(proc_type_table[numa_index], - {proc_type_table[numa_index]}, + update_mix_stream_info(current_socket_info, + proc_type_table, n_threads_per_stream, IStreamsExecutor::Config::StreamsMode::SUB_STREAMS_NULL, ALL_PROC); } - update_ids_method(proc_type_table[numa_index]); + update_ids_method(current_socket_info); } } else { n_threads = @@ -333,8 +340,7 @@ std::vector> get_streams_info_table(const int input_streams, n_threads_per_stream = static_cast(n_threads / n_streams); check_threads_per_stream(); } else { - n_threads_per_stream = - model_threads > 0 ? 
model_threads : static_cast<int>(n_threads / n_streams);
+                n_threads_per_stream = model_threads > 0 ? model_threads : static_cast<int>(n_threads / n_streams);
             }
         }
     }
@@ -584,7 +590,7 @@ int get_model_prefer_threads(const int num_streams,
          (networkToleranceForLowCache.ratio_mem_limited_gemms > ov::MemBandwidthPressure::LIMITED))) {
         config.modelPreferThreads = 8;
     }
-#elif((defined(OPENVINO_ARCH_ARM) || defined(OPENVINO_ARCH_ARM64)) && defined(__APPLE__))
+#elif ((defined(OPENVINO_ARCH_ARM) || defined(OPENVINO_ARCH_ARM64)) && defined(__APPLE__))
     config.modelPreferThreads = 1;
     if (networkToleranceForLowCache.max_mem_tolerance == ov::MemBandwidthPressure::UNKNOWN) {
         if ((networkToleranceForLowCache.ratio_compute_convs == ov::MemBandwidthPressure::ALL) ||
diff --git a/src/plugins/intel_cpu/src/cpu_streams_calculation.hpp b/src/plugins/intel_cpu/src/cpu_streams_calculation.hpp
index e362c0373d8d1d..0a0b4a1449b7cb 100644
--- a/src/plugins/intel_cpu/src/cpu_streams_calculation.hpp
+++ b/src/plugins/intel_cpu/src/cpu_streams_calculation.hpp
@@ -44,15 +44,16 @@ namespace intel_cpu {
 * in previous function.
 * @return streams information table which will be used by StreamsExecutor.
 */
-std::vector<std::vector<int>> get_streams_info_table(const int input_streams,
-                                                      const bool input_streams_changed,
-                                                      const int input_threads,
-                                                      const int input_infer_requests,
-                                                      const int model_prefer_threads,
-                                                      const int input_current_socket_id,
-                                                      const std::string input_perf_hint,
-                                                      const std::set<ov::hint::ModelDistributionPolicy> hint_llm_distribution_policy,
-                                                      const std::vector<std::vector<int>>& proc_type_table);
+std::vector<std::vector<int>> get_streams_info_table(
+    const int input_streams,
+    const bool input_streams_changed,
+    const int input_threads,
+    const int input_infer_requests,
+    const int model_prefer_threads,
+    const int input_current_socket_id,
+    const std::string input_perf_hint,
+    const std::set<ov::hint::ModelDistributionPolicy> hint_llm_distribution_policy,
+    const std::vector<std::vector<int>>& proc_type_table);
 /**
  * @brief Generate streams rank table for tensor parallel according to streams info table.
@@ -106,9 +107,7 @@ std::vector<std::vector<int>> generate_stream_info(const int streams,
 * @param[in] model graph handle
 * @param[in] config intel cpu configuration
 */
-void get_num_streams(const int streams,
-                     const std::shared_ptr<ov::Model>& model,
-                     Config& config);
+void get_num_streams(const int streams, const std::shared_ptr<ov::Model>& model, Config& config);
 } // namespace intel_cpu
 } // namespace ov
diff --git a/src/plugins/intel_cpu/src/cpu_tensor.cpp b/src/plugins/intel_cpu/src/cpu_tensor.cpp
index 1a045ca117a538..0f82a8a9a4dfec 100644
--- a/src/plugins/intel_cpu/src/cpu_tensor.cpp
+++ b/src/plugins/intel_cpu/src/cpu_tensor.cpp
@@ -16,7 +16,8 @@ Tensor::Tensor(MemoryPtr memptr) : m_memptr{memptr} {
     // only support plain data format ncsp.
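For context on the ncsp assertion in this constructor: ncsp is the plain, dense row-major layout (e.g. NCHW for a 4D tensor), and intel_cpu::Tensor refuses memory with any other layout. The sketch below is an illustration only, not part of this patch; it assumes the plugin's CpuBlockedMemoryDesc and Memory constructors and shows the intended way to wrap plugin memory into an ov::ITensor via make_tensor().

    #include <memory>

    #include "cpu_tensor.h"
    #include "memory_desc/cpu_blocked_memory_desc.h"

    std::shared_ptr<ov::ITensor> wrap_plain_f32(const dnnl::engine& eng) {
        using namespace ov::intel_cpu;
        // A plain (ncsp) f32 descriptor; a blocked or permuted layout would trip the assertion above.
        CpuBlockedMemoryDesc desc(ov::element::f32, Shape{1, 3, 224, 224});
        auto mem = std::make_shared<Memory>(eng, desc);  // plugin-side buffer, allocated here
        return make_tensor(mem);                         // throws unless the layout is ncsp
    }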
auto memdesc = m_memptr->getDescPtr(); - OPENVINO_ASSERT(memdesc->hasLayoutType(LayoutType::ncsp), "intel_cpu::Tensor only supports memory with ncsp layout."); + OPENVINO_ASSERT(memdesc->hasLayoutType(LayoutType::ncsp), + "intel_cpu::Tensor only supports memory with ncsp layout."); m_element_type = memdesc->getPrecision(); } @@ -24,8 +25,14 @@ Tensor::Tensor(MemoryPtr memptr) : m_memptr{memptr} { void Tensor::set_shape(ov::Shape new_shape) { const auto& shape = m_memptr->getDescPtr()->getShape(); if (shape.isStatic()) { - DEBUG_LOG("tensor's memory object ", m_memptr.get(), ", ", vec2str(shape.getStaticDims()), " -> ", new_shape.to_string()); - if (shape.getStaticDims() == new_shape) return; + DEBUG_LOG("tensor's memory object ", + m_memptr.get(), + ", ", + vec2str(shape.getStaticDims()), + " -> ", + new_shape.to_string()); + if (shape.getStaticDims() == new_shape) + return; } auto desc = m_memptr->getDescPtr(); @@ -69,7 +76,7 @@ void Tensor::update_strides() const { OPENVINO_ASSERT(blocked_desc, "not a valid blocked memory descriptor."); auto& strides = blocked_desc->getStrides(); m_strides.resize(strides.size()); - std::transform(strides.cbegin(), strides.cend(), m_strides.begin(), [this] (const size_t stride) { + std::transform(strides.cbegin(), strides.cend(), m_strides.begin(), [this](const size_t stride) { return stride * m_element_type.size(); }); } @@ -96,5 +103,5 @@ std::shared_ptr make_tensor(MemoryPtr mem) { return std::make_shared(mem); } -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/cpu_tensor.h b/src/plugins/intel_cpu/src/cpu_tensor.h index 0f073e0d298faf..86648ce969b168 100644 --- a/src/plugins/intel_cpu/src/cpu_tensor.h +++ b/src/plugins/intel_cpu/src/cpu_tensor.h @@ -4,8 +4,8 @@ #pragma once -#include "openvino/runtime/itensor.hpp" #include "cpu_memory.h" +#include "openvino/runtime/itensor.hpp" namespace ov { namespace intel_cpu { @@ -29,7 +29,9 @@ class Tensor : public ITensor { void* data(const element::Type& type = {}) const override; - MemoryPtr get_memory() {return m_memptr;} + MemoryPtr get_memory() { + return m_memptr; + } private: void update_strides() const; @@ -44,5 +46,5 @@ class Tensor : public ITensor { std::shared_ptr make_tensor(MemoryPtr mem); -} // namespace intel_cpu -} // namespace ov \ No newline at end of file +} // namespace intel_cpu +} // namespace ov \ No newline at end of file diff --git a/src/plugins/intel_cpu/src/cpu_types.cpp b/src/plugins/intel_cpu/src/cpu_types.cpp index 3b6440e56c3272..67c538bd78341a 100644 --- a/src/plugins/intel_cpu/src/cpu_types.cpp +++ b/src/plugins/intel_cpu/src/cpu_types.cpp @@ -2,10 +2,11 @@ // SPDX-License-Identifier: Apache-2.0 // #include "cpu_types.h" -#include "cpu_shape.h" -#include #include +#include + +#include "cpu_shape.h" namespace ov { namespace intel_cpu { @@ -41,6 +42,9 @@ static const TypeToNameMap& get_type_to_name_tbl() { {"GroupConvolution", Type::Convolution}, {"MatMul", Type::MatMul}, {"FullyConnected", Type::FullyConnected}, + {"FullyConnectedCompressed", Type::FullyConnected}, + {"FullyConnectedQuantizedLegacy", Type::FullyConnected}, + {"FullyConnectedQuantized", Type::FullyConnected}, {"MaxPool", Type::Pooling}, {"AvgPool", Type::Pooling}, {"AdaptiveMaxPool", Type::AdaptivePooling}, @@ -257,8 +261,7 @@ static const TypeToNameMap& get_type_to_name_tbl() { {"QKVProjection", Type::QKVProjection}, {"RMS", Type::RMS}, {"SearchSorted", Type::SearchSorted}, - {"LoraSubgraph", Type::LoRA} - }; + {"LoraSubgraph", 
Type::LoRA}}; return type_to_name_tbl; } @@ -469,6 +472,10 @@ std::string algToString(const Algorithm alg) { CASE(FQCommon); CASE(FQQuantization); CASE(FQBinarization); + CASE(FullyConnectedCommon); + CASE(FullyConnectedCompressed); + CASE(FullyConnectedQuantized); + CASE(FullyConnectedQuantizedLegacy); CASE(ROIPoolingMax); CASE(ROIPoolingBilinear); CASE(ROIAlignMax); diff --git a/src/plugins/intel_cpu/src/cpu_types.h b/src/plugins/intel_cpu/src/cpu_types.h index 9461526184b0bf..71088c22af8336 100644 --- a/src/plugins/intel_cpu/src/cpu_types.h +++ b/src/plugins/intel_cpu/src/cpu_types.h @@ -213,6 +213,12 @@ enum class Algorithm { EltwiseBitwiseLeftShift, EltwiseBitwiseRightShift, + // FullyConnected algorithms + FullyConnectedCommon, + FullyConnectedCompressed, + FullyConnectedQuantized, + FullyConnectedQuantizedLegacy, + // FakeQuantize algorithms FQCommon, FQQuantization, diff --git a/src/plugins/intel_cpu/src/dnnl_extension_utils.cpp b/src/plugins/intel_cpu/src/dnnl_extension_utils.cpp index 3d9b2f69bd8f66..457f8368f734dd 100644 --- a/src/plugins/intel_cpu/src/dnnl_extension_utils.cpp +++ b/src/plugins/intel_cpu/src/dnnl_extension_utils.cpp @@ -47,79 +47,79 @@ uint8_t DnnlExtensionUtils::sizeOfDataType(dnnl::memory::data_type dataType) { dnnl::memory::data_type DnnlExtensionUtils::ElementTypeToDataType(const ov::element::Type& elementType) { switch (elementType) { - case ov::element::f32: - return memory::data_type::f32; - case ov::element::i32: - return memory::data_type::s32; - case ov::element::bf16: - return memory::data_type::bf16; - case ov::element::i8: - return memory::data_type::s8; - case ov::element::u8: - case ov::element::boolean: - return memory::data_type::u8; - case ov::element::u1: - return memory::data_type::bin; - case ov::element::f16: - return memory::data_type::f16; - case ov::element::nf4: - return memory::data_type::nf4; - case ov::element::i4: - return memory::data_type::s4; - case ov::element::u4: - return memory::data_type::u4; - case ov::element::f8e8m0: - return memory::data_type::f8_e8m0; - case ov::element::f4e2m1: - return memory::data_type::f4_e2m1; - case ov::element::undefined: - return memory::data_type::undef; - default: { - OPENVINO_THROW("CPU plugin does not support ", elementType.to_string(), " for use with oneDNN."); - } + case ov::element::f32: + return memory::data_type::f32; + case ov::element::i32: + return memory::data_type::s32; + case ov::element::bf16: + return memory::data_type::bf16; + case ov::element::i8: + return memory::data_type::s8; + case ov::element::u8: + case ov::element::boolean: + return memory::data_type::u8; + case ov::element::u1: + return memory::data_type::bin; + case ov::element::f16: + return memory::data_type::f16; + case ov::element::nf4: + return memory::data_type::nf4; + case ov::element::i4: + return memory::data_type::s4; + case ov::element::u4: + return memory::data_type::u4; + case ov::element::f8e8m0: + return memory::data_type::f8_e8m0; + case ov::element::f4e2m1: + return memory::data_type::f4_e2m1; + case ov::element::undefined: + return memory::data_type::undef; + default: { + OPENVINO_THROW("CPU plugin does not support ", elementType.to_string(), " for use with oneDNN."); + } } } ov::element::Type DnnlExtensionUtils::DataTypeToElementType(const dnnl::memory::data_type& dataType) { switch (dataType) { - case memory::data_type::f32: - return ov::element::f32; - case memory::data_type::s32: - return ov::element::i32; - case memory::data_type::bf16: - return ov::element::bf16; - case memory::data_type::s8: - 
return ov::element::i8; - case memory::data_type::u8: - return ov::element::u8; - case memory::data_type::bin: - return ov::element::u1; - case memory::data_type::f16: - return ov::element::f16; - case memory::data_type::f64: - return ov::element::f64; - case memory::data_type::nf4: - return ov::element::nf4; - case memory::data_type::s4: - return ov::element::i4; - case memory::data_type::u4: - return ov::element::u4; - case memory::data_type::f8_e8m0: - return ov::element::f8e8m0; - case memory::data_type::f4_e2m1: - return ov::element::f4e2m1; - case memory::data_type::undef: - return ov::element::undefined; - default: { - OPENVINO_THROW("Unsupported data type."); - } + case memory::data_type::f32: + return ov::element::f32; + case memory::data_type::s32: + return ov::element::i32; + case memory::data_type::bf16: + return ov::element::bf16; + case memory::data_type::s8: + return ov::element::i8; + case memory::data_type::u8: + return ov::element::u8; + case memory::data_type::bin: + return ov::element::u1; + case memory::data_type::f16: + return ov::element::f16; + case memory::data_type::f64: + return ov::element::f64; + case memory::data_type::nf4: + return ov::element::nf4; + case memory::data_type::s4: + return ov::element::i4; + case memory::data_type::u4: + return ov::element::u4; + case memory::data_type::f8_e8m0: + return ov::element::f8e8m0; + case memory::data_type::f4_e2m1: + return ov::element::f4e2m1; + case memory::data_type::undef: + return ov::element::undefined; + default: { + OPENVINO_THROW("Unsupported data type."); + } } } -Dim DnnlExtensionUtils::convertToDim(const dnnl::memory::dim &dim) { - return dim == DNNL_RUNTIME_DIM_VAL ? Shape::UNDEFINED_DIM : static_cast(dim); +Dim DnnlExtensionUtils::convertToDim(const dnnl::memory::dim& dim) { + return dim == DNNL_RUNTIME_DIM_VAL ? Shape::UNDEFINED_DIM : static_cast(dim); } -dnnl::memory::dim DnnlExtensionUtils::convertToDnnlDim(const Dim &dim) { +dnnl::memory::dim DnnlExtensionUtils::convertToDnnlDim(const Dim& dim) { return dim == Shape::UNDEFINED_DIM ? 
DNNL_RUNTIME_DIM_VAL : static_cast(dim); } @@ -141,25 +141,25 @@ memory::dims DnnlExtensionUtils::convertToDnnlDims(const VectorDims& dims) { memory::format_tag DnnlExtensionUtils::GetPlainFormatByRank(size_t rank) { switch (rank) { - case 0: - case 1: - return memory::format_tag::a; - case 2: - return memory::format_tag::ab; - case 3: - return memory::format_tag::abc; - case 4: - return memory::format_tag::abcd; - case 5: - return memory::format_tag::abcde; - case 6: - return memory::format_tag::abcdef; - default: - return memory::format_tag::undef; + case 0: + case 1: + return memory::format_tag::a; + case 2: + return memory::format_tag::ab; + case 3: + return memory::format_tag::abc; + case 4: + return memory::format_tag::abcd; + case 5: + return memory::format_tag::abcde; + case 6: + return memory::format_tag::abcdef; + default: + return memory::format_tag::undef; } } -DnnlMemoryDescPtr DnnlExtensionUtils::makeDescriptor(const dnnl::memory::desc &desc) { +DnnlMemoryDescPtr DnnlExtensionUtils::makeDescriptor(const dnnl::memory::desc& desc) { return makeDescriptor(desc.get()); } @@ -182,7 +182,8 @@ size_t DnnlExtensionUtils::getMemSizeForDnnlDesc(const dnnl::memory::desc& desc) return size; } -std::shared_ptr DnnlExtensionUtils::makeUndefinedDesc(const memory::desc &desc, const Shape &shape) { +std::shared_ptr DnnlExtensionUtils::makeUndefinedDesc(const memory::desc& desc, + const Shape& shape) { if (desc.get_format_kind() == memory::format_kind::blocked) { return std::shared_ptr(new DnnlBlockedMemoryDesc(desc, shape)); } else { @@ -190,7 +191,9 @@ std::shared_ptr DnnlExtensionUtils::makeUndefinedDesc(con } } -DnnlMemoryDescPtr DnnlExtensionUtils::query_md(const const_dnnl_primitive_desc_t& pd, const dnnl::query& what, int idx) { +DnnlMemoryDescPtr DnnlExtensionUtils::query_md(const const_dnnl_primitive_desc_t& pd, + const dnnl::query& what, + int idx) { auto query = dnnl::convert_to_c(what); const auto* cdesc = dnnl_primitive_desc_query_md(pd, query, idx); @@ -201,7 +204,7 @@ DnnlMemoryDescPtr DnnlExtensionUtils::query_md(const const_dnnl_primitive_desc_t } std::string DnnlExtensionUtils::query_impl_info_str(const const_dnnl_primitive_desc_t& pd) { - const char *res; + const char* res; dnnl_status_t status = dnnl_primitive_desc_query(pd, dnnl_query_impl_info_str, 0, &res); if (status != dnnl_success) OPENVINO_THROW("query_impl_info_str failed."); @@ -209,10 +212,9 @@ std::string DnnlExtensionUtils::query_impl_info_str(const const_dnnl_primitive_d } bool DnnlExtensionUtils::find_implementation(dnnl::primitive_desc& desc, impl_desc_type impl_type) { - return DnnlExtensionUtils::find_implementation(desc, - [impl_type](impl_desc_type cur_impl_type){ - return cur_impl_type == impl_type; - }); + return DnnlExtensionUtils::find_implementation(desc, [impl_type](impl_desc_type cur_impl_type) { + return cur_impl_type == impl_type; + }); } dnnl_memory_desc_t DnnlExtensionUtils::clone_desc(const_dnnl_memory_desc_t cdesc) { @@ -233,31 +235,33 @@ const char* DnnlExtensionUtils::query_pd_info(const_dnnl_primitive_desc_t pd) { bool DnnlExtensionUtils::isUnarySupportedAsPostOp(Algorithm alg) { #if defined(OV_CPU_WITH_ACL) - return one_of(alg, Algorithm::EltwiseRelu, - Algorithm::EltwiseTanh, - Algorithm::EltwiseElu, - Algorithm::EltwiseAbs, - Algorithm::EltwiseSqrt, - Algorithm::EltwiseSoftRelu, - Algorithm::EltwiseSigmoid, - Algorithm::EltwiseClamp); + return one_of(alg, + Algorithm::EltwiseRelu, + Algorithm::EltwiseTanh, + Algorithm::EltwiseElu, + Algorithm::EltwiseAbs, + Algorithm::EltwiseSqrt, + 
Algorithm::EltwiseSoftRelu, + Algorithm::EltwiseSigmoid, + Algorithm::EltwiseClamp); #elif defined(OPENVINO_ARCH_X86_64) - return one_of(alg, Algorithm::EltwiseRelu, - Algorithm::EltwiseGeluErf, - Algorithm::EltwiseGeluTanh, - Algorithm::EltwiseElu, - Algorithm::EltwiseSigmoid, - Algorithm::EltwiseClamp, - Algorithm::EltwiseTanh, - Algorithm::EltwiseSwish, - Algorithm::EltwiseHswish, - Algorithm::EltwiseMish, - Algorithm::EltwiseHsigmoid, - Algorithm::EltwiseRoundHalfToEven, - Algorithm::EltwiseRoundHalfAwayFromZero, - Algorithm::EltwiseAbs, - Algorithm::EltwiseSqrt, - Algorithm::EltwiseSoftRelu); + return one_of(alg, + Algorithm::EltwiseRelu, + Algorithm::EltwiseGeluErf, + Algorithm::EltwiseGeluTanh, + Algorithm::EltwiseElu, + Algorithm::EltwiseSigmoid, + Algorithm::EltwiseClamp, + Algorithm::EltwiseTanh, + Algorithm::EltwiseSwish, + Algorithm::EltwiseHswish, + Algorithm::EltwiseMish, + Algorithm::EltwiseHsigmoid, + Algorithm::EltwiseRoundHalfToEven, + Algorithm::EltwiseRoundHalfAwayFromZero, + Algorithm::EltwiseAbs, + Algorithm::EltwiseSqrt, + Algorithm::EltwiseSoftRelu); #else return false; #endif @@ -269,5 +273,5 @@ std::string DnnlExtensionUtils::computeWeightsStringHash(const std::shared_ptr(memory->getData())); } -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/dnnl_extension_utils.h b/src/plugins/intel_cpu/src/dnnl_extension_utils.h index 7a968ea3c71c3d..ecf223b48497cd 100644 --- a/src/plugins/intel_cpu/src/dnnl_extension_utils.h +++ b/src/plugins/intel_cpu/src/dnnl_extension_utils.h @@ -10,11 +10,11 @@ #include +#include "common/c_types_map.hpp" #include "cpu_types.h" #include "onednn/dnnl.h" #include "onednn/iml_type_mapper.h" #include "openvino/core/type/element_type.hpp" -#include "common/c_types_map.hpp" namespace ov { namespace intel_cpu { @@ -29,8 +29,8 @@ class DnnlExtensionUtils { static uint8_t sizeOfDataType(dnnl::memory::data_type dataType); static dnnl::memory::data_type ElementTypeToDataType(const ov::element::Type& elementType); static ov::element::Type DataTypeToElementType(const dnnl::memory::data_type& dataType); - static Dim convertToDim(const dnnl::memory::dim &dim); - static dnnl::memory::dim convertToDnnlDim(const Dim &dim); + static Dim convertToDim(const dnnl::memory::dim& dim); + static dnnl::memory::dim convertToDnnlDim(const Dim& dim); static VectorDims convertToVectorDims(const dnnl::memory::dims& dims); static VectorDims convertToVectorDims(const dnnl::impl::dims_t dims, const int ndims); static std::vector convertToDnnlDims(const VectorDims& dims); @@ -41,25 +41,28 @@ class DnnlExtensionUtils { * @param desc dnnl::memory::desc from which one of the descriptors will be created * @return pointer to DnnlBlockedMemoryDesc or DnnlMemoryDesc */ - static std::shared_ptr makeDescriptor(const dnnl::memory::desc &desc); + static std::shared_ptr makeDescriptor(const dnnl::memory::desc& desc); static std::shared_ptr makeDescriptor(const_dnnl_memory_desc_t desc); /** * @brief Helper function that creates DnnlBlockedMemoryDesc from defined dnnl::memory::desc and undefined shape. - * It uses desc as an basis for the new undefined one. Specifically, type, layout, precision, blocks, extra data will be preserved. + * It uses desc as an basis for the new undefined one. Specifically, type, layout, precision, blocks, extra data + * will be preserved. 
* @param desc dnnl::memory::desc dnnl desc which will be used as a basis of the new descriptor * @param shape a new undefined shape * @return pointer to the created DnnlBlockedMemoryDesc * @note Obly blocked descriptors are allowed at the moment */ - static std::shared_ptr makeUndefinedDesc(const dnnl::memory::desc &desc, const Shape& shape); + static std::shared_ptr makeUndefinedDesc(const dnnl::memory::desc& desc, const Shape& shape); static size_t getMemSizeForDnnlDesc(const dnnl::memory::desc& desc); - static std::shared_ptr query_md(const const_dnnl_primitive_desc_t& pd, const dnnl::query& what, int idx = 0); + static std::shared_ptr query_md(const const_dnnl_primitive_desc_t& pd, + const dnnl::query& what, + int idx = 0); static std::string query_impl_info_str(const const_dnnl_primitive_desc_t& pd); - template + template static bool find_implementation(dnnl::primitive_desc& desc, T&& comparator) { dnnl::primitive_desc_iterator& itpd = desc; @@ -77,7 +80,7 @@ class DnnlExtensionUtils { return false; } - template + template static void for_each_implementation(dnnl::primitive_desc& desc, bool first_match, T&& comparator, L&& func) { dnnl::primitive_desc_iterator& itpd = desc; @@ -113,5 +116,5 @@ class DnnlExtensionUtils { const std::shared_ptr& dstDesc); }; -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/dnnl_postops_composer.cpp b/src/plugins/intel_cpu/src/dnnl_postops_composer.cpp index 2f82fbe553ae19..9b86a1433acb06 100644 --- a/src/plugins/intel_cpu/src/dnnl_postops_composer.cpp +++ b/src/plugins/intel_cpu/src/dnnl_postops_composer.cpp @@ -11,8 +11,12 @@ #include #include +#include "cpu_types.h" #include "memory_desc/dnnl_blocked_memory_desc.h" +#include "nodes/executors/common/common_utils.hpp" +#include "nodes/executors/memory_arguments.hpp" #include "openvino/core/type/element_type.hpp" +#include "utils/cpu_utils.hpp" #include "utils/debug_capabilities.h" namespace ov { @@ -24,8 +28,7 @@ DnnlPostOpsComposer::DnnlPostOpsComposer(const PostOps& postOps, const size_t indexOfOutputChannelDim, const bool isInt8, const int weiScaleMaskPerChannel, - const std::vector& DQScales, - const bool hasBias, + const MemoryArgs& memory, const dnnl::memory::data_type outDataType) : engine(engine), postOps(postOps), @@ -39,6 +42,7 @@ DnnlPostOpsComposer::DnnlPostOpsComposer(const PostOps& postOps, dimsPerOC = dimsPerTensor = VectorDims(outputDims.size(), 1); dimsPerOC[idxOC] = OC; + const auto& DQScales = getDeQuantizedScales(memory); // generalise dq scales, so extra logic is necessary here. if (isINT8) { wei_scale_values = DQScales.empty() ? std::vector{1.0} : DQScales; @@ -49,6 +53,7 @@ DnnlPostOpsComposer::DnnlPostOpsComposer(const PostOps& postOps, updateWeiScales(); // If having the bias, attr weight scale can't be updated for further ops-ops optimization. // ONEDNN 3.x quantization for scheme: QuantizedInput * QuantizedWeight * DQScale + Bias. + const bool hasBias = !memory.at(ARG_BIAS)->getDesc().empty(); weightScaleAvailable = !hasBias; } else if (!DQScales.empty()) { // DQ scale is fused but swiching back to non-INT8 for execution in some cases. 
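The hunk above changes the DnnlPostOpsComposer constructor to take the whole MemoryArgs map instead of a precomputed DQScales vector and a hasBias flag; both are now derived inside the constructor, via getDeQuantizedScales(memory) and the emptiness of the ARG_BIAS descriptor. A minimal sketch of the resulting call-site shape follows; it is an illustration only, the variable names are placeholders, and the assumption that MemoryArgs maps ARG_* identifiers to MemoryPtr is taken from memory_arguments.hpp rather than from this patch.

    // Hypothetical call site; srcMemPtr/weiMemPtr/biasMemPtr/dstMemPtr are placeholder MemoryPtr values.
    MemoryArgs memory;
    memory[ARG_SRC]  = srcMemPtr;
    memory[ARG_WEI]  = weiMemPtr;
    memory[ARG_BIAS] = biasMemPtr;  // an empty bias descriptor is how "no bias" is signalled
    memory[ARG_DST]  = dstMemPtr;

    // The composer now derives what used to be passed explicitly:
    //   dequantization scales <- getDeQuantizedScales(memory)
    //   hasBias               <- !memory.at(ARG_BIAS)->getDesc().empty()
    DnnlPostOpsComposer composer(postOps,
                                 engine,
                                 outputDims,
                                 indexOfOutputChannelDim,
                                 isInt8,
                                 weiScaleMaskPerChannel,
                                 memory,
                                 outDataType);
    const auto primAttrs = composer.compose();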
@@ -325,9 +330,9 @@ static OptimizedFormula updateOptimizedFormula(const FakeQuantizePostOp& postOp, } bool DnnlPostOpsComposer::appendAttrPostOps(const FakeQuantizePostOp& postOp, - bool isLastPostOp, - bool doRounding, - bool allowBinary) { + bool isLastPostOp, + bool doRounding, + bool allowBinary) { DEBUG_LOG("isLastPostOp=", isLastPostOp, ", outDataType=", @@ -541,9 +546,9 @@ bool DnnlPostOpsComposer::appendShift(const std::vector& shift, bool allo } bool DnnlPostOpsComposer::appendLinear(const std::vector& scale, - const std::vector& shift, - bool isLastPostOp, - bool allowBinary) { + const std::vector& shift, + bool isLastPostOp, + bool allowBinary) { if (scale.size() == 1 && shift.size() == 1) { if (shift[0] == 0.0f) return appendScale(scale, isLastPostOp, allowBinary); @@ -599,15 +604,27 @@ static MemoryPtr prepackDecompressionParams(const MemoryCPtr& paramsPtr, if (shape.size() == 1 && shape[0] == 1) { shape.push_back(1); } + if (shape.size() != 2 && shape.size() != 3) - OPENVINO_THROW("DnnlPostOpsComposer cannot prepack decompression params with invalid shape"); + OPENVINO_THROW("DnnlPostOpsComposer cannot prepack decompression params with invalid shape"); - Shape dstShape = needTranspose ? Shape({shape[0], shape[1]}) : Shape({shape[shape.size() - 1], shape[0]}); - DnnlBlockedMemoryDesc dstMemoryDesc(dstShape, DnnlExtensionUtils::ElementTypeToDataType(dstPrc), dnnl::memory::format_tag::io); - auto dstMem = std::make_shared(engine, dstMemoryDesc); + // weights without batch: (OC, G) + // weights with batch: (B, OC, G) + const size_t OC = shape[shape.size() - 2]; + const size_t G = shape[shape.size() - 1]; + + Shape dstShape = Shape({OC, G}); + DnnlBlockedMemoryDesc dstMemoryDesc(dstShape, + DnnlExtensionUtils::ElementTypeToDataType(dstPrc), + dnnl::memory::format_tag::io); + auto dstMem = std::make_shared(engine, dstMemoryDesc); auto srcFormat = needTranspose ? 
dnnl::memory::format_tag::oi : dnnl::memory::format_tag::io; - DnnlBlockedMemoryDesc srcMemoryDesc(dstShape, DnnlExtensionUtils::ElementTypeToDataType(paramsPtr->getDescPtr()->getPrecision()), srcFormat); + + DnnlBlockedMemoryDesc srcMemoryDesc( + dstShape, + DnnlExtensionUtils::ElementTypeToDataType(paramsPtr->getDescPtr()->getPrecision()), + srcFormat); auto srcMem = std::make_shared(engine, srcMemoryDesc, paramsPtr->getData()); dstMem->load(*srcMem); @@ -615,25 +632,31 @@ static MemoryPtr prepackDecompressionParams(const MemoryCPtr& paramsPtr, return dstMem; } -void DnnlPostOpsComposer::appendDecompressionScales(const MemoryCPtr& scales_ptr, bool needTranspose, ov::element::Type dstPrecision) { +void DnnlPostOpsComposer::appendDecompressionScales(const MemoryCPtr& scales_ptr, + bool needTranspose, + ov::element::Type dstPrecision) { if (scales_ptr == nullptr) return; auto scalesMem = prepackDecompressionParams(scales_ptr, needTranspose, dstPrecision, engine); attr.set_scales_dims(DNNL_ARG_WEIGHTS, - DnnlExtensionUtils::convertToDnnlDims(scalesMem->getStaticDims()), DnnlExtensionUtils::ElementTypeToDataType(dstPrecision)); + DnnlExtensionUtils::convertToDnnlDims(scalesMem->getStaticDims()), + DnnlExtensionUtils::ElementTypeToDataType(dstPrecision)); cpuArgs[DNNL_ARG_ATTR_SCALES | DNNL_ARG_WEIGHTS] = std::move(scalesMem); dnnlArgs[DNNL_ARG_ATTR_SCALES | DNNL_ARG_WEIGHTS] = cpuArgs[DNNL_ARG_ATTR_SCALES | DNNL_ARG_WEIGHTS]->getPrimitive(); } -void DnnlPostOpsComposer::appendDecompressionZeroPoints(const MemoryCPtr& zero_points_ptr, bool needTranspose, ov::element::Type dstPrecision) { +void DnnlPostOpsComposer::appendDecompressionZeroPoints(const MemoryCPtr& zero_points_ptr, + bool needTranspose, + ov::element::Type dstPrecision) { if (zero_points_ptr == nullptr) return; auto zeroPointsMem = prepackDecompressionParams(zero_points_ptr, needTranspose, dstPrecision, engine); attr.set_zero_points_dims(DNNL_ARG_WEIGHTS, - DnnlExtensionUtils::convertToDnnlDims(zeroPointsMem->getStaticDims()), DnnlExtensionUtils::ElementTypeToDataType(dstPrecision)); + DnnlExtensionUtils::convertToDnnlDims(zeroPointsMem->getStaticDims()), + DnnlExtensionUtils::ElementTypeToDataType(dstPrecision)); cpuArgs[DNNL_ARG_ATTR_ZERO_POINTS | DNNL_ARG_WEIGHTS] = zeroPointsMem; dnnlArgs[DNNL_ARG_ATTR_ZERO_POINTS | DNNL_ARG_WEIGHTS] = zeroPointsMem->getPrimitive(); } diff --git a/src/plugins/intel_cpu/src/dnnl_postops_composer.h b/src/plugins/intel_cpu/src/dnnl_postops_composer.h index c07ec0f608b6db..7ae634658b005f 100644 --- a/src/plugins/intel_cpu/src/dnnl_postops_composer.h +++ b/src/plugins/intel_cpu/src/dnnl_postops_composer.h @@ -12,8 +12,8 @@ #include "cpu_memory.h" #include "nodes/executors/dnnl/dnnl_aliases.hpp" -#include "post_ops.hpp" #include "nodes/executors/dnnl/dnnl_post_op_data.hpp" +#include "post_ops.hpp" namespace ov { namespace intel_cpu { @@ -27,12 +27,13 @@ class DnnlPostOpsComposer { const size_t indexOfOutputChannelDim, const bool isINT8, const int weiScaleMaskPerChannel, - const std::vector& DQScales, - const bool hasBias, + const MemoryArgs& memory, const dnnl::memory::data_type outDataType); DnnlPrimitiveAttrs compose(); void appendDecompressionScales(const MemoryCPtr& scales_ptr, bool needTranspose, ov::element::Type dstPrecision); - void appendDecompressionZeroPoints(const MemoryCPtr& zero_points_ptr, bool needTranspose, ov::element::Type dstPrecision); + void appendDecompressionZeroPoints(const MemoryCPtr& zero_points_ptr, + bool needTranspose, + ov::element::Type dstPrecision); void 
setDynamicQuantizationParams(uint64_t groupSize);
 private:
diff --git a/src/plugins/intel_cpu/src/dnnl_postops_composer_legacy.cpp b/src/plugins/intel_cpu/src/dnnl_postops_composer_legacy.cpp
index cb59492463f410..3e40ead65d6cc3 100644
--- a/src/plugins/intel_cpu/src/dnnl_postops_composer_legacy.cpp
+++ b/src/plugins/intel_cpu/src/dnnl_postops_composer_legacy.cpp
@@ -3,9 +3,11 @@
 //
 #include "dnnl_postops_composer_legacy.h"
+
 #include
 #include
+
 #include "utils/debug_capabilities.h"
 namespace ov {
 namespace intel_cpu {
@@ -39,10 +41,10 @@ DnnlPostOpsComposerLegacy::DnnlPostOpsComposerLegacy(const dnnl::engine& engine,
         wei_scale_mask = wei_scale_values.size() > 1 ? weiScaleMaskPerChannel : 0;
         dst_scale_val = 1.0;
-        //set the DQscale into attr weight scale before appending any post-ops.
+        // set the DQscale into attr weight scale before appending any post-ops.
         updateWeiScales();
-        //If having the bias, attr weight scale can't be updated for further ops-ops optimization.
-        //ONEDNN 3.x quantization for scheme: QuantizedInput * QuantizedWeight * DQScale + Bias.
+        // If having the bias, attr weight scale can't be updated for further ops-ops optimization.
+        // ONEDNN 3.x quantization for scheme: QuantizedInput * QuantizedWeight * DQScale + Bias.
         weightScaleAvailable = !hasBias;
     } else if (!DQScales.empty()) {
         // DQ scale is fused but swiching back to non-INT8 for execution in some cases.
@@ -115,22 +117,22 @@ bool DnnlPostOpsComposerLegacy::appendScale(const std::vector& scale, boo
         return true;
     }
     if (weightScaleAvailable) {
-        //oneDNN v3.* weight scale can also be used in the further optimization patterns.
-        // there are so many possible optimizations can be done, for example:
+        // oneDNN v3.* weight scale can also be used in the further optimization patterns.
+        // there are so many possible optimizations can be done, for example:
         //
-        // we can switch the existing postOps's order to take
-        // advantage of output scale if it's available:
-        // relu(x)*scale = relu(x*scale)
-        // or we can fuse it into previous one as long as they are
-        // compatible in shape
-        // x*A*s = x*(A*s)
-        // or even with add:
-        // (x*A + B)*s = x*(A*s) + (B*s)
-        // or we can combine these two tricks:
-        // relu(x*A)*s = relu(x*(A*s))
+        // we can switch the existing postOps's order to take
+        // advantage of output scale if it's available:
+        // relu(x)*scale = relu(x*scale)
+        // or we can fuse it into previous one as long as they are
+        // compatible in shape
+        // x*A*s = x*(A*s)
+        // or even with add:
+        // (x*A + B)*s = x*(A*s) + (B*s)
+        // or we can combine these two tricks:
+        // relu(x*A)*s = relu(x*(A*s))
         //
-        // we cannot implement all of them, so we just add the one
-        // that we observed in real models.
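The identities quoted in the comment block above are what makes it legal to fold a scalar output scale into the oneDNN weight scale. The snippet below is a standalone numeric check of those identities, an illustration only and not plugin code; note the ReLU identity holds because the scale is non-negative.

    #include <algorithm>
    #include <cassert>

    int main() {
        const float x = -1.5f, A = 2.0f, B = 0.25f, s = 4.0f;
        const auto relu = [](float v) { return std::max(v, 0.0f); };

        // Folding an output scale through ReLU is valid since s >= 0:
        assert(relu(x) * s == relu(x * s));
        // A multiplicative factor folds straight into the weight scale:
        assert((x * A) * s == x * (A * s));
        // With an additive term the scale lands on both the product and the addend:
        assert((x * A + B) * s == x * (A * s) + B * s);
        // And the two tricks combine:
        assert(relu(x * A) * s == relu(x * (A * s)));
        return 0;
    }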
if ((ops.len() == 0)) fuseIntoWeiScale = true; @@ -201,9 +203,9 @@ bool DnnlPostOpsComposerLegacy::appendShift(const std::vector& shift, boo } bool DnnlPostOpsComposerLegacy::appendLinear(const std::vector& scale, - const std::vector& shift, - bool isLastPostOp, - bool allowBinary) { + const std::vector& shift, + bool isLastPostOp, + bool allowBinary) { if (scale.size() == 1 && shift.size() == 1) { if (shift[0] == 0.0f) return appendScale(scale, isLastPostOp, allowBinary); diff --git a/src/plugins/intel_cpu/src/dnnl_postops_composer_legacy.h b/src/plugins/intel_cpu/src/dnnl_postops_composer_legacy.h index 82fdda94012f15..485fa31fb5d956 100644 --- a/src/plugins/intel_cpu/src/dnnl_postops_composer_legacy.h +++ b/src/plugins/intel_cpu/src/dnnl_postops_composer_legacy.h @@ -8,11 +8,10 @@ */ #pragma once -#include "dnnl_types.h" - #include #include "cpu_memory.h" +#include "dnnl_types.h" #include "memory_desc/cpu_memory_desc.h" #include "memory_desc/dnnl_blocked_memory_desc.h" #include "onednn/dnnl.h" @@ -39,7 +38,10 @@ class DnnlPostOpsComposerLegacy { void appendRoundHTE(); bool appendScale(const std::vector& scale, bool isLastPostOp, bool allowBinary = true); bool appendShift(const std::vector& shift, bool allowBinary = true); - bool appendLinear(const std::vector& scale, const std::vector& shift, bool isLastPostOp, bool allowBinary = true); + bool appendLinear(const std::vector& scale, + const std::vector& shift, + bool isLastPostOp, + bool allowBinary = true); void appendClip(const std::vector& low, const std::vector& high); const VectorDims& getOutputDims() { diff --git a/src/plugins/intel_cpu/src/edge.cpp b/src/plugins/intel_cpu/src/edge.cpp index 82bde8edae2b4a..1eabc6275bf4b0 100644 --- a/src/plugins/intel_cpu/src/edge.cpp +++ b/src/plugins/intel_cpu/src/edge.cpp @@ -3,16 +3,21 @@ // #include "edge.h" -#include "node.h" + #include "dnnl_extension_utils.h" +#include "node.h" +#include "openvino/core/type/element_type.hpp" #include "openvino/util/pp.hpp" using namespace dnnl; namespace ov { namespace intel_cpu { -Edge::Edge(const NodePtr &parent, const NodePtr &child, int pr_port, int ch_port) : - parent(parent), child(child), parent_port(pr_port), child_port(ch_port) {} +Edge::Edge(const NodePtr& parent, const NodePtr& child, int pr_port, int ch_port) + : parent(parent), + child(child), + parent_port(pr_port), + child_port(ch_port) {} const NodePtr Edge::getParent() const { auto parentPtr = parent.lock(); @@ -38,14 +43,14 @@ bool Edge::isDropped() const { auto parent_ptr = parent.lock(); if (parent_ptr) { - for (auto &edge : parent_ptr->childEdges) + for (auto& edge : parent_ptr->childEdges) if (edge.lock().get() == this) not_in_parent = false; } auto child_ptr = child.lock(); if (child_ptr) { - for (auto &edge : child_ptr->parentEdges) + for (auto& edge : child_ptr->parentEdges) if (edge.lock().get() == this) not_in_child = false; } @@ -130,8 +135,8 @@ bool Edge::enforceReorder() { } static inline bool isPhycicalMemCompatible(const MemoryDesc& lhsMemDesc, const MemoryDesc& rhsMemDesc) { - if (!lhsMemDesc.isDefined() || !rhsMemDesc.isDefined() || - !(lhsMemDesc.getType() & MemoryDescType::Blocked) || !(rhsMemDesc.getType() & MemoryDescType::Blocked) || + if (!lhsMemDesc.isDefined() || !rhsMemDesc.isDefined() || !(lhsMemDesc.getType() & MemoryDescType::Blocked) || + !(rhsMemDesc.getType() & MemoryDescType::Blocked) || (lhsMemDesc.getType() == DnnlBlocked && !lhsMemDesc.as()->hasEmptyExtraData()) || (rhsMemDesc.getType() == DnnlBlocked && !rhsMemDesc.as()->hasEmptyExtraData())) return 
false; @@ -139,13 +144,21 @@ static inline bool isPhycicalMemCompatible(const MemoryDesc& lhsMemDesc, const M const auto lhsBlockMemDesc = lhsMemDesc.as(); const auto rhsBlockMemDesc = rhsMemDesc.as(); - if (lhsBlockMemDesc->getShape() != rhsBlockMemDesc->getShape() || lhsBlockMemDesc->getPrecision() != rhsBlockMemDesc->getPrecision()) + if (lhsBlockMemDesc->getShape() != rhsBlockMemDesc->getShape() || + lhsBlockMemDesc->getPrecision() != rhsBlockMemDesc->getPrecision()) return false; // dims padding check - bool isZeroDimsPaddings = - std::all_of(lhsBlockMemDesc->getOffsetPaddingToData().begin(), lhsBlockMemDesc->getOffsetPaddingToData().end(), [](size_t x){ return x == 0; }) && - std::all_of(rhsBlockMemDesc->getOffsetPaddingToData().begin(), rhsBlockMemDesc->getOffsetPaddingToData().end(), [](size_t x){ return x == 0; }); + bool isZeroDimsPaddings = std::all_of(lhsBlockMemDesc->getOffsetPaddingToData().begin(), + lhsBlockMemDesc->getOffsetPaddingToData().end(), + [](size_t x) { + return x == 0; + }) && + std::all_of(rhsBlockMemDesc->getOffsetPaddingToData().begin(), + rhsBlockMemDesc->getOffsetPaddingToData().end(), + [](size_t x) { + return x == 0; + }); bool isSameElementsCount = lhsBlockMemDesc->getPaddedElementsCount() == rhsBlockMemDesc->getPaddedElementsCount(); if (!isZeroDimsPaddings || !isSameElementsCount) return false; @@ -160,7 +173,8 @@ static inline bool isPhycicalMemCompatible(const MemoryDesc& lhsMemDesc, const M std::vector lhsStridesDefault(lhsBlockDims.size()); lhsStridesDefault[lhsBlockDims.size() - 1] = 1; for (size_t i = 2; i <= lhsBlockDims.size(); i++) { - lhsStridesDefault[lhsBlockDims.size() - i] = lhsStridesDefault[lhsBlockDims.size() - (i - 1)] * lhsBlockDims[lhsBlockDims.size() - (i - 1)]; + lhsStridesDefault[lhsBlockDims.size() - i] = + lhsStridesDefault[lhsBlockDims.size() - (i - 1)] * lhsBlockDims[lhsBlockDims.size() - (i - 1)]; } auto rhsBlockDims = rhsBlockMemDesc->getBlockDims(); @@ -168,11 +182,11 @@ static inline bool isPhycicalMemCompatible(const MemoryDesc& lhsMemDesc, const M rhsStridesDefault[rhsBlockDims.size() - 1] = 1; for (size_t i = 2; i <= rhsBlockDims.size(); i++) { rhsStridesDefault[rhsBlockDims.size() - i] = - rhsStridesDefault[rhsBlockDims.size() - (i - 1)] * rhsBlockDims[rhsBlockDims.size() - (i - 1)]; + rhsStridesDefault[rhsBlockDims.size() - (i - 1)] * rhsBlockDims[rhsBlockDims.size() - (i - 1)]; } - // this check needed to avoid inserting unnecessary reorders if the memory is used in place and the batch size is equal to 1 - // in nodes like concate and split + // this check needed to avoid inserting unnecessary reorders if the memory is used in place and the batch size is + // equal to 1 in nodes like concate and split size_t lhsSkipAxis = lhsBlockDims.size() > 0 && lhsBlockDims[0] == 1 ? 0 : Shape::UNDEFINED_DIM; size_t rhsSkipAxis = rhsBlockDims.size() > 0 && rhsBlockDims[0] == 1 ? 
0 : Shape::UNDEFINED_DIM; @@ -212,10 +226,16 @@ Edge::ReorderStatus Edge::needReorder() { bool optimized = false; auto inputPortDesc = getInputPortDesc(); auto outPortDesc = getOutputPortDesc(); + + if (inputPortDesc->getMemDesc()->getPrecision() == element::undefined) + return ReorderStatus::No; + // Check whether the child node may accept the parent produced tensor if (!outPortDesc->isCompatible(*inputPortDesc)) { - // Performance optimization which exploit the fact that some tensors do not need actual data reordering to be read using different descriptors - if (isPhycicalMemCompatible(*inputPortDesc->getMemDesc(), *outPortDesc->getMemDesc()) && !getParent()->isConstant()) { + // Performance optimization which exploit the fact that some tensors do not need actual data reordering to be + // read using different descriptors + if (isPhycicalMemCompatible(*inputPortDesc->getMemDesc(), *outPortDesc->getMemDesc()) && + !getParent()->isConstant()) { optimized = true; } else { return ReorderStatus::Regular; @@ -292,8 +312,8 @@ std::string Edge::hash() const { std::stringstream result; - return parentPtr->getName() + "_" + std::to_string(parent_port) + "_" + - childPtr->getName() + "_" + std::to_string(child_port); + return parentPtr->getName() + "_" + std::to_string(parent_port) + "_" + childPtr->getName() + "_" + + std::to_string(child_port); } void Edge::externalAllocate(WeightsSharing::Ptr weightsCache) { @@ -301,10 +321,13 @@ void Edge::externalAllocate(WeightsSharing::Ptr weightsCache) { return; if (weightsCache) { - auto alloc = [this] () { + auto alloc = [this]() { auto allocateFunc = [this](const MemoryDesc& inputDesc) -> MemoryPtr { auto parentPtr = getParent(); - return std::make_shared(parentPtr->getEngine(), inputDesc, nullptr, false); // no pads zeroing + return std::make_shared(parentPtr->getEngine(), + inputDesc, + nullptr, + false); // no pads zeroing }; allocateCommon(allocateFunc); @@ -410,13 +433,16 @@ const MemoryDesc& Edge::getOutputDesc() const { } const MemoryDesc& Edge::getDesc() const { + if (getInputDesc().getPrecision() == element::undefined) + return getInputDesc(); + if (!getInputDesc().isCompatible(getOutputDesc())) OPENVINO_THROW("Cannot get descriptor for edge: ", getParent()->getName(), "->", getChild()->getName()); return getInputDesc(); } -const IMemory &Edge::getMemory() { +const IMemory& Edge::getMemory() { auto memPtr = getMemoryPtr(); OPENVINO_ASSERT(memPtr != nullptr, " Dereferencing NULL memory in edge: ", *this); return *memPtr; @@ -426,7 +452,7 @@ MemoryPtr Edge::getMemoryPtr() const { return memoryPtr; } -void Edge::sharedMemFrom(const EdgePtr &edge) { +void Edge::sharedMemFrom(const EdgePtr& edge) { memoryFromEdge = edge; DEBUG_LOG(*this, " sharedMemFrom ", *edge); status = Status::NotAllocated; @@ -466,10 +492,8 @@ void Edge::init() { DEBUG_LOG(*this, " getBaseEdge() return itself"); changeStatus(Status::NeedAllocation); } else { - if (Type::Input == edgePtr->getParent()->getType() && - Type::MemoryInput != getParent()->getType() && - edgePtr->getParent()->isConstant() && - !edgePtr->getChild()->isConstant()) { + if (Type::Input == edgePtr->getParent()->getType() && Type::MemoryInput != getParent()->getType() && + edgePtr->getParent()->isConstant() && !edgePtr->getChild()->isConstant()) { changeStatus(Status::NeedAllocation); DEBUG_LOG(*this, " edge inplace from ", *edgePtr, " is broken!"); return; @@ -497,11 +521,11 @@ EdgePtr Edge::getBaseEdge(int look) { if ((childInPlacePort >= 0) && (look & LOOK_DOWN)) { auto ch_edges = 
getChild()->getChildEdgesAtPort(childInPlacePort); - auto &next_ch_edge = ch_edges[0]; + auto& next_ch_edge = ch_edges[0]; // Multiple connection to some out port // Will try to find inplace consumer - for (auto &ch_edge : ch_edges) { + for (auto& ch_edge : ch_edges) { if (ch_edge->getChild()->inPlaceInputPort(ch_edge->getOutputNum()) >= 0) { next_ch_edge = ch_edge; // To align with upstream-inplace, we stop searching once found the first inplace consumer @@ -517,14 +541,16 @@ EdgePtr Edge::getBaseEdge(int look) { for (auto edge : edgesForSamePort) { if (edge.get() != this) { // Return once found the first inplace consumer - if (edge->inPlace()) return edge; + if (edge->inPlace()) + return edge; } } // Return the first output edge as the base if there is no inPlace consumers // thus benefits zero-copy of outputs. for (auto edge : edgesForSamePort) { - if (Type::Output == edge->getChild()->getType()) return edge; + if (Type::Output == edge->getChild()->getType()) + return edge; } return edgesForSamePort[0]; @@ -571,7 +597,7 @@ NodePtr Edge::modifiedInPlace() const { for (size_t i = 0; i < outConfs.size(); ++i) { const auto& conf = outConfs[i]; if (childPort < 0 || conf.inPlace() != childPort || - Type::MemoryInput == childNode->getType()) { //exception type, it doesn't modify memory + Type::MemoryInput == childNode->getType()) { // exception type, it doesn't modify memory continue; } if (childNode->isExecutable()) { @@ -591,12 +617,14 @@ NodePtr Edge::modifiedInPlace() const { return nullptr; } -std::ostream& operator<<(std::ostream &os, const Edge& edge) { - return os << "(" << edge.getParent()->getName() << ")" << "[" << edge.getInputNum() << "] " +std::ostream& operator<<(std::ostream& os, const Edge& edge) { + return os << "(" << edge.getParent()->getName() << ")" + << "[" << edge.getInputNum() << "] " << "<->" - << "(" << edge.getChild()->getName() << ")" << "[" << edge.getOutputNum() << "]" + << "(" << edge.getChild()->getName() << ")" + << "[" << edge.getOutputNum() << "]" << ":" << Edge::statusToString(edge.getStatus()); } -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/edge.h b/src/plugins/intel_cpu/src/edge.h index 5c418b2665924d..38f49ff00db075 100644 --- a/src/plugins/intel_cpu/src/edge.h +++ b/src/plugins/intel_cpu/src/edge.h @@ -4,15 +4,15 @@ #pragma once +#include +#include + #include "cpu_shape.h" #include "internal_properties.hpp" #include "memory_desc/cpu_memory_desc.h" #include "nodes/node_config.h" #include "weights_cache.hpp" -#include -#include - namespace ov { namespace intel_cpu { @@ -24,23 +24,11 @@ using EdgeWeakPtr = std::weak_ptr; class Edge { public: - Edge(const std::shared_ptr& parent, - const std::shared_ptr& child, - int pr_port = 0, int ch_port = 0); - - enum class Status { - Uninitialized, - NeedAllocation, - NotAllocated, - Allocated, - Validated - }; - - enum class ReorderStatus { - Regular = 0, - Optimized = 1, - No = 2 - }; + Edge(const std::shared_ptr& parent, const std::shared_ptr& child, int pr_port = 0, int ch_port = 0); + + enum class Status { Uninitialized, NeedAllocation, NotAllocated, Allocated, Validated }; + + enum class ReorderStatus { Regular = 0, Optimized = 1, No = 2 }; enum LOOK { LOOK_UP = 1, LOOK_DOWN = 2, LOOK_BOTH = LOOK_UP | LOOK_DOWN }; @@ -52,15 +40,15 @@ class Edge { #define CASE(_status) \ case Status::_status: \ return #_status; - switch (status) { - CASE(Uninitialized); - CASE(NeedAllocation); - CASE(NotAllocated); - CASE(Allocated); - 
CASE(Validated); - } + switch (status) { + CASE(Uninitialized); + CASE(NeedAllocation); + CASE(NotAllocated); + CASE(Allocated); + CASE(Validated); + } #undef CASE - return "Unexpected"; + return "Unexpected"; } void changeStatus(Status state); @@ -87,7 +75,9 @@ class Edge { int getInputNum() const; int getOutputNum() const; - void setChildPort(const size_t port) { child_port = port; } + void setChildPort(const size_t port) { + child_port = port; + } void sharedMemFrom(const EdgePtr& edge); EdgePtr getSharedEdge() const; @@ -126,8 +116,7 @@ class Edge { friend class Graph; }; -std::ostream& operator<<(std::ostream &os, const Edge& edge); - -} // namespace intel_cpu -} // namespace ov +std::ostream& operator<<(std::ostream& os, const Edge& edge); +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/emitters/plugin/aarch64/jit_conversion_emitters.cpp b/src/plugins/intel_cpu/src/emitters/plugin/aarch64/jit_conversion_emitters.cpp index 7383b8b8f9ddab..43417942e8bc53 100644 --- a/src/plugins/intel_cpu/src/emitters/plugin/aarch64/jit_conversion_emitters.cpp +++ b/src/plugins/intel_cpu/src/emitters/plugin/aarch64/jit_conversion_emitters.cpp @@ -3,6 +3,7 @@ // #include "jit_conversion_emitters.hpp" + #include "emitters/utils.hpp" using namespace dnnl::impl::cpu::aarch64; @@ -25,27 +26,27 @@ namespace aarch64 { // does not distinguish ARMv8.2 with ARMv8.2-A, conversion between f16 and i16 will still use three // instructions f16 -> f32 -> i32 -> i16 (f16 <- f32 <- i32 <- i16). template -inline void jit_convert_emitter::cvt_f16_to_f32(const TReg &src, const TReg &dst) const { +inline void jit_convert_emitter::cvt_f16_to_f32(const TReg& src, const TReg& dst) const { h->fcvtl(dst.s4, src.h4); } template -inline void jit_convert_emitter::cvt_f32_to_f16(const TReg &src, const TReg &dst) const { +inline void jit_convert_emitter::cvt_f32_to_f16(const TReg& src, const TReg& dst) const { h->fcvtn(dst.h4, src.s4); } template -inline void jit_convert_emitter::cvt_f32_to_i32(const TReg &src, const TReg &dst) const { +inline void jit_convert_emitter::cvt_f32_to_i32(const TReg& src, const TReg& dst) const { h->fcvtzs(dst.s, src.s); } template -inline void jit_convert_emitter::cvt_i32_to_f32(const TReg &src, const TReg &dst) const { +inline void jit_convert_emitter::cvt_i32_to_f32(const TReg& src, const TReg& dst) const { h->scvtf(dst.s, src.s); } template -inline void jit_convert_emitter::cvt_i32_to_i16(const TReg &src, const TReg &dst, bool is_saturated) const { +inline void jit_convert_emitter::cvt_i32_to_i16(const TReg& src, const TReg& dst, bool is_saturated) const { if (is_saturated) { h->sqxtn(dst.h4, src.s4); } else { @@ -54,22 +55,25 @@ inline void jit_convert_emitter::cvt_i32_to_i16(const TReg &src, const TReg &dst } template -inline void jit_convert_emitter::cvt_i16_to_i32(const TReg &src, const TReg &dst) const { +inline void jit_convert_emitter::cvt_i16_to_i32(const TReg& src, const TReg& dst) const { h->sxtl(dst.s4, src.h4); } template -inline void jit_convert_emitter::cvt_f16_to_i16(const TReg &src, const TReg &dst) const { +inline void jit_convert_emitter::cvt_f16_to_i16(const TReg& src, const TReg& dst) const { h->fcvtzs(dst.h4, src.h4); } template -inline void jit_convert_emitter::cvt_i16_to_f16(const TReg &src, const TReg &dst) const { +inline void jit_convert_emitter::cvt_i16_to_f16(const TReg& src, const TReg& dst) const { h->scvtf(dst.h4, src.h4); } template -inline void jit_convert_emitter::cvt_i16_to_byte(const TReg &src, const TReg &dst, bool is_signed, 
bool is_saturated) const { +inline void jit_convert_emitter::cvt_i16_to_byte(const TReg& src, + const TReg& dst, + bool is_signed, + bool is_saturated) const { if (is_saturated) { if (is_signed) { h->sqxtn(dst.b8, src.h8); @@ -82,7 +86,7 @@ inline void jit_convert_emitter::cvt_i16_to_byte(const TReg &src, const TReg &ds } template -inline void jit_convert_emitter::cvt_byte_to_i16(const TReg &src, const TReg &dst, bool is_signed) const { +inline void jit_convert_emitter::cvt_byte_to_i16(const TReg& src, const TReg& dst, bool is_signed) const { if (is_signed) { h->sxtl(dst.h8, src.b8); } else { @@ -91,10 +95,13 @@ inline void jit_convert_emitter::cvt_byte_to_i16(const TReg &src, const TReg &ds } template -void jit_convert_emitter::jit_convert_process(const TReg &src, const TReg &dst, ov::element::Type input_type, ov::element::Type output_type, +void jit_convert_emitter::jit_convert_process(const TReg& src, + const TReg& dst, + ov::element::Type input_type, + ov::element::Type output_type, bool is_saturated) const { - if (input_type == output_type || (!is_saturated && - one_of(input_type, ov::element::i8, ov::element::u8) && one_of(output_type, ov::element::i8, ov::element::u8))) { + if (input_type == output_type || (!is_saturated && one_of(input_type, ov::element::i8, ov::element::u8) && + one_of(output_type, ov::element::i8, ov::element::u8))) { if (src.getIdx() != dst.getIdx()) { h->mov(dst.b16, src.b16); } @@ -102,119 +109,130 @@ void jit_convert_emitter::jit_convert_process(const TReg &src, const TReg &dst, } switch (output_type) { + case ov::element::f32: + switch (input_type) { + case ov::element::i32: + cvt_i32_to_f32(src, dst); + break; + case ov::element::f16: + cvt_f16_to_f32(src, dst); + break; + case ov::element::i8: + case ov::element::u8: + cvt_byte_to_i16(src, dst, input_type.is_signed()); + cvt_i16_to_i32(dst, dst); + cvt_i32_to_f32(dst, dst); + break; + default: + OV_CPU_JIT_EMITTER_THROW("Unsupported input type: ", input_type.get_type_name()); + } + break; + case ov::element::i32: + switch (input_type) { + case ov::element::f32: + cvt_f32_to_i32(src, dst); + break; + case ov::element::f16: + cvt_f16_to_f32(src, dst); + cvt_f32_to_i32(dst, dst); + break; + case ov::element::i8: + case ov::element::u8: + cvt_byte_to_i16(src, dst, input_type.is_signed()); + cvt_i16_to_i32(dst, dst); + break; + default: + OV_CPU_JIT_EMITTER_THROW("Unsupported input type: ", input_type.get_type_name()); + } + break; + case ov::element::f16: + switch (input_type) { + case ov::element::f32: + cvt_f32_to_f16(src, dst); + break; + case ov::element::i32: + cvt_i32_to_f32(src, dst); + cvt_f32_to_f16(dst, dst); + break; + case ov::element::i8: + case ov::element::u8: + cvt_byte_to_i16(src, dst, input_type.is_signed()); + cvt_i16_to_i32(dst, dst); + cvt_i32_to_f32(dst, dst); + cvt_f32_to_f16(dst, dst); + break; + default: + OV_CPU_JIT_EMITTER_THROW("Unsupported input type: ", input_type.get_type_name()); + } + break; + case ov::element::i8: + case ov::element::u8: + switch (input_type) { case ov::element::f32: - switch (input_type) { - case ov::element::i32: - cvt_i32_to_f32(src, dst); - break; - case ov::element::f16: - cvt_f16_to_f32(src, dst); - break; - case ov::element::i8: - case ov::element::u8: - cvt_byte_to_i16(src, dst, input_type.is_signed()); - cvt_i16_to_i32(dst, dst); - cvt_i32_to_f32(dst, dst); - break; - default: - OV_CPU_JIT_EMITTER_THROW("Unsupported input type: ", input_type.get_type_name()); - } + cvt_f32_to_i32(src, dst); + cvt_i32_to_i16(dst, dst, is_saturated); + 
cvt_i16_to_byte(dst, dst, output_type.is_signed(), is_saturated); break; case ov::element::i32: - switch (input_type) { - case ov::element::f32: - cvt_f32_to_i32(src, dst); - break; - case ov::element::f16: - cvt_f16_to_f32(src, dst); - cvt_f32_to_i32(dst, dst); - break; - case ov::element::i8: - case ov::element::u8: - cvt_byte_to_i16(src, dst, input_type.is_signed()); - cvt_i16_to_i32(dst, dst); - break; - default: - OV_CPU_JIT_EMITTER_THROW("Unsupported input type: ", input_type.get_type_name()); - } + cvt_i32_to_i16(src, dst, is_saturated); + cvt_i16_to_byte(dst, dst, output_type.is_signed(), is_saturated); break; case ov::element::f16: - switch (input_type) { - case ov::element::f32: - cvt_f32_to_f16(src, dst); - break; - case ov::element::i32: - cvt_i32_to_f32(src, dst); - cvt_f32_to_f16(dst, dst); - break; - case ov::element::i8: - case ov::element::u8: - cvt_byte_to_i16(src, dst, input_type.is_signed()); - cvt_i16_to_i32(dst, dst); - cvt_i32_to_f32(dst, dst); - cvt_f32_to_f16(dst, dst); - break; - default: - OV_CPU_JIT_EMITTER_THROW("Unsupported input type: ", input_type.get_type_name()); - } + cvt_f16_to_f32(src, dst); + cvt_f32_to_i32(dst, dst); + cvt_i32_to_i16(dst, dst, is_saturated); + cvt_i16_to_byte(dst, dst, output_type.is_signed(), is_saturated); break; case ov::element::i8: case ov::element::u8: - switch (input_type) { - case ov::element::f32: - cvt_f32_to_i32(src, dst); - cvt_i32_to_i16(dst, dst, is_saturated); - cvt_i16_to_byte(dst, dst, output_type.is_signed(), is_saturated); - break; - case ov::element::i32: - cvt_i32_to_i16(src, dst, is_saturated); - cvt_i16_to_byte(dst, dst, output_type.is_signed(), is_saturated); - break; - case ov::element::f16: - cvt_f16_to_f32(src, dst); - cvt_f32_to_i32(dst, dst); - cvt_i32_to_i16(dst, dst, is_saturated); - cvt_i16_to_byte(dst, dst, output_type.is_signed(), is_saturated); - break; - case ov::element::i8: - case ov::element::u8: - cvt_byte_to_i16(src, dst, input_type.is_signed()); - cvt_i16_to_byte(dst, dst, output_type.is_signed(), is_saturated); - break; - default: - OV_CPU_JIT_EMITTER_THROW("Unsupported input type: ", input_type.get_type_name()); - } + cvt_byte_to_i16(src, dst, input_type.is_signed()); + cvt_i16_to_byte(dst, dst, output_type.is_signed(), is_saturated); break; default: - OV_CPU_JIT_EMITTER_THROW("Unsupported output type: ", output_type.get_type_name()); + OV_CPU_JIT_EMITTER_THROW("Unsupported input type: ", input_type.get_type_name()); + } + break; + default: + OV_CPU_JIT_EMITTER_THROW("Unsupported output type: ", output_type.get_type_name()); } } -jit_convert_emitter::jit_convert_emitter(jit_generator *host, cpu_isa_t host_isa, const std::shared_ptr& node, ov::element::Type exec_prc) -: jit_emitter(host, host_isa, exec_prc) { +jit_convert_emitter::jit_convert_emitter(jit_generator* host, + cpu_isa_t host_isa, + const std::shared_ptr& node, + ov::element::Type exec_prc) + : jit_emitter(host, host_isa, exec_prc) { input_type = node->get_input_element_type(0); output_type = node->get_output_element_type(0); } void jit_convert_emitter::validate_types() const { - OV_CPU_JIT_EMITTER_ASSERT(one_of(input_type, ov::element::f32, ov::element::i32, ov::element::f16, ov::element::i8, ov::element::u8), - "Unsupported input type: ", input_type.get_type_name()); - OV_CPU_JIT_EMITTER_ASSERT(one_of(output_type, ov::element::f32, ov::element::i32, ov::element::f16, ov::element::i8, ov::element::u8), - "Unsupported output type: ", output_type.get_type_name()); + OV_CPU_JIT_EMITTER_ASSERT( + one_of(input_type, ov::element::f32, 
ov::element::i32, ov::element::f16, ov::element::i8, ov::element::u8), + "Unsupported input type: ", + input_type.get_type_name()); + OV_CPU_JIT_EMITTER_ASSERT( + one_of(output_type, ov::element::f32, ov::element::i32, ov::element::f16, ov::element::i8, ov::element::u8), + "Unsupported output type: ", + output_type.get_type_name()); } -size_t jit_convert_emitter::get_inputs_count() const { return 1; } +size_t jit_convert_emitter::get_inputs_count() const { + return 1; +} void jit_convert_emitter::emit_data() const { jit_emitter::emit_data(); } -jit_convert_truncation_emitter::jit_convert_truncation_emitter(jit_generator *host, cpu_isa_t host_isa, - const std::shared_ptr& node, ov::element::Type exec_prc) - : jit_convert_emitter(host, host_isa, node, exec_prc) { -} +jit_convert_truncation_emitter::jit_convert_truncation_emitter(jit_generator* host, + cpu_isa_t host_isa, + const std::shared_ptr& node, + ov::element::Type exec_prc) + : jit_convert_emitter(host, host_isa, node, exec_prc) {} -void jit_convert_truncation_emitter::emit_impl(const std::vector &in_idxs, const std::vector &out_idxs) const { +void jit_convert_truncation_emitter::emit_impl(const std::vector& in_idxs, + const std::vector& out_idxs) const { validate_types(); if (host_isa_ == dnnl::impl::cpu::aarch64::asimd) { emit_isa(in_idxs, out_idxs); @@ -224,19 +242,22 @@ void jit_convert_truncation_emitter::emit_impl(const std::vector &in_idx } template -void jit_convert_truncation_emitter::emit_isa(const std::vector &in_idxs, const std::vector &out_idxs) const { +void jit_convert_truncation_emitter::emit_isa(const std::vector& in_idxs, + const std::vector& out_idxs) const { using TReg = typename dnnl::impl::cpu::aarch64::cpu_isa_traits::TReg; TReg src = TReg(in_idxs[0]); TReg dst = TReg(out_idxs[0]); jit_convert_process(src, dst, input_type, output_type, false); } -jit_convert_saturation_emitter::jit_convert_saturation_emitter(jit_generator *host, cpu_isa_t host_isa, - const std::shared_ptr& node, ov::element::Type exec_prc) - : jit_convert_emitter(host, host_isa, node, exec_prc) { -} +jit_convert_saturation_emitter::jit_convert_saturation_emitter(jit_generator* host, + cpu_isa_t host_isa, + const std::shared_ptr& node, + ov::element::Type exec_prc) + : jit_convert_emitter(host, host_isa, node, exec_prc) {} -void jit_convert_saturation_emitter::emit_impl(const std::vector &in_idxs, const std::vector &out_idxs) const { +void jit_convert_saturation_emitter::emit_impl(const std::vector& in_idxs, + const std::vector& out_idxs) const { validate_types(); if (host_isa_ == dnnl::impl::cpu::aarch64::asimd) { emit_isa(in_idxs, out_idxs); @@ -246,13 +267,14 @@ void jit_convert_saturation_emitter::emit_impl(const std::vector &in_idx } template -void jit_convert_saturation_emitter::emit_isa(const std::vector &in_idxs, const std::vector &out_idxs) const { +void jit_convert_saturation_emitter::emit_isa(const std::vector& in_idxs, + const std::vector& out_idxs) const { using TReg = typename dnnl::impl::cpu::aarch64::cpu_isa_traits::TReg; TReg src = TReg(in_idxs[0]); TReg dst = TReg(out_idxs[0]); jit_convert_process(src, dst, input_type, output_type, true); } -} // namespace aarch64 -} // namespace intel_cpu -} // namespace ov +} // namespace aarch64 +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/emitters/plugin/aarch64/jit_conversion_emitters.hpp b/src/plugins/intel_cpu/src/emitters/plugin/aarch64/jit_conversion_emitters.hpp index af0310f736e5c9..bc9bb1e5005672 100644 --- 
a/src/plugins/intel_cpu/src/emitters/plugin/aarch64/jit_conversion_emitters.hpp +++ b/src/plugins/intel_cpu/src/emitters/plugin/aarch64/jit_conversion_emitters.hpp @@ -12,8 +12,10 @@ namespace aarch64 { class jit_convert_emitter : public jit_emitter { public: - jit_convert_emitter(dnnl::impl::cpu::aarch64::jit_generator *host, dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, - const std::shared_ptr& n, ov::element::Type exec_prc = ov::element::f32); + jit_convert_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, + dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, + const std::shared_ptr& n, + ov::element::Type exec_prc = ov::element::f32); size_t get_inputs_count() const override; @@ -21,7 +23,10 @@ class jit_convert_emitter : public jit_emitter { void emit_data() const override; void validate_types() const; template - void jit_convert_process(const TReg &src, const TReg &dst, ov::element::Type input_type, ov::element::Type output_type, + void jit_convert_process(const TReg& src, + const TReg& dst, + ov::element::Type input_type, + ov::element::Type output_type, bool is_saturated) const; ov::element::Type input_type; @@ -29,25 +34,25 @@ class jit_convert_emitter : public jit_emitter { private: template - inline void cvt_f16_to_f32(const TReg &src, const TReg &dst) const; + inline void cvt_f16_to_f32(const TReg& src, const TReg& dst) const; template - inline void cvt_f32_to_f16(const TReg &src, const TReg &dst) const; + inline void cvt_f32_to_f16(const TReg& src, const TReg& dst) const; template - inline void cvt_f32_to_i32(const TReg &src, const TReg &dst) const; + inline void cvt_f32_to_i32(const TReg& src, const TReg& dst) const; template - inline void cvt_i32_to_f32(const TReg &src, const TReg &dst) const; + inline void cvt_i32_to_f32(const TReg& src, const TReg& dst) const; template - inline void cvt_i32_to_i16(const TReg &src, const TReg &dst, bool is_saturated) const; + inline void cvt_i32_to_i16(const TReg& src, const TReg& dst, bool is_saturated) const; template - inline void cvt_i16_to_i32(const TReg &src, const TReg &dst) const; + inline void cvt_i16_to_i32(const TReg& src, const TReg& dst) const; template - inline void cvt_f16_to_i16(const TReg &src, const TReg &dst) const; + inline void cvt_f16_to_i16(const TReg& src, const TReg& dst) const; template - inline void cvt_i16_to_f16(const TReg &src, const TReg &dst) const; + inline void cvt_i16_to_f16(const TReg& src, const TReg& dst) const; template - inline void cvt_i16_to_byte(const TReg &src, const TReg &dst, bool is_signed, bool is_saturated) const; + inline void cvt_i16_to_byte(const TReg& src, const TReg& dst, bool is_signed, bool is_saturated) const; template - inline void cvt_byte_to_i16(const TReg &src, const TReg &dst, bool is_signed) const; + inline void cvt_byte_to_i16(const TReg& src, const TReg& dst, bool is_signed) const; }; // This emitter is covered by specification of "Convert" operation. The implementation uses a "warp-around" conversion. 
@@ -56,13 +61,15 @@ class jit_convert_emitter : public jit_emitter { // 129 -> -127 class jit_convert_truncation_emitter : public jit_convert_emitter { public: - jit_convert_truncation_emitter(dnnl::impl::cpu::aarch64::jit_generator *host, dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, - const std::shared_ptr& n, ov::element::Type exec_prc = ov::element::f32); + jit_convert_truncation_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, + dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, + const std::shared_ptr& n, + ov::element::Type exec_prc = ov::element::f32); private: void emit_impl(const std::vector& in_idxs, const std::vector& out_idxs) const override; template - void emit_isa(const std::vector &in_idxs, const std::vector &out_idxs) const; + void emit_isa(const std::vector& in_idxs, const std::vector& out_idxs) const; }; // This emitter is covered by the common dnnl behavior. The implementation uses a "saturation" conversion. @@ -71,15 +78,17 @@ class jit_convert_truncation_emitter : public jit_convert_emitter { // 129 -> 127 class jit_convert_saturation_emitter : public jit_convert_emitter { public: - jit_convert_saturation_emitter(dnnl::impl::cpu::aarch64::jit_generator *host, dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, - const std::shared_ptr& n, ov::element::Type exec_prc = ov::element::f32); + jit_convert_saturation_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, + dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, + const std::shared_ptr& n, + ov::element::Type exec_prc = ov::element::f32); private: void emit_impl(const std::vector& in_idxs, const std::vector& out_idxs) const override; template - void emit_isa(const std::vector &in_idxs, const std::vector &out_idxs) const; + void emit_isa(const std::vector& in_idxs, const std::vector& out_idxs) const; }; -} // namespace aarch64 -} // namespace intel_cpu -} // namespace ov +} // namespace aarch64 +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/emitters/plugin/aarch64/jit_eltwise_emitters.cpp b/src/plugins/intel_cpu/src/emitters/plugin/aarch64/jit_eltwise_emitters.cpp index 83cdd252f9bc6f..534470c746f2fe 100644 --- a/src/plugins/intel_cpu/src/emitters/plugin/aarch64/jit_eltwise_emitters.cpp +++ b/src/plugins/intel_cpu/src/emitters/plugin/aarch64/jit_eltwise_emitters.cpp @@ -3,11 +3,12 @@ // #include "jit_eltwise_emitters.hpp" -#include "transformations/cpu_opset/common/op/swish_cpu.hpp" #include + #include "common/utils.hpp" #include "emitters/utils.hpp" +#include "transformations/cpu_opset/common/op/swish_cpu.hpp" namespace ov { namespace intel_cpu { @@ -21,34 +22,35 @@ namespace { ov::element::Type get_arithmetic_binary_exec_precision(const std::shared_ptr& n) { std::vector input_precisions; for (const auto& input : n->inputs()) { - input_precisions.push_back( - input.get_source_output().get_element_type()); + input_precisions.push_back(input.get_source_output().get_element_type()); } - assert(std::all_of( - input_precisions.begin(), - input_precisions.end(), - [&input_precisions](const ov::element::Type& precision) {return precision == input_precisions[0]; })); + assert(std::all_of(input_precisions.begin(), + input_precisions.end(), + [&input_precisions](const ov::element::Type& precision) { + return precision == input_precisions[0]; + })); return input_precisions[0]; } -} // namespace +} // namespace /// ABS /// jit_abs_emitter::jit_abs_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, const std::shared_ptr& node) - : jit_emitter(host, 
host_isa, node, get_arithmetic_binary_exec_precision(node)) { -} + : jit_emitter(host, host_isa, node, get_arithmetic_binary_exec_precision(node)) {} jit_abs_emitter::jit_abs_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, - const ov::element::Type exec_prc) : jit_emitter(host, host_isa, exec_prc) { -} + const ov::element::Type exec_prc) + : jit_emitter(host, host_isa, exec_prc) {} -size_t jit_abs_emitter::get_inputs_count() const { return 1; } +size_t jit_abs_emitter::get_inputs_count() const { + return 1; +} -void jit_abs_emitter::emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_abs_emitter::emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const { if (host_isa_ == dnnl::impl::cpu::aarch64::asimd) { emit_isa(in_vec_idxs, out_vec_idxs); } else { @@ -57,7 +59,7 @@ void jit_abs_emitter::emit_impl(const std::vector &in_vec_idxs, const st } template -void jit_abs_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_abs_emitter::emit_isa(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const { OV_CPU_JIT_EMITTER_ASSERT(exec_prc_ == ov::element::f32, "unsupported precision: " + exec_prc_.to_string()); using TReg = typename dnnl::impl::cpu::aarch64::cpu_isa_traits::TReg; @@ -75,17 +77,18 @@ std::set> jit_abs_emitter::get_supported_precisions(c jit_add_emitter::jit_add_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, const std::shared_ptr& node) - : jit_emitter(host, host_isa, node, get_arithmetic_binary_exec_precision(node)) { -} + : jit_emitter(host, host_isa, node, get_arithmetic_binary_exec_precision(node)) {} jit_add_emitter::jit_add_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, - const ov::element::Type exec_prc) : jit_emitter(host, host_isa, exec_prc) { -} + const ov::element::Type exec_prc) + : jit_emitter(host, host_isa, exec_prc) {} -size_t jit_add_emitter::get_inputs_count() const { return 2; } +size_t jit_add_emitter::get_inputs_count() const { + return 2; +} -void jit_add_emitter::emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_add_emitter::emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const { if (host_isa_ == dnnl::impl::cpu::aarch64::asimd) { emit_isa(in_vec_idxs, out_vec_idxs); } else { @@ -94,7 +97,7 @@ void jit_add_emitter::emit_impl(const std::vector &in_vec_idxs, const st } template -void jit_add_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_add_emitter::emit_isa(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const { OV_CPU_JIT_EMITTER_ASSERT(exec_prc_ == ov::element::f32, "unsupported precision: " + exec_prc_.to_string()); using TReg = typename dnnl::impl::cpu::aarch64::cpu_isa_traits::TReg; @@ -113,7 +116,7 @@ std::set> jit_add_emitter::get_supported_precisions(c jit_clamp_emitter::jit_clamp_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, const std::shared_ptr& node) - : jit_emitter(host, host_isa, node, get_arithmetic_binary_exec_precision(node)) { + : jit_emitter(host, host_isa, node, get_arithmetic_binary_exec_precision(node)) { const auto clamp = std::dynamic_pointer_cast(node); if (clamp == nullptr) { OV_CPU_JIT_EMITTER_THROW("Can't cast to ov::op::v0::Clamp"); @@ -129,24 +132,31 @@ 
jit_clamp_emitter::jit_clamp_emitter(dnnl::impl::cpu::aarch64::jit_generator* ho const float min, const float max, const ov::element::Type exec_prc) - : jit_emitter(host, host_isa, exec_prc), - min(min), - max(max) { + : jit_emitter(host, host_isa, exec_prc), + min(min), + max(max) { prepare_table(); } -size_t jit_clamp_emitter::get_inputs_count() const { return 1; } +size_t jit_clamp_emitter::get_inputs_count() const { + return 1; +} -size_t jit_clamp_emitter::get_aux_vecs_count() const { return 1; } +size_t jit_clamp_emitter::get_aux_vecs_count() const { + return 1; +} -size_t jit_clamp_emitter::get_aux_gprs_count() const { return 1; } +size_t jit_clamp_emitter::get_aux_gprs_count() const { + return 1; +} void jit_clamp_emitter::register_table_entries() { push_arg_entry_of("min", dnnl::impl::float2int(min), true); push_arg_entry_of("max", dnnl::impl::float2int(max), true); } -void jit_clamp_emitter::emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_clamp_emitter::emit_impl(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { if (host_isa_ == dnnl::impl::cpu::aarch64::asimd) { emit_isa(in_vec_idxs, out_vec_idxs); } else { @@ -155,7 +165,8 @@ void jit_clamp_emitter::emit_impl(const std::vector &in_vec_idxs, const } template -void jit_clamp_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_clamp_emitter::emit_isa(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { OV_CPU_JIT_EMITTER_ASSERT(exec_prc_ == ov::element::f32, "unsupported precision: " + exec_prc_.to_string()); using TReg = typename dnnl::impl::cpu::aarch64::cpu_isa_traits::TReg; @@ -169,24 +180,28 @@ void jit_clamp_emitter::emit_isa(const std::vector &in_vec_idxs, const s h->fmin(dst.s, dst.s, aux.s); } -std::set> jit_clamp_emitter::get_supported_precisions(const std::shared_ptr& node) { +std::set> jit_clamp_emitter::get_supported_precisions( + const std::shared_ptr& node) { return {{element::f32}}; } /// DIVIDE /// -jit_divide_emitter::jit_divide_emitter(dnnl::impl::cpu::aarch64::jit_generator *host, - dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, - const std::shared_ptr& node) - : jit_emitter(host, host_isa, node, get_arithmetic_binary_exec_precision(node)) {} +jit_divide_emitter::jit_divide_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, + dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, + const std::shared_ptr& node) + : jit_emitter(host, host_isa, node, get_arithmetic_binary_exec_precision(node)) {} -jit_divide_emitter::jit_divide_emitter(dnnl::impl::cpu::aarch64::jit_generator *host, - dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, - const ov::element::Type exec_prc) - : jit_emitter(host, host_isa, exec_prc) {} +jit_divide_emitter::jit_divide_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, + dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, + const ov::element::Type exec_prc) + : jit_emitter(host, host_isa, exec_prc) {} -size_t jit_divide_emitter::get_inputs_count() const { return 2; } +size_t jit_divide_emitter::get_inputs_count() const { + return 2; +} -void jit_divide_emitter::emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_divide_emitter::emit_impl(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { if (host_isa_ == dnnl::impl::cpu::aarch64::asimd) { emit_isa(in_vec_idxs, out_vec_idxs); } else { @@ -195,7 +210,8 @@ void jit_divide_emitter::emit_impl(const std::vector &in_vec_idxs, const } template -void 
jit_divide_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_divide_emitter::emit_isa(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { OV_CPU_JIT_EMITTER_ASSERT(exec_prc_ == ov::element::f32, "unsupported precision: " + exec_prc_.to_string()); using TReg = typename dnnl::impl::cpu::aarch64::cpu_isa_traits::TReg; @@ -206,35 +222,44 @@ void jit_divide_emitter::emit_isa(const std::vector &in_vec_idxs, const h->uni_fdiv(dst.s, src0.s, src1.s); } -std::set> jit_divide_emitter::get_supported_precisions(const std::shared_ptr& node) { +std::set> jit_divide_emitter::get_supported_precisions( + const std::shared_ptr& node) { return {{element::f32, element::f32}}; } /// EQUAL /// -jit_equal_emitter::jit_equal_emitter(dnnl::impl::cpu::aarch64::jit_generator *host, +jit_equal_emitter::jit_equal_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, const std::shared_ptr& node) - : jit_emitter(host, host_isa, get_arithmetic_binary_exec_precision(node)) { + : jit_emitter(host, host_isa, get_arithmetic_binary_exec_precision(node)) { prepare_table(); } -jit_equal_emitter::jit_equal_emitter(dnnl::impl::cpu::aarch64::jit_generator *host, +jit_equal_emitter::jit_equal_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, const ov::element::Type exec_prc) - : jit_emitter(host, host_isa, exec_prc) { + : jit_emitter(host, host_isa, exec_prc) { prepare_table(); } -size_t jit_equal_emitter::get_inputs_count() const { return 2; } +size_t jit_equal_emitter::get_inputs_count() const { + return 2; +} -size_t jit_equal_emitter::get_aux_vecs_count() const { return 1; } +size_t jit_equal_emitter::get_aux_vecs_count() const { + return 1; +} -size_t jit_equal_emitter::get_aux_gprs_count() const { return 1; } +size_t jit_equal_emitter::get_aux_gprs_count() const { + return 1; +} -std::set> jit_equal_emitter::get_supported_precisions(const std::shared_ptr& node) { +std::set> jit_equal_emitter::get_supported_precisions( + const std::shared_ptr& node) { return {{element::f32, element::f32}}; } -void jit_equal_emitter::emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const { +void jit_equal_emitter::emit_impl(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { if (host_isa_ == dnnl::impl::cpu::aarch64::asimd) { emit_isa(in_vec_idxs, out_vec_idxs); } else { @@ -243,7 +268,8 @@ void jit_equal_emitter::emit_impl(const std::vector& in_vec_idxs, const } template -void jit_equal_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_equal_emitter::emit_isa(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { OV_CPU_JIT_EMITTER_ASSERT(exec_prc_ == ov::element::f32, "unsupported precision: " + exec_prc_.to_string()); using TReg = typename dnnl::impl::cpu::aarch64::cpu_isa_traits::TReg; @@ -266,7 +292,7 @@ void jit_equal_emitter::register_table_entries() { jit_elu_emitter::jit_elu_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, const std::shared_ptr& node) - : jit_emitter(host, host_isa, get_arithmetic_binary_exec_precision(node)) { + : jit_emitter(host, host_isa, get_arithmetic_binary_exec_precision(node)) { const auto elu = std::dynamic_pointer_cast(node); if (elu == nullptr) { OV_CPU_JIT_EMITTER_THROW("Can't cast to ov::op::v0::Clamp"); @@ -280,12 +306,16 @@ 
jit_elu_emitter::jit_elu_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, jit_elu_emitter::jit_elu_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, const float alpha, - const ov::element::Type exec_prc) : jit_emitter(host, host_isa, exec_prc), alpha(alpha) { + const ov::element::Type exec_prc) + : jit_emitter(host, host_isa, exec_prc), + alpha(alpha) { prepare_table(); exp_emitter = std::make_unique(h, host_isa, exec_prc); } -size_t jit_elu_emitter::get_inputs_count() const { return 1; } +size_t jit_elu_emitter::get_inputs_count() const { + return 1; +} size_t jit_elu_emitter::get_aux_vecs_count() const { return std::max(exp_emitter->get_aux_vecs_count() + 1ull, 2ull); @@ -295,7 +325,7 @@ size_t jit_elu_emitter::get_aux_gprs_count() const { return exp_emitter->get_aux_gprs_count() + 1; } -void jit_elu_emitter::emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_elu_emitter::emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const { if (host_isa_ == dnnl::impl::cpu::aarch64::asimd) { emit_isa(in_vec_idxs, out_vec_idxs); } else { @@ -304,7 +334,7 @@ void jit_elu_emitter::emit_impl(const std::vector &in_vec_idxs, const st } template -void jit_elu_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_elu_emitter::emit_isa(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const { OV_CPU_JIT_EMITTER_ASSERT(exec_prc_ == ov::element::f32, "unsupported precision: " + exec_prc_.to_string()); using TReg = typename dnnl::impl::cpu::aarch64::cpu_isa_traits::TReg; @@ -315,11 +345,7 @@ void jit_elu_emitter::emit_isa(const std::vector &in_vec_idxs, const std h->mov(vmm_aux1.b16, vmm_src.b16); // compute exponent - exp_emitter->emit_code( - { vmm_src.getIdx() }, - out_vec_idxs, - aux_vec_idxs, - aux_gpr_idxs); + exp_emitter->emit_code({vmm_src.getIdx()}, out_vec_idxs, aux_vec_idxs, aux_gpr_idxs); // alpha * (exp(x) - 1) const TReg vmm_aux0(aux_vec_idxs[0]); @@ -351,23 +377,30 @@ std::set> jit_elu_emitter::get_supported_precisions(c jit_exp_emitter::jit_exp_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, const std::shared_ptr& node) - : jit_emitter(host, host_isa, node, get_arithmetic_binary_exec_precision(node)) { + : jit_emitter(host, host_isa, node, get_arithmetic_binary_exec_precision(node)) { prepare_table(); } jit_exp_emitter::jit_exp_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, - const ov::element::Type exec_prc) : jit_emitter(host, host_isa, exec_prc) { + const ov::element::Type exec_prc) + : jit_emitter(host, host_isa, exec_prc) { prepare_table(); } -size_t jit_exp_emitter::get_inputs_count() const { return 1; } +size_t jit_exp_emitter::get_inputs_count() const { + return 1; +} -size_t jit_exp_emitter::get_aux_vecs_count() const { return 4; } +size_t jit_exp_emitter::get_aux_vecs_count() const { + return 4; +} -size_t jit_exp_emitter::get_aux_gprs_count() const { return 1; } +size_t jit_exp_emitter::get_aux_gprs_count() const { + return 1; +} -void jit_exp_emitter::emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_exp_emitter::emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const { if (host_isa_ == dnnl::impl::cpu::aarch64::asimd) { emit_isa(in_vec_idxs, out_vec_idxs); } else { @@ -376,7 +409,7 @@ void jit_exp_emitter::emit_impl(const 
std::vector &in_vec_idxs, const st } template -void jit_exp_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_exp_emitter::emit_isa(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const { if (exec_prc_ != ov::element::f32) { OV_CPU_JIT_EMITTER_ASSERT(exec_prc_ == ov::element::f32, "unsupported precision: " + exec_prc_.to_string()); } @@ -484,17 +517,19 @@ std::set> jit_exp_emitter::get_supported_precisions(c jit_floor_emitter::jit_floor_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, const std::shared_ptr& node) - : jit_emitter(host, host_isa, node, get_arithmetic_binary_exec_precision(node)) { -} + : jit_emitter(host, host_isa, node, get_arithmetic_binary_exec_precision(node)) {} jit_floor_emitter::jit_floor_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, - const ov::element::Type exec_prc) : jit_emitter(host, host_isa, exec_prc) { -} + const ov::element::Type exec_prc) + : jit_emitter(host, host_isa, exec_prc) {} -size_t jit_floor_emitter::get_inputs_count() const { return 1; } +size_t jit_floor_emitter::get_inputs_count() const { + return 1; +} -void jit_floor_emitter::emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_floor_emitter::emit_impl(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { if (host_isa_ == dnnl::impl::cpu::aarch64::asimd) { emit_isa(in_vec_idxs, out_vec_idxs); } else { @@ -503,7 +538,8 @@ void jit_floor_emitter::emit_impl(const std::vector &in_vec_idxs, const } template -void jit_floor_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_floor_emitter::emit_isa(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { OV_CPU_JIT_EMITTER_ASSERT(exec_prc_ == ov::element::f32, "unsupported precision: " + exec_prc_.to_string()); using TReg = typename dnnl::impl::cpu::aarch64::cpu_isa_traits::TReg; @@ -512,29 +548,83 @@ void jit_floor_emitter::emit_isa(const std::vector &in_vec_idxs, const s h->frintm(dst.s, src.s); } -std::set> jit_floor_emitter::get_supported_precisions(const std::shared_ptr& node) { +std::set> jit_floor_emitter::get_supported_precisions( + const std::shared_ptr& node) { return {{element::f32}}; } +/// FLOOR_MOD /// +jit_floor_mod_emitter::jit_floor_mod_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, + dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, + const std::shared_ptr& node) + : jit_emitter(host, host_isa, node, get_arithmetic_binary_exec_precision(node)) {} + +jit_floor_mod_emitter::jit_floor_mod_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, + dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, + const ov::element::Type exec_prc) + : jit_emitter(host, host_isa, exec_prc) {} + +size_t jit_floor_mod_emitter::get_inputs_count() const { + return 2; +} + +size_t jit_floor_mod_emitter::get_aux_vecs_count() const { + return 1; +} + +void jit_floor_mod_emitter::emit_impl(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { + if (host_isa_ == dnnl::impl::cpu::aarch64::asimd) { + emit_isa(in_vec_idxs, out_vec_idxs); + } else { + OV_CPU_JIT_EMITTER_THROW("Can't create jit eltwise kernel"); + } +} + +template +void jit_floor_mod_emitter::emit_isa(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { + OV_CPU_JIT_EMITTER_ASSERT(exec_prc_ == ov::element::f32, "unsupported precision: " + exec_prc_.to_string()); 
+
+    using TReg = typename dnnl::impl::cpu::aarch64::cpu_isa_traits::TReg;
+
+    TReg dividend = TReg(in_vec_idxs[0]);
+    TReg divisor = TReg(in_vec_idxs[1]);
+    TReg r = TReg(out_vec_idxs[0]);
+    TReg aux = TReg(aux_vec_idxs[0]);
+
+    h->fdiv(aux.s, dividend.s, divisor.s);
+    h->frintm(aux.s, aux.s);
+    h->fmul(aux.s, aux.s, divisor.s);
+    h->fsub(r.s, dividend.s, aux.s);
+}
+
+std::set> jit_floor_mod_emitter::get_supported_precisions(
+    const std::shared_ptr& node) {
+    return {{element::f32, element::f32}};
+}
+
 /// CEILING ///
-//Initialization of the emitter, taking node as input
+// Initialization of the emitter, taking node as input
 jit_ceiling_emitter::jit_ceiling_emitter(dnnl::impl::cpu::aarch64::jit_generator* host,
                                          dnnl::impl::cpu::aarch64::cpu_isa_t host_isa,
                                          const std::shared_ptr& node)
-    : jit_emitter(host, host_isa, node, get_arithmetic_binary_exec_precision(node)) {
-}
+    : jit_emitter(host, host_isa, node, get_arithmetic_binary_exec_precision(node)) {}
 
-//Initialization of emitter, without taking node as input
+// Initialization of emitter, without taking node as input
 jit_ceiling_emitter::jit_ceiling_emitter(dnnl::impl::cpu::aarch64::jit_generator* host,
                                          dnnl::impl::cpu::aarch64::cpu_isa_t host_isa,
-                                         const ov::element::Type exec_prc) : jit_emitter(host, host_isa, exec_prc) {
-}
+                                         const ov::element::Type exec_prc)
+    : jit_emitter(host, host_isa, exec_prc) {}
 
-//This will tell the JIT compiler that how many inputs the ceiling operation requires (here 1)
-size_t jit_ceiling_emitter::get_inputs_count() const { return 1; }
+// This will tell the JIT compiler that how many inputs the ceiling operation requires (here 1)
+size_t jit_ceiling_emitter::get_inputs_count() const {
+    return 1;
+}
 
-//Main implementation method that emits the JIT code
-void jit_ceiling_emitter::emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const {
+// Main implementation method that emits the JIT code
+void jit_ceiling_emitter::emit_impl(const std::vector& in_vec_idxs,
+                                    const std::vector& out_vec_idxs) const {
     if (host_isa_ == dnnl::impl::cpu::aarch64::asimd) {
         emit_isa(in_vec_idxs, out_vec_idxs);
     } else {
@@ -545,7 +635,8 @@ void jit_ceiling_emitter::emit_impl(const std::vector &in_vec_idxs, cons
 // Template method that generates actual instruction sequence for ceiling operation
 // The h->frintp() method rounds up the floating value to the nearest integer.
template -void jit_ceiling_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_ceiling_emitter::emit_isa(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { OV_CPU_JIT_EMITTER_ASSERT(exec_prc_ == ov::element::f32, "unsupported precision: " + exec_prc_.to_string()); using TReg = typename dnnl::impl::cpu::aarch64::cpu_isa_traits::TReg; @@ -556,7 +647,8 @@ void jit_ceiling_emitter::emit_isa(const std::vector &in_vec_idxs, const // Template method that generates actual instruction sequence for ceiling operation // Currently only supports 32-bit floating point (f32) -std::set> jit_ceiling_emitter::get_supported_precisions(const std::shared_ptr& node) { +std::set> jit_ceiling_emitter::get_supported_precisions( + const std::shared_ptr& node) { return {{element::f32}}; } @@ -564,19 +656,22 @@ std::set> jit_ceiling_emitter::get_supported_precisio jit_gelu_erf_emitter::jit_gelu_erf_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, const std::shared_ptr& node) - : jit_emitter(host, host_isa, node, get_arithmetic_binary_exec_precision(node)) { + : jit_emitter(host, host_isa, node, get_arithmetic_binary_exec_precision(node)) { prepare_table(); exp_emitter = std::make_unique(h, host_isa, node); } jit_gelu_erf_emitter::jit_gelu_erf_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, - const ov::element::Type exec_prc) : jit_emitter(host, host_isa, exec_prc) { + const ov::element::Type exec_prc) + : jit_emitter(host, host_isa, exec_prc) { prepare_table(); exp_emitter = std::make_unique(h, host_isa, exec_prc); } -size_t jit_gelu_erf_emitter::get_inputs_count() const { return 1; } +size_t jit_gelu_erf_emitter::get_inputs_count() const { + return 1; +} size_t jit_gelu_erf_emitter::get_aux_vecs_count() const { return std::max(exp_emitter->get_aux_vecs_count() + 3, 7); @@ -586,7 +681,8 @@ size_t jit_gelu_erf_emitter::get_aux_gprs_count() const { return exp_emitter->get_aux_gprs_count() + 1; } -void jit_gelu_erf_emitter::emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_gelu_erf_emitter::emit_impl(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { if (host_isa_ == dnnl::impl::cpu::aarch64::asimd) { emit_isa(in_vec_idxs, out_vec_idxs); } else { @@ -595,7 +691,8 @@ void jit_gelu_erf_emitter::emit_impl(const std::vector &in_vec_idxs, con } template -void jit_gelu_erf_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_gelu_erf_emitter::emit_isa(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { OV_CPU_JIT_EMITTER_ASSERT(exec_prc_ == ov::element::f32, "unsupported precision: " + exec_prc_.to_string()); using TReg = typename dnnl::impl::cpu::aarch64::cpu_isa_traits::TReg; @@ -628,11 +725,7 @@ void jit_gelu_erf_emitter::emit_isa(const std::vector &in_vec_idxs, cons h->fmul(vmm_aux.s, vmm_aux0.s, vmm_aux0.s); h->ld1r(vmm_aux2.s, table_val2("sign_mask")); h->orr(vmm_aux.b16, vmm_aux.b16, vmm_aux2.b16); - exp_emitter->emit_code( - { vmm_aux.getIdx() }, - { vmm_aux_dst.getIdx() }, - aux_vec_idxs, - aux_gpr_idxs); + exp_emitter->emit_code({vmm_aux.getIdx()}, {vmm_aux_dst.getIdx()}, aux_vec_idxs, aux_gpr_idxs); h->ld1r(vmm_aux2.s, table_val2("sign_mask")); // vmm_aux_dst = -exp(-x*x) h->orr(vmm_aux_dst.b16, vmm_aux_dst.b16, vmm_aux2.b16); @@ -678,11 +771,11 @@ void jit_gelu_erf_emitter::register_table_entries() { 
push_arg_entry_of("gelu_erf_one_over_sqrt_two", 0x3f3504f3, true); push_arg_entry_of("gelu_erf_one_over_sqrt_pi", 0x3f106eba, true); - push_arg_entry_of("erf_pol1", 0x3e827906, true); // p1 = 0.254829592f - push_arg_entry_of("erf_pol2", 0xbe91a98e, true); // p2 = -0.284496736f - push_arg_entry_of("erf_pol3", 0x3fb5f0e3, true); // p3 = 1.421413741f - push_arg_entry_of("erf_pol4", 0xbfba00e3, true); // p4 = -1.453152027f - push_arg_entry_of("erf_pol5", 0x3f87dc22, true); // p5 = 1.061405429f + push_arg_entry_of("erf_pol1", 0x3e827906, true); // p1 = 0.254829592f + push_arg_entry_of("erf_pol2", 0xbe91a98e, true); // p2 = -0.284496736f + push_arg_entry_of("erf_pol3", 0x3fb5f0e3, true); // p3 = 1.421413741f + push_arg_entry_of("erf_pol4", 0xbfba00e3, true); // p4 = -1.453152027f + push_arg_entry_of("erf_pol5", 0x3f87dc22, true); // p5 = 1.061405429f } void jit_gelu_erf_emitter::emit_data() const { @@ -690,7 +783,8 @@ void jit_gelu_erf_emitter::emit_data() const { exp_emitter->emit_data(); } -std::set> jit_gelu_erf_emitter::get_supported_precisions(const std::shared_ptr& node) { +std::set> jit_gelu_erf_emitter::get_supported_precisions( + const std::shared_ptr& node) { return {{element::f32}}; } @@ -698,19 +792,22 @@ std::set> jit_gelu_erf_emitter::get_supported_precisi jit_gelu_tanh_emitter::jit_gelu_tanh_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, const std::shared_ptr& node) - : jit_emitter(host, host_isa, node, get_arithmetic_binary_exec_precision(node)) { + : jit_emitter(host, host_isa, node, get_arithmetic_binary_exec_precision(node)) { prepare_table(); tanh_emitter = std::make_unique(h, host_isa, node); } jit_gelu_tanh_emitter::jit_gelu_tanh_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, - const ov::element::Type exec_prc) : jit_emitter(host, host_isa, exec_prc) { + const ov::element::Type exec_prc) + : jit_emitter(host, host_isa, exec_prc) { prepare_table(); tanh_emitter = std::make_unique(h, host_isa, exec_prc); } -size_t jit_gelu_tanh_emitter::get_inputs_count() const { return 1; } +size_t jit_gelu_tanh_emitter::get_inputs_count() const { + return 1; +} size_t jit_gelu_tanh_emitter::get_aux_vecs_count() const { return std::max(tanh_emitter->get_aux_vecs_count() + 2, 3); @@ -720,7 +817,8 @@ size_t jit_gelu_tanh_emitter::get_aux_gprs_count() const { return tanh_emitter->get_aux_gprs_count() + 1; } -void jit_gelu_tanh_emitter::emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_gelu_tanh_emitter::emit_impl(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { if (host_isa_ == dnnl::impl::cpu::aarch64::asimd) { emit_isa(in_vec_idxs, out_vec_idxs); } else { @@ -729,7 +827,8 @@ void jit_gelu_tanh_emitter::emit_impl(const std::vector &in_vec_idxs, co } template -void jit_gelu_tanh_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_gelu_tanh_emitter::emit_isa(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { OV_CPU_JIT_EMITTER_ASSERT(exec_prc_ == ov::element::f32, "unsupported precision: " + exec_prc_.to_string()); using TReg = typename dnnl::impl::cpu::aarch64::cpu_isa_traits::TReg; @@ -749,11 +848,7 @@ void jit_gelu_tanh_emitter::emit_isa(const std::vector &in_vec_idxs, con h->ld1r(vmm_aux1.s, table_val2("gelu_tanh_sqrt_two_over_pi")); h->fmul(vmm_aux0.s, vmm_aux1.s, vmm_aux2.s); - tanh_emitter->emit_code( - { vmm_aux0.getIdx() }, - { 
vmm_aux2.getIdx() }, - aux_vec_idxs, - aux_gpr_idxs); + tanh_emitter->emit_code({vmm_aux0.getIdx()}, {vmm_aux2.getIdx()}, aux_vec_idxs, aux_gpr_idxs); // compute 0.5 * x * (1 + tanh(G(x))) h->ld1r(vmm_aux1.s, table_val2("one")); @@ -776,7 +871,8 @@ void jit_gelu_tanh_emitter::emit_data() const { tanh_emitter->emit_data(); } -std::set> jit_gelu_tanh_emitter::get_supported_precisions(const std::shared_ptr& node) { +std::set> jit_gelu_tanh_emitter::get_supported_precisions( + const std::shared_ptr& node) { return {{element::f32}}; } @@ -784,7 +880,7 @@ std::set> jit_gelu_tanh_emitter::get_supported_precis jit_greater_emitter::jit_greater_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, const std::shared_ptr& node) - : jit_emitter(host, host_isa, node, get_arithmetic_binary_exec_precision(node)) { + : jit_emitter(host, host_isa, node, get_arithmetic_binary_exec_precision(node)) { prepare_table(); } @@ -795,13 +891,20 @@ jit_greater_emitter::jit_greater_emitter(dnnl::impl::cpu::aarch64::jit_generator prepare_table(); } -size_t jit_greater_emitter::get_inputs_count() const { return 2; } +size_t jit_greater_emitter::get_inputs_count() const { + return 2; +} -size_t jit_greater_emitter::get_aux_vecs_count() const { return 1; } +size_t jit_greater_emitter::get_aux_vecs_count() const { + return 1; +} -size_t jit_greater_emitter::get_aux_gprs_count() const { return 1; } +size_t jit_greater_emitter::get_aux_gprs_count() const { + return 1; +} -void jit_greater_emitter::emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_greater_emitter::emit_impl(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { if (host_isa_ == dnnl::impl::cpu::aarch64::asimd) { emit_isa(in_vec_idxs, out_vec_idxs); } else { @@ -810,7 +913,8 @@ void jit_greater_emitter::emit_impl(const std::vector &in_vec_idxs, cons } template -void jit_greater_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_greater_emitter::emit_isa(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { OV_CPU_JIT_EMITTER_ASSERT(exec_prc_ == ov::element::f32, "unsupported precision: " + exec_prc_.to_string()); using TReg = typename dnnl::impl::cpu::aarch64::cpu_isa_traits::TReg; @@ -828,7 +932,8 @@ void jit_greater_emitter::register_table_entries() { push_arg_entry_of("one", 0x3f800000, true); } -std::set> jit_greater_emitter::get_supported_precisions(const std::shared_ptr& node) { +std::set> jit_greater_emitter::get_supported_precisions( + const std::shared_ptr& node) { return {{element::f32, element::f32}}; } @@ -836,7 +941,7 @@ std::set> jit_greater_emitter::get_supported_precisio jit_greater_equal_emitter::jit_greater_equal_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, const std::shared_ptr& node) - : jit_emitter(host, host_isa, node, get_arithmetic_binary_exec_precision(node)) { + : jit_emitter(host, host_isa, node, get_arithmetic_binary_exec_precision(node)) { prepare_table(); } @@ -847,13 +952,20 @@ jit_greater_equal_emitter::jit_greater_equal_emitter(dnnl::impl::cpu::aarch64::j prepare_table(); } -size_t jit_greater_equal_emitter::get_inputs_count() const { return 2; } +size_t jit_greater_equal_emitter::get_inputs_count() const { + return 2; +} -size_t jit_greater_equal_emitter::get_aux_vecs_count() const { return 1; } +size_t jit_greater_equal_emitter::get_aux_vecs_count() const { + return 1; +} -size_t 
jit_greater_equal_emitter::get_aux_gprs_count() const { return 1; } +size_t jit_greater_equal_emitter::get_aux_gprs_count() const { + return 1; +} -void jit_greater_equal_emitter::emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_greater_equal_emitter::emit_impl(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { if (host_isa_ == dnnl::impl::cpu::aarch64::asimd) { emit_isa(in_vec_idxs, out_vec_idxs); } else { @@ -862,7 +974,8 @@ void jit_greater_equal_emitter::emit_impl(const std::vector &in_vec_idxs } template -void jit_greater_equal_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_greater_equal_emitter::emit_isa(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { OV_CPU_JIT_EMITTER_ASSERT(exec_prc_ == ov::element::f32, "unsupported precision: " + exec_prc_.to_string()); using TReg = typename dnnl::impl::cpu::aarch64::cpu_isa_traits::TReg; @@ -880,31 +993,40 @@ void jit_greater_equal_emitter::register_table_entries() { push_arg_entry_of("one", 0x3f800000, true); } -std::set> jit_greater_equal_emitter::get_supported_precisions(const std::shared_ptr& node) { +std::set> jit_greater_equal_emitter::get_supported_precisions( + const std::shared_ptr& node) { return {{element::f32, element::f32}}; } /// HARD_SWISH /// jit_hswish_emitter::jit_hswish_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, - dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, - const std::shared_ptr& node) - : jit_emitter(host, host_isa, node, get_arithmetic_binary_exec_precision(node)) { + dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, + const std::shared_ptr& node) + : jit_emitter(host, host_isa, node, get_arithmetic_binary_exec_precision(node)) { prepare_table(); } jit_hswish_emitter::jit_hswish_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, - dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, - const ov::element::Type exec_prc) : jit_emitter(host, host_isa, exec_prc) { + dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, + const ov::element::Type exec_prc) + : jit_emitter(host, host_isa, exec_prc) { prepare_table(); } -size_t jit_hswish_emitter::get_inputs_count() const { return 1; } +size_t jit_hswish_emitter::get_inputs_count() const { + return 1; +} -size_t jit_hswish_emitter::get_aux_vecs_count() const { return 2; } +size_t jit_hswish_emitter::get_aux_vecs_count() const { + return 2; +} -size_t jit_hswish_emitter::get_aux_gprs_count() const { return 1; } +size_t jit_hswish_emitter::get_aux_gprs_count() const { + return 1; +} -void jit_hswish_emitter::emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_hswish_emitter::emit_impl(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { if (host_isa_ == dnnl::impl::cpu::aarch64::asimd) { emit_isa(in_vec_idxs, out_vec_idxs); } else { @@ -913,7 +1035,8 @@ void jit_hswish_emitter::emit_impl(const std::vector &in_vec_idxs, const } template -void jit_hswish_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_hswish_emitter::emit_isa(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { OV_CPU_JIT_EMITTER_ASSERT(exec_prc_ == ov::element::f32, "unsupported precision: " + exec_prc_.to_string()); using TReg = typename dnnl::impl::cpu::aarch64::cpu_isa_traits::TReg; @@ -938,18 +1061,19 @@ void jit_hswish_emitter::register_table_entries() { push_arg_entry_of("zero", 0x00000000, true); push_arg_entry_of("three", 
0x40400000, true); push_arg_entry_of("six", 0x40c00000, true); - push_arg_entry_of("one_sixth", dnnl::impl::float2int(1.f/6.f), true); + push_arg_entry_of("one_sixth", dnnl::impl::float2int(1.f / 6.f), true); } -std::set> jit_hswish_emitter::get_supported_precisions(const std::shared_ptr& node) { +std::set> jit_hswish_emitter::get_supported_precisions( + const std::shared_ptr& node) { return {{element::f32}}; } /// IS_FINITE /// jit_is_finite_emitter::jit_is_finite_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, - dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, - const std::shared_ptr& node) - : jit_emitter(host, host_isa, node, get_arithmetic_binary_exec_precision(node)) { + dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, + const std::shared_ptr& node) + : jit_emitter(host, host_isa, node, get_arithmetic_binary_exec_precision(node)) { auto isNaN = ov::as_type_ptr(node); if (isNaN == nullptr) { OV_CPU_JIT_EMITTER_THROW("Can't cast to ov::op::v10::IsNaN"); @@ -959,23 +1083,31 @@ jit_is_finite_emitter::jit_is_finite_emitter(dnnl::impl::cpu::aarch64::jit_gener } jit_is_finite_emitter::jit_is_finite_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, - dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, - const ov::element::Type exec_prc) - : jit_emitter(host, host_isa, exec_prc) { + dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, + const ov::element::Type exec_prc) + : jit_emitter(host, host_isa, exec_prc) { prepare_table(); } -size_t jit_is_finite_emitter::get_inputs_count() const { return 1; } +size_t jit_is_finite_emitter::get_inputs_count() const { + return 1; +} -size_t jit_is_finite_emitter::get_aux_vecs_count() const { return 2; } +size_t jit_is_finite_emitter::get_aux_vecs_count() const { + return 2; +} -size_t jit_is_finite_emitter::get_aux_gprs_count() const { return 1; } +size_t jit_is_finite_emitter::get_aux_gprs_count() const { + return 1; +} -std::set> jit_is_finite_emitter::get_supported_precisions(const std::shared_ptr& node) { +std::set> jit_is_finite_emitter::get_supported_precisions( + const std::shared_ptr& node) { return {{element::f32}}; } -void jit_is_finite_emitter::emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const { +void jit_is_finite_emitter::emit_impl(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { if (host_isa_ == dnnl::impl::cpu::aarch64::asimd) { emit_isa(in_vec_idxs, out_vec_idxs); } else { @@ -984,7 +1116,8 @@ void jit_is_finite_emitter::emit_impl(const std::vector& in_vec_idxs, co } template -void jit_is_finite_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_is_finite_emitter::emit_isa(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { OV_CPU_JIT_EMITTER_ASSERT(exec_prc_ == ov::element::f32, "unsupported precision: " + exec_prc_.to_string()); using TReg = typename dnnl::impl::cpu::aarch64::cpu_isa_traits::TReg; @@ -994,7 +1127,8 @@ void jit_is_finite_emitter::emit_isa(const std::vector &in_vec_idxs, con TReg aux0 = TReg(aux_vec_idxs[0]); TReg aux1 = TReg(aux_vec_idxs[1]); - // According to the IEEE standard, NaN values have the odd property that comparisons involving them are always false. + // According to the IEEE standard, NaN values have the odd property that comparisons involving them are always + // false. 
h->fcmeq(aux0.s, src.s, src.s); h->not_(aux0.b16, aux0.b16); @@ -1023,7 +1157,6 @@ jit_is_inf_emitter::jit_is_inf_emitter(dnnl::impl::cpu::aarch64::jit_generator* dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, const std::shared_ptr& node) : jit_emitter(host, host_isa, node, get_arithmetic_binary_exec_precision(node)) { - auto isInf = ov::as_type_ptr(node); if (isInf == nullptr) { OV_CPU_JIT_EMITTER_THROW("Can't cast to ov::op::v10::IsInf"); @@ -1118,9 +1251,9 @@ void jit_is_inf_emitter::register_table_entries() { /// IS_NAN /// jit_is_nan_emitter::jit_is_nan_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, - dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, - const std::shared_ptr& node) - : jit_emitter(host, host_isa, node, get_arithmetic_binary_exec_precision(node)) { + dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, + const std::shared_ptr& node) + : jit_emitter(host, host_isa, node, get_arithmetic_binary_exec_precision(node)) { auto isNaN = ov::as_type_ptr(node); if (isNaN == nullptr) { OV_CPU_JIT_EMITTER_THROW("Can't cast to ov::op::v10::IsNaN"); @@ -1130,23 +1263,31 @@ jit_is_nan_emitter::jit_is_nan_emitter(dnnl::impl::cpu::aarch64::jit_generator* } jit_is_nan_emitter::jit_is_nan_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, - dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, - const ov::element::Type exec_prc) - : jit_emitter(host, host_isa, exec_prc) { + dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, + const ov::element::Type exec_prc) + : jit_emitter(host, host_isa, exec_prc) { prepare_table(); } -size_t jit_is_nan_emitter::get_inputs_count() const { return 1; } +size_t jit_is_nan_emitter::get_inputs_count() const { + return 1; +} -size_t jit_is_nan_emitter::get_aux_vecs_count() const { return 1; } +size_t jit_is_nan_emitter::get_aux_vecs_count() const { + return 1; +} -size_t jit_is_nan_emitter::get_aux_gprs_count() const { return 1; } +size_t jit_is_nan_emitter::get_aux_gprs_count() const { + return 1; +} -std::set> jit_is_nan_emitter::get_supported_precisions(const std::shared_ptr& node) { +std::set> jit_is_nan_emitter::get_supported_precisions( + const std::shared_ptr& node) { return {{element::f32}}; } -void jit_is_nan_emitter::emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const { +void jit_is_nan_emitter::emit_impl(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { if (host_isa_ == dnnl::impl::cpu::aarch64::asimd) { emit_isa(in_vec_idxs, out_vec_idxs); } else { @@ -1155,7 +1296,8 @@ void jit_is_nan_emitter::emit_impl(const std::vector& in_vec_idxs, const } template -void jit_is_nan_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_is_nan_emitter::emit_isa(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { OV_CPU_JIT_EMITTER_ASSERT(exec_prc_ == ov::element::f32, "unsupported precision: " + exec_prc_.to_string()); using TReg = typename dnnl::impl::cpu::aarch64::cpu_isa_traits::TReg; @@ -1164,7 +1306,8 @@ void jit_is_nan_emitter::emit_isa(const std::vector &in_vec_idxs, const TReg dst = TReg(out_vec_idxs[0]); TReg aux = TReg(aux_vec_idxs[0]); - // According to the IEEE standard, NaN values have the odd property that comparisons involving them are always false. + // According to the IEEE standard, NaN values have the odd property that comparisons involving them are always + // false. 
h->fcmeq(dst.s, src.s, src.s); h->ld1r(aux.s, table_val2("zero")); h->fcmeq(dst.s, dst.s, aux.s); @@ -1183,7 +1326,7 @@ void jit_is_nan_emitter::register_table_entries() { jit_less_equal_emitter::jit_less_equal_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, const std::shared_ptr& node) - : jit_emitter(host, host_isa, node, get_arithmetic_binary_exec_precision(node)) { + : jit_emitter(host, host_isa, node, get_arithmetic_binary_exec_precision(node)) { prepare_table(); } @@ -1194,13 +1337,20 @@ jit_less_equal_emitter::jit_less_equal_emitter(dnnl::impl::cpu::aarch64::jit_gen prepare_table(); } -size_t jit_less_equal_emitter::get_inputs_count() const { return 2; } +size_t jit_less_equal_emitter::get_inputs_count() const { + return 2; +} -size_t jit_less_equal_emitter::get_aux_vecs_count() const { return 1; } +size_t jit_less_equal_emitter::get_aux_vecs_count() const { + return 1; +} -size_t jit_less_equal_emitter::get_aux_gprs_count() const { return 1; } +size_t jit_less_equal_emitter::get_aux_gprs_count() const { + return 1; +} -void jit_less_equal_emitter::emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_less_equal_emitter::emit_impl(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { if (host_isa_ == dnnl::impl::cpu::aarch64::asimd) { emit_isa(in_vec_idxs, out_vec_idxs); } else { @@ -1209,7 +1359,8 @@ void jit_less_equal_emitter::emit_impl(const std::vector &in_vec_idxs, c } template -void jit_less_equal_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_less_equal_emitter::emit_isa(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { OV_CPU_JIT_EMITTER_ASSERT(exec_prc_ == ov::element::f32, "unsupported precision: " + exec_prc_.to_string()); using TReg = typename dnnl::impl::cpu::aarch64::cpu_isa_traits::TReg; @@ -1228,7 +1379,8 @@ void jit_less_equal_emitter::register_table_entries() { push_arg_entry_of("one", 0x3f800000, true); } -std::set> jit_less_equal_emitter::get_supported_precisions(const std::shared_ptr& node) { +std::set> jit_less_equal_emitter::get_supported_precisions( + const std::shared_ptr& node) { return {{element::f32, element::f32}}; } @@ -1236,7 +1388,7 @@ std::set> jit_less_equal_emitter::get_supported_preci jit_logical_and_emitter::jit_logical_and_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, const std::shared_ptr& node) - : jit_emitter(host, host_isa, node, get_arithmetic_binary_exec_precision(node)) { + : jit_emitter(host, host_isa, node, get_arithmetic_binary_exec_precision(node)) { prepare_table(); } @@ -1247,13 +1399,20 @@ jit_logical_and_emitter::jit_logical_and_emitter(dnnl::impl::cpu::aarch64::jit_g prepare_table(); } -size_t jit_logical_and_emitter::get_inputs_count() const { return 2; } +size_t jit_logical_and_emitter::get_inputs_count() const { + return 2; +} -size_t jit_logical_and_emitter::get_aux_vecs_count() const { return 1; } +size_t jit_logical_and_emitter::get_aux_vecs_count() const { + return 1; +} -size_t jit_logical_and_emitter::get_aux_gprs_count() const { return 1; } +size_t jit_logical_and_emitter::get_aux_gprs_count() const { + return 1; +} -void jit_logical_and_emitter::emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_logical_and_emitter::emit_impl(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { if (host_isa_ == 
dnnl::impl::cpu::aarch64::asimd) { emit_isa(in_vec_idxs, out_vec_idxs); } else { @@ -1262,7 +1421,8 @@ void jit_logical_and_emitter::emit_impl(const std::vector &in_vec_idxs, } template -void jit_logical_and_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_logical_and_emitter::emit_isa(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { OV_CPU_JIT_EMITTER_ASSERT(exec_prc_ == ov::element::f32, "unsupported precision: " + exec_prc_.to_string()); using TReg = typename dnnl::impl::cpu::aarch64::cpu_isa_traits::TReg; @@ -1280,24 +1440,86 @@ void jit_logical_and_emitter::register_table_entries() { push_arg_entry_of("one", 0x3f800000, true); } -std::set> jit_logical_and_emitter::get_supported_precisions(const std::shared_ptr& node) { +std::set> jit_logical_and_emitter::get_supported_precisions( + const std::shared_ptr& node) { return {{element::f32, element::f32}}; } -/// LOGICAL_NOT /// -jit_logical_not_emitter::jit_logical_not_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, +/// LOGICAL_OR /// +jit_logical_or_emitter::jit_logical_or_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, const std::shared_ptr& node) : jit_emitter(host, host_isa, node, get_arithmetic_binary_exec_precision(node)) { - prepare_table(); - } + prepare_table(); +} -jit_logical_not_emitter::jit_logical_not_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, +jit_logical_or_emitter::jit_logical_or_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, const ov::element::Type exec_prc) : jit_emitter(host, host_isa, exec_prc) { - prepare_table(); + prepare_table(); +} + +size_t jit_logical_or_emitter::get_inputs_count() const { + return 2; +} + +size_t jit_logical_or_emitter::get_aux_vecs_count() const { + return 1; +} + +size_t jit_logical_or_emitter::get_aux_gprs_count() const { + return 1; +} + +void jit_logical_or_emitter::emit_impl(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { + if (host_isa_ == dnnl::impl::cpu::aarch64::asimd) { + emit_isa(in_vec_idxs, out_vec_idxs); + } else { + OV_CPU_JIT_EMITTER_THROW("Can't create jit eltwise kernel"); } +} + +template +void jit_logical_or_emitter::emit_isa(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { + OV_CPU_JIT_EMITTER_ASSERT(exec_prc_ == ov::element::f32, "unsupported precision: " + exec_prc_.to_string()); + + using TReg = typename dnnl::impl::cpu::aarch64::cpu_isa_traits::TReg; + const TReg src1 = TReg(in_vec_idxs[0]); + const TReg src2 = TReg(in_vec_idxs[1]); + const TReg dst = TReg(out_vec_idxs[0]); + const TReg aux = TReg(aux_vec_idxs[0]); + + h->orr(dst.b16, src1.b16, src2.b16); + h->ld1r(aux.s, table_val2("one")); + h->and_(dst.b16, dst.b16, aux.b16); +} + +void jit_logical_or_emitter::register_table_entries() { + push_arg_entry_of("one", 0x3f800000, true); +} + +std::set> jit_logical_or_emitter::get_supported_precisions( + const std::shared_ptr& node) { + return {{element::f32, element::f32}}; +} + +/// LOGICAL_NOT /// +jit_logical_not_emitter::jit_logical_not_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, + dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, + const std::shared_ptr& node) + : jit_emitter(host, host_isa, node, get_arithmetic_binary_exec_precision(node)) { + prepare_table(); +} + +jit_logical_not_emitter::jit_logical_not_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, + dnnl::impl::cpu::aarch64::cpu_isa_t 
host_isa, + const ov::element::Type exec_prc) + : jit_emitter(host, host_isa, exec_prc) { + prepare_table(); +} size_t jit_logical_not_emitter::get_inputs_count() const { return 1; @@ -1312,7 +1534,7 @@ size_t jit_logical_not_emitter::get_aux_gprs_count() const { } void jit_logical_not_emitter::emit_impl(const std::vector& in_vec_idxs, - const std::vector& out_vec_idxs) const { + const std::vector& out_vec_idxs) const { if (host_isa_ == dnnl::impl::cpu::aarch64::asimd) { emit_isa(in_vec_idxs, out_vec_idxs); } else { @@ -1322,7 +1544,7 @@ void jit_logical_not_emitter::emit_impl(const std::vector& in_vec_idxs, template void jit_logical_not_emitter::emit_isa(const std::vector& in_vec_idxs, - const std::vector& out_vec_idxs) const { + const std::vector& out_vec_idxs) const { OV_CPU_JIT_EMITTER_ASSERT(exec_prc_ == ov::element::f32, "unsupported precision: " + exec_prc_.to_string()); using TReg = typename dnnl::impl::cpu::aarch64::cpu_isa_traits::TReg; @@ -1350,7 +1572,7 @@ std::set> jit_logical_not_emitter::get_supported_prec jit_logical_xor_emitter::jit_logical_xor_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, const std::shared_ptr& node) - : jit_emitter(host, host_isa, node, get_arithmetic_binary_exec_precision(node)) { + : jit_emitter(host, host_isa, node, get_arithmetic_binary_exec_precision(node)) { prepare_table(); } @@ -1361,13 +1583,20 @@ jit_logical_xor_emitter::jit_logical_xor_emitter(dnnl::impl::cpu::aarch64::jit_g prepare_table(); } -size_t jit_logical_xor_emitter::get_inputs_count() const { return 2; } +size_t jit_logical_xor_emitter::get_inputs_count() const { + return 2; +} -size_t jit_logical_xor_emitter::get_aux_vecs_count() const { return 1; } +size_t jit_logical_xor_emitter::get_aux_vecs_count() const { + return 1; +} -size_t jit_logical_xor_emitter::get_aux_gprs_count() const { return 1; } +size_t jit_logical_xor_emitter::get_aux_gprs_count() const { + return 1; +} -void jit_logical_xor_emitter::emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_logical_xor_emitter::emit_impl(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { if (host_isa_ == dnnl::impl::cpu::aarch64::asimd) { emit_isa(in_vec_idxs, out_vec_idxs); } else { @@ -1376,7 +1605,8 @@ void jit_logical_xor_emitter::emit_impl(const std::vector &in_vec_idxs, } template -void jit_logical_xor_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_logical_xor_emitter::emit_isa(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { OV_CPU_JIT_EMITTER_ASSERT(exec_prc_ == ov::element::f32, "unsupported precision: " + exec_prc_.to_string()); using TReg = typename dnnl::impl::cpu::aarch64::cpu_isa_traits::TReg; @@ -1394,7 +1624,8 @@ void jit_logical_xor_emitter::register_table_entries() { push_arg_entry_of("one", 0x3f800000, true); } -std::set> jit_logical_xor_emitter::get_supported_precisions(const std::shared_ptr& node) { +std::set> jit_logical_xor_emitter::get_supported_precisions( + const std::shared_ptr& node) { return {{element::f32, element::f32}}; } @@ -1402,17 +1633,19 @@ std::set> jit_logical_xor_emitter::get_supported_prec jit_maximum_emitter::jit_maximum_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, const std::shared_ptr& node) - : jit_emitter(host, host_isa, node, get_arithmetic_binary_exec_precision(node)) { -} + : jit_emitter(host, host_isa, node, 
get_arithmetic_binary_exec_precision(node)) {} jit_maximum_emitter::jit_maximum_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, - const ov::element::Type exec_prc) : jit_emitter(host, host_isa, exec_prc) { -} + const ov::element::Type exec_prc) + : jit_emitter(host, host_isa, exec_prc) {} -size_t jit_maximum_emitter::get_inputs_count() const { return 2; } +size_t jit_maximum_emitter::get_inputs_count() const { + return 2; +} -void jit_maximum_emitter::emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_maximum_emitter::emit_impl(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { if (host_isa_ == dnnl::impl::cpu::aarch64::asimd) { emit_isa(in_vec_idxs, out_vec_idxs); } else { @@ -1421,7 +1654,8 @@ void jit_maximum_emitter::emit_impl(const std::vector &in_vec_idxs, cons } template -void jit_maximum_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_maximum_emitter::emit_isa(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { OV_CPU_JIT_EMITTER_ASSERT(exec_prc_ == ov::element::f32, "unsupported precision: " + exec_prc_.to_string()); using TReg = typename dnnl::impl::cpu::aarch64::cpu_isa_traits::TReg; @@ -1432,7 +1666,8 @@ void jit_maximum_emitter::emit_isa(const std::vector &in_vec_idxs, const h->fmaxnm(dst.s, src1.s, src2.s); } -std::set> jit_maximum_emitter::get_supported_precisions(const std::shared_ptr& node) { +std::set> jit_maximum_emitter::get_supported_precisions( + const std::shared_ptr& node) { return {{element::f32, element::f32}}; } @@ -1440,17 +1675,19 @@ std::set> jit_maximum_emitter::get_supported_precisio jit_minimum_emitter::jit_minimum_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, const std::shared_ptr& node) - : jit_emitter(host, host_isa, node, get_arithmetic_binary_exec_precision(node)) { -} + : jit_emitter(host, host_isa, node, get_arithmetic_binary_exec_precision(node)) {} jit_minimum_emitter::jit_minimum_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, - const ov::element::Type exec_prc) : jit_emitter(host, host_isa, exec_prc) { -} + const ov::element::Type exec_prc) + : jit_emitter(host, host_isa, exec_prc) {} -size_t jit_minimum_emitter::get_inputs_count() const { return 2; } +size_t jit_minimum_emitter::get_inputs_count() const { + return 2; +} -void jit_minimum_emitter::emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_minimum_emitter::emit_impl(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { if (host_isa_ == dnnl::impl::cpu::aarch64::asimd) { emit_isa(in_vec_idxs, out_vec_idxs); } else { @@ -1459,7 +1696,8 @@ void jit_minimum_emitter::emit_impl(const std::vector &in_vec_idxs, cons } template -void jit_minimum_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_minimum_emitter::emit_isa(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { OV_CPU_JIT_EMITTER_ASSERT(exec_prc_ == ov::element::f32, "unsupported precision: " + exec_prc_.to_string()); using TReg = typename dnnl::impl::cpu::aarch64::cpu_isa_traits::TReg; @@ -1470,7 +1708,8 @@ void jit_minimum_emitter::emit_isa(const std::vector &in_vec_idxs, const h->fminnm(dst.s, src1.s, src2.s); } -std::set> jit_minimum_emitter::get_supported_precisions(const std::shared_ptr& node) { 
+std::set> jit_minimum_emitter::get_supported_precisions( + const std::shared_ptr& node) { return {{element::f32, element::f32}}; } @@ -1478,19 +1717,22 @@ std::set> jit_minimum_emitter::get_supported_precisio jit_mish_emitter::jit_mish_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, const std::shared_ptr& node) - : jit_emitter(host, host_isa, node, get_arithmetic_binary_exec_precision(node)) { + : jit_emitter(host, host_isa, node, get_arithmetic_binary_exec_precision(node)) { prepare_table(); exp_emitter = std::make_unique(h, host_isa, node); } jit_mish_emitter::jit_mish_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, - const ov::element::Type exec_prc) : jit_emitter(host, host_isa, exec_prc) { + const ov::element::Type exec_prc) + : jit_emitter(host, host_isa, exec_prc) { prepare_table(); exp_emitter = std::make_unique(h, host_isa, exec_prc); } -size_t jit_mish_emitter::get_inputs_count() const { return 1; } +size_t jit_mish_emitter::get_inputs_count() const { + return 1; +} size_t jit_mish_emitter::get_aux_vecs_count() const { return std::max(exp_emitter->get_aux_vecs_count() + 1, 2); @@ -1500,7 +1742,8 @@ size_t jit_mish_emitter::get_aux_gprs_count() const { return exp_emitter->get_aux_gprs_count() + 1; } -void jit_mish_emitter::emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_mish_emitter::emit_impl(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { if (host_isa_ == dnnl::impl::cpu::aarch64::asimd) { emit_isa(in_vec_idxs, out_vec_idxs); } else { @@ -1509,7 +1752,7 @@ void jit_mish_emitter::emit_impl(const std::vector &in_vec_idxs, const s } template -void jit_mish_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_mish_emitter::emit_isa(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const { OV_CPU_JIT_EMITTER_ASSERT(exec_prc_ == ov::element::f32, "unsupported precision: " + exec_prc_.to_string()); // An equation other than mish(x) = x*tanh(srelu(x)) was used @@ -1531,11 +1774,7 @@ void jit_mish_emitter::emit_isa(const std::vector &in_vec_idxs, const st h->ld1r(vmm_aux0.s, table_val2("fwd_mish_max_x_for_equation_f")); h->fminnm(vmm_aux2.s, vmm_src.s, vmm_aux0.s); - exp_emitter->emit_code( - { vmm_aux2.getIdx() }, - { vmm_aux2.getIdx() }, - aux_vec_idxs, - aux_gpr_idxs); + exp_emitter->emit_code({vmm_aux2.getIdx()}, {vmm_aux2.getIdx()}, aux_vec_idxs, aux_gpr_idxs); // (e^x+1)^2 h->fmov(vmm_aux0.s, 1.f); @@ -1568,22 +1807,25 @@ std::set> jit_mish_emitter::get_supported_precisions( } /// MOD /// -jit_mod_emitter::jit_mod_emitter(dnnl::impl::cpu::aarch64::jit_generator *host, +jit_mod_emitter::jit_mod_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, const std::shared_ptr& node) - : jit_emitter(host, host_isa, node, get_arithmetic_binary_exec_precision(node)) { -} + : jit_emitter(host, host_isa, node, get_arithmetic_binary_exec_precision(node)) {} -jit_mod_emitter::jit_mod_emitter(dnnl::impl::cpu::aarch64::jit_generator *host, - dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, - const ov::element::Type exec_prc): jit_emitter(host, host_isa, exec_prc) { -} +jit_mod_emitter::jit_mod_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, + dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, + const ov::element::Type exec_prc) + : jit_emitter(host, host_isa, exec_prc) {} -size_t jit_mod_emitter::get_inputs_count() 
const { return 2; } +size_t jit_mod_emitter::get_inputs_count() const { + return 2; +} -size_t jit_mod_emitter::get_aux_vecs_count() const { return 1; } +size_t jit_mod_emitter::get_aux_vecs_count() const { + return 1; +} -void jit_mod_emitter::emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_mod_emitter::emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const { if (host_isa_ == dnnl::impl::cpu::aarch64::asimd) { emit_isa(in_vec_idxs, out_vec_idxs); } else { @@ -1592,7 +1834,7 @@ void jit_mod_emitter::emit_impl(const std::vector &in_vec_idxs, const st } template -void jit_mod_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_mod_emitter::emit_isa(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const { OV_CPU_JIT_EMITTER_ASSERT(exec_prc_ == ov::element::f32, "unsupported precision: " + exec_prc_.to_string()); using TReg = typename dnnl::impl::cpu::aarch64::cpu_isa_traits::TReg; @@ -1616,20 +1858,23 @@ std::set> jit_mod_emitter::get_supported_precisions(c jit_mul_add_emitter::jit_mul_add_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, const std::shared_ptr& node) - : jit_emitter(host, host_isa, node, get_arithmetic_binary_exec_precision(node)) { -} + : jit_emitter(host, host_isa, node, get_arithmetic_binary_exec_precision(node)) {} -jit_mul_add_emitter::jit_mul_add_emitter(dnnl::impl::cpu::aarch64::jit_generator *host, +jit_mul_add_emitter::jit_mul_add_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, const ov::element::Type exec_prc) - : jit_emitter(host, host_isa, exec_prc) { -} + : jit_emitter(host, host_isa, exec_prc) {} -size_t jit_mul_add_emitter::get_inputs_count() const { return 3; } +size_t jit_mul_add_emitter::get_inputs_count() const { + return 3; +} -size_t jit_mul_add_emitter::get_aux_vecs_count() const { return 1; } +size_t jit_mul_add_emitter::get_aux_vecs_count() const { + return 1; +} -void jit_mul_add_emitter::emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_mul_add_emitter::emit_impl(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { if (host_isa_ == dnnl::impl::cpu::aarch64::asimd) { emit_isa(in_vec_idxs, out_vec_idxs); } else { @@ -1638,7 +1883,8 @@ void jit_mul_add_emitter::emit_impl(const std::vector &in_vec_idxs, cons } template -void jit_mul_add_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_mul_add_emitter::emit_isa(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { OV_CPU_JIT_EMITTER_ASSERT(exec_prc_ == ov::element::f32, "unsupported precision: " + exec_prc_.to_string()); using TReg = typename dnnl::impl::cpu::aarch64::cpu_isa_traits::TReg; @@ -1668,24 +1914,28 @@ void jit_mul_add_emitter::emit_isa(const std::vector &in_vec_idxs, const h->fmla(dst.s, mul0.s, mul1.s); } -std::set> jit_mul_add_emitter::get_supported_precisions(const std::shared_ptr& node) { +std::set> jit_mul_add_emitter::get_supported_precisions( + const std::shared_ptr& node) { return {{element::f32, element::f32, element::f32}}; } /// MULTIPLY /// -jit_multiply_emitter::jit_multiply_emitter(dnnl::impl::cpu::aarch64::jit_generator *host, +jit_multiply_emitter::jit_multiply_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, const std::shared_ptr& node) - : 
jit_emitter(host, host_isa, node, get_arithmetic_binary_exec_precision(node)) {} + : jit_emitter(host, host_isa, node, get_arithmetic_binary_exec_precision(node)) {} -jit_multiply_emitter::jit_multiply_emitter(dnnl::impl::cpu::aarch64::jit_generator *host, +jit_multiply_emitter::jit_multiply_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, const ov::element::Type exec_prc) - : jit_emitter(host, host_isa, exec_prc) {} + : jit_emitter(host, host_isa, exec_prc) {} -size_t jit_multiply_emitter::get_inputs_count() const { return 2; } +size_t jit_multiply_emitter::get_inputs_count() const { + return 2; +} -void jit_multiply_emitter::emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_multiply_emitter::emit_impl(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { if (host_isa_ == dnnl::impl::cpu::aarch64::asimd) { emit_isa(in_vec_idxs, out_vec_idxs); } else { @@ -1694,7 +1944,8 @@ void jit_multiply_emitter::emit_impl(const std::vector &in_vec_idxs, con } template -void jit_multiply_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_multiply_emitter::emit_isa(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { OV_CPU_JIT_EMITTER_ASSERT(exec_prc_ == ov::element::f32, "unsupported precision: " + exec_prc_.to_string()); using TReg = typename dnnl::impl::cpu::aarch64::cpu_isa_traits::TReg; @@ -1705,16 +1956,17 @@ void jit_multiply_emitter::emit_isa(const std::vector &in_vec_idxs, cons h->uni_fmul(dst.s, src0.s, src1.s); } -std::set> jit_multiply_emitter::get_supported_precisions(const std::shared_ptr& node) { +std::set> jit_multiply_emitter::get_supported_precisions( + const std::shared_ptr& node) { return {{element::f32, element::f32}}; } /// POWER /// -jit_power_static_emitter::jit_power_static_emitter(dnnl::impl::cpu::aarch64::jit_generator *host, +jit_power_static_emitter::jit_power_static_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, const std::shared_ptr& node, const ov::element::Type exec_prc) - : jit_emitter(host, host_isa, node, exec_prc) { + : jit_emitter(host, host_isa, node, exec_prc) { auto powerStaticNode = ov::as_type_ptr(node); if (powerStaticNode == nullptr) { OV_CPU_JIT_EMITTER_THROW("Can't cast to snippets::op::PowerStatic"); @@ -1727,24 +1979,30 @@ jit_power_static_emitter::jit_power_static_emitter(dnnl::impl::cpu::aarch64::jit prepare_table(); } -jit_power_static_emitter::jit_power_static_emitter(dnnl::impl::cpu::aarch64::jit_generator *host, +jit_power_static_emitter::jit_power_static_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, const float power, const float scale, const float shift, const ov::element::Type exec_prc) - : jit_emitter(host, host_isa, exec_prc), - power(power), - scale(scale), - shift(shift) { + : jit_emitter(host, host_isa, exec_prc), + power(power), + scale(scale), + shift(shift) { prepare_table(); } -size_t jit_power_static_emitter::get_inputs_count() const { return 1; } +size_t jit_power_static_emitter::get_inputs_count() const { + return 1; +} -size_t jit_power_static_emitter::get_aux_vecs_count() const { return 1; } +size_t jit_power_static_emitter::get_aux_vecs_count() const { + return 1; +} -size_t jit_power_static_emitter::get_aux_gprs_count() const { return 2; } +size_t jit_power_static_emitter::get_aux_gprs_count() const { + return 2; +} void 
jit_power_static_emitter::register_table_entries() { push_arg_entry_of("power", dnnl::impl::float2int(power), true); @@ -1752,11 +2010,13 @@ void jit_power_static_emitter::register_table_entries() { push_arg_entry_of("shift", dnnl::impl::float2int(shift), true); } -std::set> jit_power_static_emitter::get_supported_precisions(const std::shared_ptr& node) { +std::set> jit_power_static_emitter::get_supported_precisions( + const std::shared_ptr& node) { return {{element::f32}}; } -void jit_power_static_emitter::emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const { +void jit_power_static_emitter::emit_impl(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { if (host_isa_ == dnnl::impl::cpu::aarch64::asimd) { emit_isa(in_vec_idxs, out_vec_idxs); } else { @@ -1765,7 +2025,8 @@ void jit_power_static_emitter::emit_impl(const std::vector& in_vec_idxs, } template -void jit_power_static_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_power_static_emitter::emit_isa(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { OV_CPU_JIT_EMITTER_ASSERT(exec_prc_ == ov::element::f32, "unsupported precision: " + exec_prc_.to_string()); using TReg = typename dnnl::impl::cpu::aarch64::cpu_isa_traits::TReg; @@ -1848,26 +2109,30 @@ void jit_power_static_emitter::emit_isa(const std::vector &in_vec_idxs, /// PRELU /// jit_prelu_emitter::jit_prelu_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, - dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, - const std::shared_ptr& node) - : jit_emitter(host, host_isa, node, get_arithmetic_binary_exec_precision(node)) { -} + dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, + const std::shared_ptr& node) + : jit_emitter(host, host_isa, node, get_arithmetic_binary_exec_precision(node)) {} jit_prelu_emitter::jit_prelu_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, - dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, - const ov::element::Type exec_prc) - : jit_emitter(host, host_isa, exec_prc) { -} + dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, + const ov::element::Type exec_prc) + : jit_emitter(host, host_isa, exec_prc) {} -size_t jit_prelu_emitter::get_inputs_count() const { return 2; } +size_t jit_prelu_emitter::get_inputs_count() const { + return 2; +} -size_t jit_prelu_emitter::get_aux_vecs_count() const { return 1; } +size_t jit_prelu_emitter::get_aux_vecs_count() const { + return 1; +} -std::set> jit_prelu_emitter::get_supported_precisions(const std::shared_ptr& node) { +std::set> jit_prelu_emitter::get_supported_precisions( + const std::shared_ptr& node) { return {{element::f32}}; } -void jit_prelu_emitter::emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const { +void jit_prelu_emitter::emit_impl(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { if (host_isa_ == dnnl::impl::cpu::aarch64::asimd) { emit_isa(in_vec_idxs, out_vec_idxs); } else { @@ -1876,7 +2141,8 @@ void jit_prelu_emitter::emit_impl(const std::vector& in_vec_idxs, const } template -void jit_prelu_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_prelu_emitter::emit_isa(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { OV_CPU_JIT_EMITTER_ASSERT(exec_prc_ == ov::element::f32, "unsupported precision: " + exec_prc_.to_string()); using TReg = typename dnnl::impl::cpu::aarch64::cpu_isa_traits::TReg; @@ -1895,24 +2161,27 @@ void jit_prelu_emitter::emit_isa(const 
std::vector &in_vec_idxs, const s jit_relu_emitter::jit_relu_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, const std::shared_ptr& node) - : jit_emitter(host, host_isa, node, get_arithmetic_binary_exec_precision(node)) { -} + : jit_emitter(host, host_isa, node, get_arithmetic_binary_exec_precision(node)) {} jit_relu_emitter::jit_relu_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, const ov::element::Type exec_prc) - : jit_emitter(host, host_isa, exec_prc) { -} + : jit_emitter(host, host_isa, exec_prc) {} -size_t jit_relu_emitter::get_inputs_count() const { return 1; } +size_t jit_relu_emitter::get_inputs_count() const { + return 1; +} -size_t jit_relu_emitter::get_aux_vecs_count() const { return 1; } +size_t jit_relu_emitter::get_aux_vecs_count() const { + return 1; +} std::set> jit_relu_emitter::get_supported_precisions(const std::shared_ptr& node) { return {{element::f32}}; } -void jit_relu_emitter::emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const { +void jit_relu_emitter::emit_impl(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { if (host_isa_ == dnnl::impl::cpu::aarch64::asimd) { emit_isa(in_vec_idxs, out_vec_idxs); } else { @@ -1921,7 +2190,7 @@ void jit_relu_emitter::emit_impl(const std::vector& in_vec_idxs, const s } template -void jit_relu_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_relu_emitter::emit_isa(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const { OV_CPU_JIT_EMITTER_ASSERT(exec_prc_ == ov::element::f32, "unsupported precision: " + exec_prc_.to_string()); using TReg = typename dnnl::impl::cpu::aarch64::cpu_isa_traits::TReg; @@ -1934,27 +2203,117 @@ void jit_relu_emitter::emit_isa(const std::vector &in_vec_idxs, const st h->fmaxnm(dst.s, src.s, tmp.s); } +/// ROUND_HALF_AWAY_FROM_ZERO /// +jit_round_half_away_from_zero_emitter::jit_round_half_away_from_zero_emitter( + dnnl::impl::cpu::aarch64::jit_generator* host, + dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, + const std::shared_ptr& node) + : jit_emitter(host, host_isa, node, get_arithmetic_binary_exec_precision(node)) {} + +jit_round_half_away_from_zero_emitter::jit_round_half_away_from_zero_emitter( + dnnl::impl::cpu::aarch64::jit_generator* host, + dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, + const ov::element::Type exec_prc) + : jit_emitter(host, host_isa, exec_prc) {} + +size_t jit_round_half_away_from_zero_emitter::get_inputs_count() const { + return 1; +} + +std::set> jit_round_half_away_from_zero_emitter::get_supported_precisions( + const std::shared_ptr& node) { + return {{element::f32}}; +} + +void jit_round_half_away_from_zero_emitter::emit_impl(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { + if (host_isa_ == dnnl::impl::cpu::aarch64::asimd) { + emit_isa(in_vec_idxs, out_vec_idxs); + } else { + OV_CPU_JIT_EMITTER_THROW("Can't create jit eltwise kernel"); + } +} + +template +void jit_round_half_away_from_zero_emitter::emit_isa(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { + OV_CPU_JIT_EMITTER_ASSERT(exec_prc_ == ov::element::f32, "unsupported precision: " + exec_prc_.to_string()); + + using TReg = typename dnnl::impl::cpu::aarch64::cpu_isa_traits::TReg; + + TReg src = TReg(in_vec_idxs[0]); + TReg dst = TReg(out_vec_idxs[0]); + + h->frinta(dst.s, src.s); +} + +/// ROUND_HALF_TO_EVEN /// 
+jit_round_half_to_even_emitter::jit_round_half_to_even_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, + dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, + const std::shared_ptr& node) + : jit_emitter(host, host_isa, node, get_arithmetic_binary_exec_precision(node)) {} + +jit_round_half_to_even_emitter::jit_round_half_to_even_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, + dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, + const ov::element::Type exec_prc) + : jit_emitter(host, host_isa, exec_prc) {} + +size_t jit_round_half_to_even_emitter::get_inputs_count() const { + return 1; +} + +std::set> jit_round_half_to_even_emitter::get_supported_precisions( + const std::shared_ptr& node) { + return {{element::f32}}; +} + +void jit_round_half_to_even_emitter::emit_impl(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { + if (host_isa_ == dnnl::impl::cpu::aarch64::asimd) { + emit_isa(in_vec_idxs, out_vec_idxs); + } else { + OV_CPU_JIT_EMITTER_THROW("Can't create jit eltwise kernel"); + } +} + +template +void jit_round_half_to_even_emitter::emit_isa(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { + OV_CPU_JIT_EMITTER_ASSERT(exec_prc_ == ov::element::f32, "unsupported precision: " + exec_prc_.to_string()); + + using TReg = typename dnnl::impl::cpu::aarch64::cpu_isa_traits::TReg; + + TReg src = TReg(in_vec_idxs[0]); + TReg dst = TReg(out_vec_idxs[0]); + + h->frintn(dst.s, src.s); +} + /// SELECT /// -jit_select_emitter::jit_select_emitter(dnnl::impl::cpu::aarch64::jit_generator *host, +jit_select_emitter::jit_select_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, const std::shared_ptr& node) - : jit_emitter(host, host_isa, get_arithmetic_binary_exec_precision(node)) { -} -jit_select_emitter::jit_select_emitter(dnnl::impl::cpu::aarch64::jit_generator *host, + : jit_emitter(host, host_isa, get_arithmetic_binary_exec_precision(node)) {} +jit_select_emitter::jit_select_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, const ov::element::Type exec_prc) - : jit_emitter(host, host_isa, exec_prc) { -} + : jit_emitter(host, host_isa, exec_prc) {} -size_t jit_select_emitter::get_inputs_count() const { return 3; } +size_t jit_select_emitter::get_inputs_count() const { + return 3; +} -size_t jit_select_emitter::get_aux_vecs_count() const { return 1; } +size_t jit_select_emitter::get_aux_vecs_count() const { + return 1; +} -std::set> jit_select_emitter::get_supported_precisions(const std::shared_ptr& node) { +std::set> jit_select_emitter::get_supported_precisions( + const std::shared_ptr& node) { return {{element::f32, element::f32, element::f32}}; } -void jit_select_emitter::emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const { +void jit_select_emitter::emit_impl(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { if (host_isa_ == dnnl::impl::cpu::aarch64::asimd) { emit_isa(in_vec_idxs, out_vec_idxs); } else { @@ -1963,7 +2322,8 @@ void jit_select_emitter::emit_impl(const std::vector& in_vec_idxs, const } template -void jit_select_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_select_emitter::emit_isa(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { OV_CPU_JIT_EMITTER_ASSERT(exec_prc_ == ov::element::f32, "unsupported precision: " + exec_prc_.to_string()); using TReg = typename 
dnnl::impl::cpu::aarch64::cpu_isa_traits::TReg; @@ -1984,19 +2344,22 @@ void jit_select_emitter::emit_isa(const std::vector &in_vec_idxs, const jit_sigmoid_emitter::jit_sigmoid_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, const std::shared_ptr& node) - : jit_emitter(host, host_isa, node, get_arithmetic_binary_exec_precision(node)) { + : jit_emitter(host, host_isa, node, get_arithmetic_binary_exec_precision(node)) { prepare_table(); exp_emitter = std::make_unique(h, host_isa, node); } jit_sigmoid_emitter::jit_sigmoid_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, - const ov::element::Type exec_prc) : jit_emitter(host, host_isa, exec_prc) { + const ov::element::Type exec_prc) + : jit_emitter(host, host_isa, exec_prc) { prepare_table(); exp_emitter = std::make_unique(h, host_isa, exec_prc); } -size_t jit_sigmoid_emitter::get_inputs_count() const { return 1; } +size_t jit_sigmoid_emitter::get_inputs_count() const { + return 1; +} size_t jit_sigmoid_emitter::get_aux_vecs_count() const { return exp_emitter->get_aux_vecs_count() + 2; @@ -2006,7 +2369,8 @@ size_t jit_sigmoid_emitter::get_aux_gprs_count() const { return exp_emitter->get_aux_gprs_count() + 1; } -void jit_sigmoid_emitter::emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_sigmoid_emitter::emit_impl(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { if (host_isa_ == dnnl::impl::cpu::aarch64::asimd) { emit_isa(in_vec_idxs, out_vec_idxs); } else { @@ -2015,7 +2379,8 @@ void jit_sigmoid_emitter::emit_impl(const std::vector &in_vec_idxs, cons } template -void jit_sigmoid_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_sigmoid_emitter::emit_isa(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { if (exec_prc_ != ov::element::f32) { OPENVINO_THROW("unsupported precision: " + exec_prc_.to_string()); } @@ -2038,11 +2403,7 @@ void jit_sigmoid_emitter::emit_isa(const std::vector &in_vec_idxs, const h->ld1r(vmm_aux0.s, table_val2("sign_mask")); h->orr(vmm_aux0.b16, vmm_src.b16, vmm_aux0.b16); - exp_emitter->emit_code( - { vmm_aux0.getIdx() }, - out_vec_idxs, - aux_vec_idxs, - aux_gpr_idxs); + exp_emitter->emit_code({vmm_aux0.getIdx()}, out_vec_idxs, aux_vec_idxs, aux_gpr_idxs); const TReg vmm_aux1(aux_vec_idxs[0]); const TReg vmm_aux2(aux_vec_idxs[1]); @@ -2072,7 +2433,8 @@ void jit_sigmoid_emitter::emit_data() const { exp_emitter->emit_data(); } -std::set> jit_sigmoid_emitter::get_supported_precisions(const std::shared_ptr& node) { +std::set> jit_sigmoid_emitter::get_supported_precisions( + const std::shared_ptr& node) { return {{element::f32}}; } @@ -2080,23 +2442,31 @@ std::set> jit_sigmoid_emitter::get_supported_precisio jit_soft_sign_emitter::jit_soft_sign_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, const std::shared_ptr& node) - : jit_emitter(host, host_isa, node, get_arithmetic_binary_exec_precision(node)) { + : jit_emitter(host, host_isa, node, get_arithmetic_binary_exec_precision(node)) { prepare_table(); } jit_soft_sign_emitter::jit_soft_sign_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, - const ov::element::Type exec_prc) : jit_emitter(host, host_isa, exec_prc) { + const ov::element::Type exec_prc) + : jit_emitter(host, host_isa, exec_prc) { prepare_table(); } -size_t 
jit_soft_sign_emitter::get_inputs_count() const { return 1; } +size_t jit_soft_sign_emitter::get_inputs_count() const { + return 1; +} -size_t jit_soft_sign_emitter::get_aux_vecs_count() const { return 2; } +size_t jit_soft_sign_emitter::get_aux_vecs_count() const { + return 2; +} -size_t jit_soft_sign_emitter::get_aux_gprs_count() const { return 1; } +size_t jit_soft_sign_emitter::get_aux_gprs_count() const { + return 1; +} -void jit_soft_sign_emitter::emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_soft_sign_emitter::emit_impl(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { if (host_isa_ == dnnl::impl::cpu::aarch64::asimd) { emit_isa(in_vec_idxs, out_vec_idxs); } else { @@ -2105,7 +2475,8 @@ void jit_soft_sign_emitter::emit_impl(const std::vector &in_vec_idxs, co } template -void jit_soft_sign_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_soft_sign_emitter::emit_isa(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { if (exec_prc_ != ov::element::f32) { OPENVINO_THROW("unsupported precision: " + exec_prc_.to_string()); } @@ -2126,7 +2497,8 @@ void jit_soft_sign_emitter::register_table_entries() { push_arg_entry_of("one", 0x3f800000, true); } -std::set> jit_soft_sign_emitter::get_supported_precisions(const std::shared_ptr& node) { +std::set> jit_soft_sign_emitter::get_supported_precisions( + const std::shared_ptr& node) { return {{element::f32}}; } @@ -2135,15 +2507,15 @@ jit_sqrt_emitter::jit_sqrt_emitter(dnnl::impl::cpu::aarch64::jit_generator* host dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, const std::shared_ptr& node) : jit_emitter(host, host_isa, node, get_arithmetic_binary_exec_precision(node)) { - prepare_table(); - } + prepare_table(); +} jit_sqrt_emitter::jit_sqrt_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, const ov::element::Type exec_prc) : jit_emitter(host, host_isa, exec_prc) { - prepare_table(); - } + prepare_table(); +} size_t jit_sqrt_emitter::get_inputs_count() const { return 1; @@ -2159,8 +2531,7 @@ void jit_sqrt_emitter::emit_impl(const std::vector& in_vec_idxs, } template -void jit_sqrt_emitter::emit_isa(const std::vector& in_vec_idxs, - const std::vector& out_vec_idxs) const { +void jit_sqrt_emitter::emit_isa(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const { OV_CPU_JIT_EMITTER_ASSERT(exec_prc_ == ov::element::f32, "unsupported precision: " + exec_prc_.to_string()); using TReg = typename dnnl::impl::cpu::aarch64::cpu_isa_traits::TReg; @@ -2170,8 +2541,7 @@ void jit_sqrt_emitter::emit_isa(const std::vector& in_vec_idxs, h->fsqrt(dst.s, src.s); } -std::set> jit_sqrt_emitter::get_supported_precisions( - const std::shared_ptr& node) { +std::set> jit_sqrt_emitter::get_supported_precisions(const std::shared_ptr& node) { return {{element::f32}}; } @@ -2179,17 +2549,19 @@ std::set> jit_sqrt_emitter::get_supported_precisions( jit_subtract_emitter::jit_subtract_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, const std::shared_ptr& node) - : jit_emitter(host, host_isa, node, get_arithmetic_binary_exec_precision(node)) { -} + : jit_emitter(host, host_isa, node, get_arithmetic_binary_exec_precision(node)) {} jit_subtract_emitter::jit_subtract_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, - const ov::element::Type exec_prc) : jit_emitter(host, 
host_isa, exec_prc) { -} + const ov::element::Type exec_prc) + : jit_emitter(host, host_isa, exec_prc) {} -size_t jit_subtract_emitter::get_inputs_count() const { return 2; } +size_t jit_subtract_emitter::get_inputs_count() const { + return 2; +} -void jit_subtract_emitter::emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_subtract_emitter::emit_impl(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { if (host_isa_ == dnnl::impl::cpu::aarch64::asimd) { emit_isa(in_vec_idxs, out_vec_idxs); } else { @@ -2198,7 +2570,8 @@ void jit_subtract_emitter::emit_impl(const std::vector &in_vec_idxs, con } template -void jit_subtract_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_subtract_emitter::emit_isa(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { OV_CPU_JIT_EMITTER_ASSERT(exec_prc_ == ov::element::f32, "unsupported precision: " + exec_prc_.to_string()); using TReg = typename dnnl::impl::cpu::aarch64::cpu_isa_traits::TReg; @@ -2209,7 +2582,8 @@ void jit_subtract_emitter::emit_isa(const std::vector &in_vec_idxs, cons h->uni_fsub(dst.s, src0.s, src1.s); } -std::set> jit_subtract_emitter::get_supported_precisions(const std::shared_ptr& node) { +std::set> jit_subtract_emitter::get_supported_precisions( + const std::shared_ptr& node) { return {{element::f32, element::f32}}; } @@ -2217,7 +2591,7 @@ std::set> jit_subtract_emitter::get_supported_precisi jit_swish_emitter::jit_swish_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, const std::shared_ptr& node) - : jit_emitter(host, host_isa, node, get_arithmetic_binary_exec_precision(node)) { + : jit_emitter(host, host_isa, node, get_arithmetic_binary_exec_precision(node)) { const auto swish = std::dynamic_pointer_cast(node); if (swish == nullptr) { OV_CPU_JIT_EMITTER_THROW("Can't cast to SwishNode"); @@ -2232,12 +2606,15 @@ jit_swish_emitter::jit_swish_emitter(dnnl::impl::cpu::aarch64::jit_generator* ho dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, const float beta, const ov::element::Type exec_prc) - : jit_emitter(host, host_isa, exec_prc), beta(beta) { + : jit_emitter(host, host_isa, exec_prc), + beta(beta) { prepare_table(); sigmoid_emitter = std::make_unique(h, host_isa, exec_prc); } -size_t jit_swish_emitter::get_inputs_count() const {return 1; } +size_t jit_swish_emitter::get_inputs_count() const { + return 1; +} size_t jit_swish_emitter::get_aux_vecs_count() const { return sigmoid_emitter->get_aux_vecs_count() + 2; @@ -2247,7 +2624,8 @@ size_t jit_swish_emitter::get_aux_gprs_count() const { return sigmoid_emitter->get_aux_gprs_count() + 1; } -void jit_swish_emitter::emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_swish_emitter::emit_impl(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { if (host_isa_ == dnnl::impl::cpu::aarch64::asimd) { emit_isa(in_vec_idxs, out_vec_idxs); } else { @@ -2256,7 +2634,8 @@ void jit_swish_emitter::emit_impl(const std::vector &in_vec_idxs, const } template -void jit_swish_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_swish_emitter::emit_isa(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { OV_CPU_JIT_EMITTER_ASSERT(exec_prc_ == ov::element::f32, "unsupported precision: " + exec_prc_.to_string()); using TReg = typename dnnl::impl::cpu::aarch64::cpu_isa_traits::TReg; @@ -2272,11 
+2651,7 @@ void jit_swish_emitter::emit_isa(const std::vector &in_vec_idxs, const s h->fmul(vmm_aux.s, vmm_aux.s, vmm_src.s); // sigmoid(x*beta) - sigmoid_emitter->emit_code( - { vmm_aux.getIdx() }, - out_vec_idxs, - aux_vec_idxs, - aux_gpr_idxs); + sigmoid_emitter->emit_code({vmm_aux.getIdx()}, out_vec_idxs, aux_vec_idxs, aux_gpr_idxs); // x*sigmoid(x*beta) h->fmul(vmm_dst.s, vmm_dst.s, vmm_orig_src.s); @@ -2291,28 +2666,31 @@ void jit_swish_emitter::emit_data() const { sigmoid_emitter->emit_data(); } -std::set> jit_swish_emitter::get_supported_precisions(const std::shared_ptr& node) { +std::set> jit_swish_emitter::get_supported_precisions( + const std::shared_ptr& node) { return {{element::f32}}; } /// TANH /// -jit_tanh_emitter::jit_tanh_emitter(dnnl::impl::cpu::aarch64::jit_generator *host, +jit_tanh_emitter::jit_tanh_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, const std::shared_ptr& node) - : jit_emitter(host, host_isa, node, get_arithmetic_binary_exec_precision(node)) { + : jit_emitter(host, host_isa, node, get_arithmetic_binary_exec_precision(node)) { prepare_table(); sigmoid_emitter = std::make_unique(h, host_isa, node); } -jit_tanh_emitter::jit_tanh_emitter(dnnl::impl::cpu::aarch64::jit_generator *host, +jit_tanh_emitter::jit_tanh_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, const ov::element::Type exec_prc) - : jit_emitter(host, host_isa, exec_prc) { + : jit_emitter(host, host_isa, exec_prc) { prepare_table(); sigmoid_emitter = std::make_unique(h, host_isa, exec_prc); } -size_t jit_tanh_emitter::get_inputs_count() const { return 1; } +size_t jit_tanh_emitter::get_inputs_count() const { + return 1; +} size_t jit_tanh_emitter::get_aux_vecs_count() const { return sigmoid_emitter->get_aux_vecs_count() + 1; @@ -2322,7 +2700,8 @@ size_t jit_tanh_emitter::get_aux_gprs_count() const { return sigmoid_emitter->get_aux_gprs_count() + 1; } -void jit_tanh_emitter::emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_tanh_emitter::emit_impl(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { if (host_isa_ == dnnl::impl::cpu::aarch64::asimd) { emit_isa(in_vec_idxs, out_vec_idxs); } else { @@ -2331,7 +2710,7 @@ void jit_tanh_emitter::emit_impl(const std::vector &in_vec_idxs, const s } template -void jit_tanh_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_tanh_emitter::emit_isa(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const { OV_CPU_JIT_EMITTER_ASSERT(exec_prc_ == ov::element::f32, "unsupported precision: " + exec_prc_.to_string()); using TReg = typename dnnl::impl::cpu::aarch64::cpu_isa_traits::TReg; @@ -2343,11 +2722,7 @@ void jit_tanh_emitter::emit_isa(const std::vector &in_vec_idxs, const st h->ld1r(aux.s, table_val2("two")); h->uni_fmul(aux.s, src.s, aux.s); - sigmoid_emitter->emit_code( - { aux.getIdx() }, - out_vec_idxs, - aux_vec_idxs, - aux_gpr_idxs); + sigmoid_emitter->emit_code({aux.getIdx()}, out_vec_idxs, aux_vec_idxs, aux_gpr_idxs); h->ld1r(aux.s, table_val2("two")); h->uni_fmul(dst.s, aux.s, dst.s); @@ -2369,6 +2744,6 @@ std::set> jit_tanh_emitter::get_supported_precisions( return {{element::f32}}; } -} // namespace aarch64 -} // namespace intel_cpu -} // namespace ov +} // namespace aarch64 +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/emitters/plugin/aarch64/jit_eltwise_emitters.hpp 
b/src/plugins/intel_cpu/src/emitters/plugin/aarch64/jit_eltwise_emitters.hpp index fa4f4141c388e4..13567b6fbf7d64 100644 --- a/src/plugins/intel_cpu/src/emitters/plugin/aarch64/jit_eltwise_emitters.hpp +++ b/src/plugins/intel_cpu/src/emitters/plugin/aarch64/jit_eltwise_emitters.hpp @@ -12,55 +12,57 @@ namespace aarch64 { class jit_abs_emitter : public jit_emitter { public: - jit_abs_emitter(dnnl::impl::cpu::aarch64::jit_generator *host, + jit_abs_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, const ov::element::Type exec_prc = ov::element::f32); - jit_abs_emitter(dnnl::impl::cpu::aarch64::jit_generator *host, + jit_abs_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, const std::shared_ptr& node); size_t get_inputs_count() const override; - static std::set> get_supported_precisions(const std::shared_ptr& node = nullptr); + static std::set> get_supported_precisions( + const std::shared_ptr& node = nullptr); private: - void emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const override; + void emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const override; template - void emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const; + void emit_isa(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const; }; class jit_add_emitter : public jit_emitter { public: - jit_add_emitter(dnnl::impl::cpu::aarch64::jit_generator *host, + jit_add_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, const ov::element::Type exec_prc = ov::element::f32); - jit_add_emitter(dnnl::impl::cpu::aarch64::jit_generator *host, + jit_add_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, const std::shared_ptr& node); size_t get_inputs_count() const override; - static std::set> get_supported_precisions(const std::shared_ptr& node = nullptr); + static std::set> get_supported_precisions( + const std::shared_ptr& node = nullptr); private: - void emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const override; + void emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const override; template - void emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const; + void emit_isa(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const; }; class jit_clamp_emitter : public jit_emitter { public: - jit_clamp_emitter(dnnl::impl::cpu::aarch64::jit_generator *host, + jit_clamp_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, const float min, const float max, const ov::element::Type exec_prc = ov::element::f32); - jit_clamp_emitter(dnnl::impl::cpu::aarch64::jit_generator *host, + jit_clamp_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, const std::shared_ptr& node); @@ -72,46 +74,48 @@ class jit_clamp_emitter : public jit_emitter { void register_table_entries() override; - static std::set> get_supported_precisions(const std::shared_ptr& node = nullptr); + static std::set> get_supported_precisions( + const std::shared_ptr& node = nullptr); private: float min; float max; - void emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const override; + void emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const override; 
template - void emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const; + void emit_isa(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const; }; class jit_divide_emitter : public jit_emitter { public: - jit_divide_emitter(dnnl::impl::cpu::aarch64::jit_generator *host, - dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, - const ov::element::Type exec_prc = ov::element::f32); + jit_divide_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, + dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, + const ov::element::Type exec_prc = ov::element::f32); - jit_divide_emitter(dnnl::impl::cpu::aarch64::jit_generator *host, - dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, - const std::shared_ptr& node); + jit_divide_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, + dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, + const std::shared_ptr& node); size_t get_inputs_count() const override; - static std::set> get_supported_precisions(const std::shared_ptr& node = nullptr); + static std::set> get_supported_precisions( + const std::shared_ptr& node = nullptr); private: - void emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const override; + void emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const override; template - void emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const; + void emit_isa(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const; }; class jit_equal_emitter : public jit_emitter { public: - jit_equal_emitter(dnnl::impl::cpu::aarch64::jit_generator *host, + jit_equal_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, const ov::element::Type exec_prc = ov::element::f32); - jit_equal_emitter(dnnl::impl::cpu::aarch64::jit_generator *host, + jit_equal_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, const std::shared_ptr& n); @@ -122,13 +126,13 @@ class jit_equal_emitter : public jit_emitter { size_t get_aux_gprs_count() const override; static std::set> get_supported_precisions( - const std::shared_ptr& node = nullptr); + const std::shared_ptr& node = nullptr); private: - void emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const override; + void emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const override; template - void emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const; + void emit_isa(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const; void register_table_entries() override; }; @@ -151,13 +155,14 @@ class jit_exp_emitter : public jit_emitter { void register_table_entries() override; - static std::set> get_supported_precisions(const std::shared_ptr& node = nullptr); + static std::set> get_supported_precisions( + const std::shared_ptr& node = nullptr); private: - void emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const override; + void emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const override; template - void emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const; + void emit_isa(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const; }; class jit_elu_emitter : public jit_emitter { @@ -181,48 +186,72 @@ class jit_elu_emitter : public jit_emitter { void emit_data() const override; - static std::set> get_supported_precisions(const std::shared_ptr& node = 
nullptr); + static std::set> get_supported_precisions( + const std::shared_ptr& node = nullptr); private: std::unique_ptr exp_emitter; float alpha; - void emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const override; + void emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const override; template - void emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const; + void emit_isa(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const; }; class jit_floor_emitter : public jit_emitter { public: - jit_floor_emitter(dnnl::impl::cpu::aarch64::jit_generator *host, + jit_floor_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, const ov::element::Type exec_prc = ov::element::f32); - jit_floor_emitter(dnnl::impl::cpu::aarch64::jit_generator *host, + jit_floor_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, const std::shared_ptr& node); size_t get_inputs_count() const override; - static std::set> get_supported_precisions(const std::shared_ptr& node = nullptr); + static std::set> get_supported_precisions( + const std::shared_ptr& node = nullptr); private: - void emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const override; + void emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const override; template - void emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const; + void emit_isa(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const; }; +class jit_floor_mod_emitter : public jit_emitter { +public: + jit_floor_mod_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, + dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, + const ov::element::Type exec_prc = ov::element::f32); + + jit_floor_mod_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, + dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, + const std::shared_ptr& node); + size_t get_inputs_count() const override; + + size_t get_aux_vecs_count() const override; + + static std::set> get_supported_precisions( + const std::shared_ptr& node = nullptr); + +private: + void emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const override; + + template + void emit_isa(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const; +}; class jit_ceiling_emitter : public jit_emitter { public: // Constructor with explicit precision - jit_ceiling_emitter(dnnl::impl::cpu::aarch64::jit_generator *host, + jit_ceiling_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, const ov::element::Type exec_prc = ov::element::f32); // Constructor from node - jit_ceiling_emitter(dnnl::impl::cpu::aarch64::jit_generator *host, + jit_ceiling_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, const std::shared_ptr& node); @@ -235,13 +264,11 @@ class jit_ceiling_emitter : public jit_emitter { private: // Implementation of JIT code emission - void emit_impl(const std::vector &in_vec_idxs, - const std::vector &out_vec_idxs) const override; + void emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const override; // ISA-specific implementation template - void emit_isa(const std::vector &in_vec_idxs, - const std::vector &out_vec_idxs) const; + void emit_isa(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const; }; class 
jit_gelu_erf_emitter : public jit_emitter { @@ -264,15 +291,16 @@ class jit_gelu_erf_emitter : public jit_emitter { void emit_data() const override; - static std::set> get_supported_precisions(const std::shared_ptr& node = nullptr); + static std::set> get_supported_precisions( + const std::shared_ptr& node = nullptr); private: std::unique_ptr exp_emitter; - void emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const override; + void emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const override; template - void emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const; + void emit_isa(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const; }; class jit_tanh_emitter; @@ -297,24 +325,25 @@ class jit_gelu_tanh_emitter : public jit_emitter { void emit_data() const override; - static std::set> get_supported_precisions(const std::shared_ptr& node = nullptr); + static std::set> get_supported_precisions( + const std::shared_ptr& node = nullptr); private: std::unique_ptr tanh_emitter; - void emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const override; + void emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const override; template - void emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const; + void emit_isa(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const; }; class jit_greater_emitter : public jit_emitter { public: - jit_greater_emitter(dnnl::impl::cpu::aarch64::jit_generator *host, + jit_greater_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, const ov::element::Type exec_prc = ov::element::f32); - jit_greater_emitter(dnnl::impl::cpu::aarch64::jit_generator *host, + jit_greater_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, const std::shared_ptr& n); @@ -325,24 +354,24 @@ class jit_greater_emitter : public jit_emitter { size_t get_aux_gprs_count() const override; static std::set> get_supported_precisions( - const std::shared_ptr& node = nullptr); + const std::shared_ptr& node = nullptr); private: - void emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const override; + void emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const override; template - void emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const; + void emit_isa(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const; void register_table_entries() override; }; class jit_greater_equal_emitter : public jit_emitter { public: - jit_greater_equal_emitter(dnnl::impl::cpu::aarch64::jit_generator *host, + jit_greater_equal_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, const ov::element::Type exec_prc = ov::element::f32); - jit_greater_equal_emitter(dnnl::impl::cpu::aarch64::jit_generator *host, + jit_greater_equal_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, const std::shared_ptr& n); @@ -353,24 +382,24 @@ class jit_greater_equal_emitter : public jit_emitter { size_t get_aux_gprs_count() const override; static std::set> get_supported_precisions( - const std::shared_ptr& node = nullptr); + const std::shared_ptr& node = nullptr); private: - void emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const override; + void 
emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const override; template - void emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const; + void emit_isa(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const; void register_table_entries() override; }; class jit_hswish_emitter : public jit_emitter { public: - jit_hswish_emitter(dnnl::impl::cpu::aarch64::jit_generator *host, + jit_hswish_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, const ov::element::Type exec_prc = ov::element::f32); - jit_hswish_emitter(dnnl::impl::cpu::aarch64::jit_generator *host, + jit_hswish_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, const std::shared_ptr& node); @@ -382,24 +411,25 @@ class jit_hswish_emitter : public jit_emitter { void register_table_entries() override; - static std::set> get_supported_precisions(const std::shared_ptr& node = nullptr); + static std::set> get_supported_precisions( + const std::shared_ptr& node = nullptr); private: - void emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const override; + void emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const override; template - void emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const; + void emit_isa(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const; }; class jit_is_finite_emitter : public jit_emitter { public: - jit_is_finite_emitter(dnnl::impl::cpu::aarch64::jit_generator *host, - dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, - const ov::element::Type exec_prc = ov::element::f32); + jit_is_finite_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, + dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, + const ov::element::Type exec_prc = ov::element::f32); - jit_is_finite_emitter(dnnl::impl::cpu::aarch64::jit_generator *host, - dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, - const std::shared_ptr& node); + jit_is_finite_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, + dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, + const std::shared_ptr& node); size_t get_inputs_count() const override; @@ -407,24 +437,25 @@ class jit_is_finite_emitter : public jit_emitter { size_t get_aux_gprs_count() const override; - static std::set> get_supported_precisions(const std::shared_ptr& node = nullptr); + static std::set> get_supported_precisions( + const std::shared_ptr& node = nullptr); private: - void emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const override; + void emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const override; template - void emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const; + void emit_isa(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const; void register_table_entries() override; }; class jit_is_nan_emitter : public jit_emitter { public: - jit_is_nan_emitter(dnnl::impl::cpu::aarch64::jit_generator *host, + jit_is_nan_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, const ov::element::Type exec_prc = ov::element::f32); - jit_is_nan_emitter(dnnl::impl::cpu::aarch64::jit_generator *host, + jit_is_nan_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, const std::shared_ptr& node); @@ -434,57 +465,60 @@ class jit_is_nan_emitter : public jit_emitter { size_t 
get_aux_gprs_count() const override; - static std::set> get_supported_precisions(const std::shared_ptr& node = nullptr); + static std::set> get_supported_precisions( + const std::shared_ptr& node = nullptr); private: - void emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const override; + void emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const override; template - void emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const; + void emit_isa(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const; void register_table_entries() override; }; class jit_maximum_emitter : public jit_emitter { public: - jit_maximum_emitter(dnnl::impl::cpu::aarch64::jit_generator *host, + jit_maximum_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, const ov::element::Type exec_prc = ov::element::f32); - jit_maximum_emitter(dnnl::impl::cpu::aarch64::jit_generator *host, + jit_maximum_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, const std::shared_ptr& node); size_t get_inputs_count() const override; - static std::set> get_supported_precisions(const std::shared_ptr& node = nullptr); + static std::set> get_supported_precisions( + const std::shared_ptr& node = nullptr); private: - void emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const override; + void emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const override; template - void emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const; + void emit_isa(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const; }; class jit_minimum_emitter : public jit_emitter { public: - jit_minimum_emitter(dnnl::impl::cpu::aarch64::jit_generator *host, + jit_minimum_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, const ov::element::Type exec_prc = ov::element::f32); - jit_minimum_emitter(dnnl::impl::cpu::aarch64::jit_generator *host, + jit_minimum_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, const std::shared_ptr& node); size_t get_inputs_count() const override; - static std::set> get_supported_precisions(const std::shared_ptr& node = nullptr); + static std::set> get_supported_precisions( + const std::shared_ptr& node = nullptr); private: - void emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const override; + void emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const override; template - void emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const; + void emit_isa(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const; }; class jit_mish_emitter : public jit_emitter { @@ -507,15 +541,16 @@ class jit_mish_emitter : public jit_emitter { void emit_data() const override; - static std::set> get_supported_precisions(const std::shared_ptr& node = nullptr); + static std::set> get_supported_precisions( + const std::shared_ptr& node = nullptr); private: std::unique_ptr exp_emitter; - void emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const override; + void emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const override; template - void emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const; + void emit_isa(const std::vector& 
in_vec_idxs, const std::vector& out_vec_idxs) const; }; class jit_is_inf_emitter : public jit_emitter { @@ -553,11 +588,11 @@ class jit_is_inf_emitter : public jit_emitter { class jit_less_equal_emitter : public jit_emitter { public: - jit_less_equal_emitter(dnnl::impl::cpu::aarch64::jit_generator *host, + jit_less_equal_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, const ov::element::Type exec_prc = ov::element::f32); - jit_less_equal_emitter(dnnl::impl::cpu::aarch64::jit_generator *host, + jit_less_equal_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, const std::shared_ptr& n); @@ -568,24 +603,24 @@ class jit_less_equal_emitter : public jit_emitter { size_t get_aux_gprs_count() const override; static std::set> get_supported_precisions( - const std::shared_ptr& node = nullptr); + const std::shared_ptr& node = nullptr); private: - void emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const override; + void emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const override; template - void emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const; + void emit_isa(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const; void register_table_entries() override; }; class jit_logical_and_emitter : public jit_emitter { public: - jit_logical_and_emitter(dnnl::impl::cpu::aarch64::jit_generator *host, + jit_logical_and_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, const ov::element::Type exec_prc = ov::element::f32); - jit_logical_and_emitter(dnnl::impl::cpu::aarch64::jit_generator *host, + jit_logical_and_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, const std::shared_ptr& n); @@ -596,26 +631,54 @@ class jit_logical_and_emitter : public jit_emitter { size_t get_aux_gprs_count() const override; static std::set> get_supported_precisions( - const std::shared_ptr& node = nullptr); + const std::shared_ptr& node = nullptr); private: - void emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const override; + void emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const override; template - void emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const; + void emit_isa(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const; void register_table_entries() override; }; -class jit_logical_not_emitter : public jit_emitter { +class jit_logical_or_emitter : public jit_emitter { public: - jit_logical_not_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, + jit_logical_or_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, const ov::element::Type exec_prc = ov::element::f32); - jit_logical_not_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, + jit_logical_or_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, - const std::shared_ptr& node); + const std::shared_ptr& n); + + size_t get_inputs_count() const override; + + size_t get_aux_vecs_count() const override; + + size_t get_aux_gprs_count() const override; + + static std::set> get_supported_precisions( + const std::shared_ptr& node = nullptr); + +private: + void emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const override; + + template + void 
emit_isa(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const; + + void register_table_entries() override; +}; + +class jit_logical_not_emitter : public jit_emitter { +public: + jit_logical_not_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, + dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, + const ov::element::Type exec_prc = ov::element::f32); + + jit_logical_not_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, + dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, + const std::shared_ptr& node); size_t get_inputs_count() const override; @@ -637,11 +700,11 @@ class jit_logical_not_emitter : public jit_emitter { class jit_logical_xor_emitter : public jit_emitter { public: - jit_logical_xor_emitter(dnnl::impl::cpu::aarch64::jit_generator *host, + jit_logical_xor_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, const ov::element::Type exec_prc = ov::element::f32); - jit_logical_xor_emitter(dnnl::impl::cpu::aarch64::jit_generator *host, + jit_logical_xor_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, const std::shared_ptr& n); @@ -652,24 +715,24 @@ class jit_logical_xor_emitter : public jit_emitter { size_t get_aux_gprs_count() const override; static std::set> get_supported_precisions( - const std::shared_ptr& node = nullptr); + const std::shared_ptr& node = nullptr); private: - void emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const override; + void emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const override; template - void emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const; + void emit_isa(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const; void register_table_entries() override; }; class jit_mod_emitter : public jit_emitter { public: - jit_mod_emitter(dnnl::impl::cpu::aarch64::jit_generator *host, + jit_mod_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, const ov::element::Type exec_prc = ov::element::f32); - jit_mod_emitter(dnnl::impl::cpu::aarch64::jit_generator *host, + jit_mod_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, const std::shared_ptr& node); @@ -677,13 +740,14 @@ class jit_mod_emitter : public jit_emitter { size_t get_aux_vecs_count() const override; - static std::set> get_supported_precisions(const std::shared_ptr& node = nullptr); + static std::set> get_supported_precisions( + const std::shared_ptr& node = nullptr); private: - void emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const override; + void emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const override; template - void emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const; + void emit_isa(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const; }; class jit_mul_add_emitter : public jit_emitter { @@ -700,49 +764,50 @@ class jit_mul_add_emitter : public jit_emitter { size_t get_aux_vecs_count() const override; - static std::set> get_supported_precisions(const std::shared_ptr& node = nullptr); + static std::set> get_supported_precisions( + const std::shared_ptr& node = nullptr); private: - void emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const override; + void emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const override; 
template - void emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const; + void emit_isa(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const; }; - class jit_multiply_emitter : public jit_emitter { public: - jit_multiply_emitter(dnnl::impl::cpu::aarch64::jit_generator *host, + jit_multiply_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, ov::element::Type exec_prc = ov::element::f32); - jit_multiply_emitter(dnnl::impl::cpu::aarch64::jit_generator *host, + jit_multiply_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, const std::shared_ptr& node); size_t get_inputs_count() const override; - static std::set> get_supported_precisions(const std::shared_ptr& node = nullptr); + static std::set> get_supported_precisions( + const std::shared_ptr& node = nullptr); private: - void emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const override; + void emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const override; template - void emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const; + void emit_isa(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const; }; class jit_power_static_emitter : public jit_emitter { public: jit_power_static_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, - dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, - const float power, - const float scale, - const float shift, - const ov::element::Type exec_prc = ov::element::f32); + dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, + const float power, + const float scale, + const float shift, + const ov::element::Type exec_prc = ov::element::f32); jit_power_static_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, - dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, - const std::shared_ptr& node, - const ov::element::Type exec_prc = ov::element::f32); + dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, + const std::shared_ptr& node, + const ov::element::Type exec_prc = ov::element::f32); size_t get_inputs_count() const override; @@ -752,16 +817,17 @@ class jit_power_static_emitter : public jit_emitter { void register_table_entries() override; - static std::set> get_supported_precisions(const std::shared_ptr& node = nullptr); + static std::set> get_supported_precisions( + const std::shared_ptr& node = nullptr); private: float power; float scale; float shift; - void emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const override; + void emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const override; template - void emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const; + void emit_isa(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const; }; class jit_prelu_emitter : public jit_emitter { @@ -778,13 +844,14 @@ class jit_prelu_emitter : public jit_emitter { size_t get_aux_vecs_count() const override; - static std::set> get_supported_precisions(const std::shared_ptr& node = nullptr); + static std::set> get_supported_precisions( + const std::shared_ptr& node = nullptr); private: - void emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const override; + void emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const override; template - void emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const; + void emit_isa(const std::vector& 
in_vec_idxs, const std::vector& out_vec_idxs) const; }; class jit_relu_emitter : public jit_emitter { @@ -801,22 +868,67 @@ class jit_relu_emitter : public jit_emitter { size_t get_aux_vecs_count() const override; - static std::set> get_supported_precisions(const std::shared_ptr& node = nullptr); + static std::set> get_supported_precisions( + const std::shared_ptr& node = nullptr); + +private: + void emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const override; + + template + void emit_isa(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const; +}; + +class jit_round_half_away_from_zero_emitter : public jit_emitter { +public: + jit_round_half_away_from_zero_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, + dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, + const ov::element::Type exec_prc = ov::element::f32); + + jit_round_half_away_from_zero_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, + dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, + const std::shared_ptr& node); + + size_t get_inputs_count() const override; + + static std::set> get_supported_precisions( + const std::shared_ptr& node = nullptr); + +private: + void emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const override; + + template + void emit_isa(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const; +}; + +class jit_round_half_to_even_emitter : public jit_emitter { +public: + jit_round_half_to_even_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, + dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, + const ov::element::Type exec_prc = ov::element::f32); + + jit_round_half_to_even_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, + dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, + const std::shared_ptr& node); + + size_t get_inputs_count() const override; + + static std::set> get_supported_precisions( + const std::shared_ptr& node = nullptr); private: - void emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const override; + void emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const override; template - void emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const; + void emit_isa(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const; }; class jit_select_emitter : public jit_emitter { public: - jit_select_emitter(dnnl::impl::cpu::aarch64::jit_generator *host, + jit_select_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, const ov::element::Type exec_prc = ov::element::f32); - jit_select_emitter(dnnl::impl::cpu::aarch64::jit_generator *host, + jit_select_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, const std::shared_ptr& n); @@ -825,13 +937,13 @@ class jit_select_emitter : public jit_emitter { size_t get_aux_vecs_count() const override; static std::set> get_supported_precisions( - const std::shared_ptr& node = nullptr); + const std::shared_ptr& node = nullptr); private: - void emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const override; + void emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const override; template - void emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const; + void emit_isa(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const; }; class jit_sigmoid_emitter : public jit_emitter { @@ -854,15 +966,16 @@ class 
jit_sigmoid_emitter : public jit_emitter { void emit_data() const override; - static std::set> get_supported_precisions(const std::shared_ptr& node = nullptr); + static std::set> get_supported_precisions( + const std::shared_ptr& node = nullptr); private: std::unique_ptr exp_emitter; - void emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const override; + void emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const override; template - void emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const; + void emit_isa(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const; }; class jit_soft_sign_emitter : public jit_emitter { @@ -883,13 +996,14 @@ class jit_soft_sign_emitter : public jit_emitter { void register_table_entries() override; - static std::set> get_supported_precisions(const std::shared_ptr& node = nullptr); + static std::set> get_supported_precisions( + const std::shared_ptr& node = nullptr); private: - void emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const override; + void emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const override; template - void emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const; + void emit_isa(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const; }; class jit_sqrt_emitter : public jit_emitter { @@ -916,23 +1030,24 @@ class jit_sqrt_emitter : public jit_emitter { class jit_subtract_emitter : public jit_emitter { public: - jit_subtract_emitter(dnnl::impl::cpu::aarch64::jit_generator *host, - dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, - const ov::element::Type exec_prc = ov::element::f32); + jit_subtract_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, + dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, + const ov::element::Type exec_prc = ov::element::f32); - jit_subtract_emitter(dnnl::impl::cpu::aarch64::jit_generator *host, - dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, - const std::shared_ptr& node); + jit_subtract_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, + dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, + const std::shared_ptr& node); size_t get_inputs_count() const override; - static std::set> get_supported_precisions(const std::shared_ptr& node = nullptr); + static std::set> get_supported_precisions( + const std::shared_ptr& node = nullptr); private: - void emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const override; + void emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const override; template - void emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const; + void emit_isa(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const; }; class jit_swish_emitter : public jit_emitter { @@ -956,25 +1071,26 @@ class jit_swish_emitter : public jit_emitter { void emit_data() const override; - static std::set> get_supported_precisions(const std::shared_ptr& node = nullptr); + static std::set> get_supported_precisions( + const std::shared_ptr& node = nullptr); private: std::unique_ptr sigmoid_emitter; float beta; - void emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const override; + void emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const override; template - void emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const; + void emit_isa(const std::vector& in_vec_idxs, const 
std::vector& out_vec_idxs) const; }; class jit_tanh_emitter : public jit_emitter { public: - jit_tanh_emitter(dnnl::impl::cpu::aarch64::jit_generator *host, + jit_tanh_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, ov::element::Type exec_prc = ov::element::f32); - jit_tanh_emitter(dnnl::impl::cpu::aarch64::jit_generator *host, + jit_tanh_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, const std::shared_ptr& node); @@ -988,17 +1104,18 @@ class jit_tanh_emitter : public jit_emitter { void emit_data() const override; - static std::set> get_supported_precisions(const std::shared_ptr& node = nullptr); + static std::set> get_supported_precisions( + const std::shared_ptr& node = nullptr); private: std::unique_ptr sigmoid_emitter; - void emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const override; + void emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const override; template - void emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const; + void emit_isa(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const; }; -} // namespace aarch64 -} // namespace intel_cpu -} // namespace ov +} // namespace aarch64 +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/emitters/plugin/aarch64/jit_emitter.cpp b/src/plugins/intel_cpu/src/emitters/plugin/aarch64/jit_emitter.cpp index bb7783b60a1b53..4c0b0f95f783c2 100644 --- a/src/plugins/intel_cpu/src/emitters/plugin/aarch64/jit_emitter.cpp +++ b/src/plugins/intel_cpu/src/emitters/plugin/aarch64/jit_emitter.cpp @@ -3,9 +3,11 @@ // #include "jit_emitter.hpp" + #include -#include "utils/general_utils.h" + #include "emitters/utils.hpp" +#include "utils/general_utils.h" using namespace dnnl::impl::cpu; using namespace dnnl::impl; @@ -16,26 +18,37 @@ namespace aarch64 { const std::vector jit_emitter::store_gpr_regs = { // Parameter/result registers - 0, 1, 2, 3, 4, 5, 6, 7, + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, // r8: Indirect result location register // r9...r15: Temporary registers - 9, 10, 11, 12, 13, 14, 15, - 16, 17, 18, + 9, + 10, + 11, + 12, + 13, + 14, + 15, + 16, + 17, + 18, // r19...r28: Callee-saved registers - 29, 30 -}; - -static const std::vector vec_regs = { - 0, 1, 2, 3, 4, 5, 6, 7, - 8, 9, 10, 11, 12, 13, 14, 15, - 16, 17, 18, 19, 20, 21, 22, 23, - 24, 25, 26, 27, 28, 29, 30, 31 -}; - -void jit_emitter::emit_code(const std::vector &in_idxs, - const std::vector &out_idxs, - const std::vector &pool_vec_idxs, - const std::vector &pool_gpr_idxs) const { + 29, + 30}; + +static const std::vector vec_regs = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31}; + +void jit_emitter::emit_code(const std::vector& in_idxs, + const std::vector& out_idxs, + const std::vector& pool_vec_idxs, + const std::vector& pool_gpr_idxs) const { emitter_preamble(in_idxs, out_idxs, pool_vec_idxs, pool_gpr_idxs); emit_impl(in_idxs, out_idxs); @@ -52,7 +65,7 @@ void jit_emitter::emit_data() const { // Run through the map and insert values stored there for (auto it = entry_map_.begin(); it != entry_map_.end(); it++) { - const auto &te = (*it).second; // get map entry for a given key + const auto& te = (*it).second; // get map entry for a given key const auto len = te.bcast ? 
get_vec_length() : sizeof(table_entry_val_t); for (size_t d = 0; d < len; d += sizeof(table_entry_val_t)) h->dd(te.val); @@ -88,7 +101,7 @@ void jit_emitter::prepare_table() { // prepare_table. size_t off = 0; for (auto it = entry_map_.begin(); it != entry_map_.end(); it++) { - auto &te = (*it).second; + auto& te = (*it).second; te.off = off; off += te.bcast ? get_vec_length() : sizeof(table_entry_val_t); } @@ -99,10 +112,10 @@ void jit_emitter::emitter_preamble(const std::vector& in_idxs, const std::vector& pool_aux_vec_idxs, const std::vector& pool_aux_gpr_idxs) const { using namespace Xbyak_aarch64::util; - const bool is_vec_input = (in_out_type_ == emitter_in_out_map::vec_to_vec) || - (in_out_type_ == emitter_in_out_map::vec_to_gpr); - const bool is_vec_output = (in_out_type_ == emitter_in_out_map::vec_to_vec) || - (in_out_type_ == emitter_in_out_map::gpr_to_vec); + const bool is_vec_input = + (in_out_type_ == emitter_in_out_map::vec_to_vec) || (in_out_type_ == emitter_in_out_map::vec_to_gpr); + const bool is_vec_output = + (in_out_type_ == emitter_in_out_map::vec_to_vec) || (in_out_type_ == emitter_in_out_map::gpr_to_vec); // vector registers for (auto idx : pool_aux_vec_idxs) { @@ -110,20 +123,27 @@ void jit_emitter::emitter_preamble(const std::vector& in_idxs, } for (size_t idx = 0; idx < get_max_vecs_count(); idx++) { - if (aux_vec_idxs.size() >= get_aux_vecs_count()) break; + if (aux_vec_idxs.size() >= get_aux_vecs_count()) + break; if (is_vec_input) { - if (std::find(in_idxs.begin(), in_idxs.end(), idx) != in_idxs.end()) continue; + if (std::find(in_idxs.begin(), in_idxs.end(), idx) != in_idxs.end()) + continue; } if (is_vec_output) { - if (std::find(out_idxs.begin(), out_idxs.end(), idx) != out_idxs.end()) continue; + if (std::find(out_idxs.begin(), out_idxs.end(), idx) != out_idxs.end()) + continue; } - if (std::find(in_idxs.begin(), in_idxs.end(), idx) != in_idxs.end()) continue; - if (std::find(out_idxs.begin(), out_idxs.end(), idx) != out_idxs.end()) continue; + if (std::find(in_idxs.begin(), in_idxs.end(), idx) != in_idxs.end()) + continue; + if (std::find(out_idxs.begin(), out_idxs.end(), idx) != out_idxs.end()) + continue; - if (std::find(aux_vec_idxs.begin(), aux_vec_idxs.end(), idx) != aux_vec_idxs.end()) continue; - if (std::find(preserved_vec_idxs.begin(), preserved_vec_idxs.end(), idx) != preserved_vec_idxs.end()) continue; + if (std::find(aux_vec_idxs.begin(), aux_vec_idxs.end(), idx) != aux_vec_idxs.end()) + continue; + if (std::find(preserved_vec_idxs.begin(), preserved_vec_idxs.end(), idx) != preserved_vec_idxs.end()) + continue; aux_vec_idxs.push_back(idx); preserved_vec_idxs.push_back(idx); @@ -138,23 +158,27 @@ void jit_emitter::emitter_preamble(const std::vector& in_idxs, const uint32_t end_gpr_idx = Xbyak_aarch64::Operand::X30; for (size_t gpr_idx = 0; gpr_idx <= end_gpr_idx; ++gpr_idx) { - size_t _idx = end_gpr_idx - gpr_idx; // we allocate from the end + size_t _idx = end_gpr_idx - gpr_idx; // we allocate from the end - if (aux_gpr_idxs.size() >= get_aux_gprs_count()) break; - if ((_idx == Xbyak_aarch64::Operand::X18) || - (_idx == Xbyak_aarch64::Operand::X23) || - (_idx == Xbyak_aarch64::Operand::X24) || - (_idx == Xbyak_aarch64::Operand::X28)) continue; + if (aux_gpr_idxs.size() >= get_aux_gprs_count()) + break; + if ((_idx == Xbyak_aarch64::Operand::X18) || (_idx == Xbyak_aarch64::Operand::X23) || + (_idx == Xbyak_aarch64::Operand::X24) || (_idx == Xbyak_aarch64::Operand::X28)) + continue; if (!is_vec_input) { - if (std::find(in_idxs.begin(), 
in_idxs.end(), _idx) != in_idxs.end()) continue; + if (std::find(in_idxs.begin(), in_idxs.end(), _idx) != in_idxs.end()) + continue; } if (!is_vec_output) { - if (std::find(out_idxs.begin(), out_idxs.end(), _idx) != out_idxs.end()) continue; + if (std::find(out_idxs.begin(), out_idxs.end(), _idx) != out_idxs.end()) + continue; } - if (std::find(aux_gpr_idxs.begin(), aux_gpr_idxs.end(), _idx) != aux_gpr_idxs.end()) continue; - if (std::find(preserved_gpr_idxs.begin(), preserved_gpr_idxs.end(), _idx) != preserved_gpr_idxs.end()) continue; + if (std::find(aux_gpr_idxs.begin(), aux_gpr_idxs.end(), _idx) != aux_gpr_idxs.end()) + continue; + if (std::find(preserved_gpr_idxs.begin(), preserved_gpr_idxs.end(), _idx) != preserved_gpr_idxs.end()) + continue; aux_gpr_idxs.push_back(_idx); preserved_gpr_idxs.push_back(_idx); @@ -189,23 +213,21 @@ void jit_emitter::store_context(const std::unordered_set& ignore_registe store_context(store_gpr_regs, vec_regs, ignore_registers); } -void jit_emitter::store_context( - const std::vector& gpr_regs, - const std::vector& vec_regs, - const std::unordered_set& ignore_vec_regs) const { +void jit_emitter::store_context(const std::vector& gpr_regs, + const std::vector& vec_regs, + const std::unordered_set& ignore_vec_regs) const { // 1. General-purpose Registers // 1.1. store pair registers const auto store_gpr_regs_size = gpr_regs.size(); const auto last = store_gpr_regs_size % 2; for (size_t i = 0; i < (store_gpr_regs_size - last); i += 2) { - h->stp(Xbyak_aarch64::XReg(gpr_regs[i]), - Xbyak_aarch64::XReg(gpr_regs[i + 1]), - pre_ptr(h->sp, -get_gpr_length() * 2)); + h->stp(Xbyak_aarch64::XReg(gpr_regs[i]), + Xbyak_aarch64::XReg(gpr_regs[i + 1]), + pre_ptr(h->sp, -get_gpr_length() * 2)); } // 1.2. store the remaining register if (last != 0) { - h->str(Xbyak_aarch64::XReg(gpr_regs[store_gpr_regs_size - 1]), - pre_ptr(h->sp, -get_gpr_length())); + h->str(Xbyak_aarch64::XReg(gpr_regs[store_gpr_regs_size - 1]), pre_ptr(h->sp, -get_gpr_length())); } // 2. SIMD and Floating-Point registers @@ -221,17 +243,14 @@ void jit_emitter::store_context( prev_reg_idx = static_cast(reg_idx); continue; } - h->stp(Xbyak_aarch64::QReg(prev_reg_idx), - Xbyak_aarch64::QReg(reg_idx), - pre_ptr(h->sp, -get_vec_length() * 2)); + h->stp(Xbyak_aarch64::QReg(prev_reg_idx), Xbyak_aarch64::QReg(reg_idx), pre_ptr(h->sp, -get_vec_length() * 2)); prev_reg_idx = -1; } // 2.1. store the remaining register if (prev_reg_idx != -1) { if (ignore_vec_regs.find(prev_reg_idx) == ignore_vec_regs.end()) { - h->str(Xbyak_aarch64::QReg(prev_reg_idx), - pre_ptr(h->sp, -get_vec_length())); + h->str(Xbyak_aarch64::QReg(prev_reg_idx), pre_ptr(h->sp, -get_vec_length())); } else { ignore_registers_count++; } @@ -245,10 +264,9 @@ void jit_emitter::restore_context(const std::unordered_set& ignore_vec_r restore_context(store_gpr_regs, vec_regs, ignore_vec_regs); } -void jit_emitter::restore_context( - const std::vector& gpr_regs, - const std::vector& vec_regs, - const std::unordered_set& ignore_vec_regs) const { +void jit_emitter::restore_context(const std::vector& gpr_regs, + const std::vector& vec_regs, + const std::unordered_set& ignore_vec_regs) const { // 1. SIMD and Floating-Point registers // 1.1. 
restore the remaining register auto v_last = (vec_regs.size() - ignore_vec_regs.size()) % 2; @@ -260,8 +278,7 @@ void jit_emitter::restore_context( continue; } - h->ldr(Xbyak_aarch64::QReg(reg_idx), - post_ptr(h->sp, get_vec_length())); + h->ldr(Xbyak_aarch64::QReg(reg_idx), post_ptr(h->sp, get_vec_length())); break; } } @@ -278,9 +295,7 @@ void jit_emitter::restore_context( prev_reg_idx = static_cast(reg_idx); continue; } - h->ldp(Xbyak_aarch64::QReg(reg_idx), - Xbyak_aarch64::QReg(prev_reg_idx), - post_ptr(h->sp, get_vec_length() * 2)); + h->ldp(Xbyak_aarch64::QReg(reg_idx), Xbyak_aarch64::QReg(prev_reg_idx), post_ptr(h->sp, get_vec_length() * 2)); prev_reg_idx = -1; } @@ -292,8 +307,7 @@ void jit_emitter::restore_context( const auto save_gpr_regs_size = gpr_regs.size(); const auto last = save_gpr_regs_size % 2; if (last != 0) { - h->ldr(Xbyak_aarch64::XReg(gpr_regs[save_gpr_regs_size - 1]), - post_ptr(h->sp, get_gpr_length())); + h->ldr(Xbyak_aarch64::XReg(gpr_regs[save_gpr_regs_size - 1]), post_ptr(h->sp, get_gpr_length())); } // 2.2. restore pair registers @@ -304,6 +318,6 @@ void jit_emitter::restore_context( } } -} // namespace aarch64 -} // namespace intel_cpu -} // namespace ov +} // namespace aarch64 +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/emitters/plugin/aarch64/jit_emitter.hpp b/src/plugins/intel_cpu/src/emitters/plugin/aarch64/jit_emitter.hpp index ba9b8c2d9cbdf1..9ce8203afe7783 100644 --- a/src/plugins/intel_cpu/src/emitters/plugin/aarch64/jit_emitter.hpp +++ b/src/plugins/intel_cpu/src/emitters/plugin/aarch64/jit_emitter.hpp @@ -4,14 +4,13 @@ #pragma once +#include #include #include -#include -#include "snippets/snippets_isa.hpp" -#include "snippets/generator.hpp" #include "node.h" - +#include "snippets/generator.hpp" +#include "snippets/snippets_isa.hpp" namespace ov { namespace intel_cpu { @@ -29,25 +28,32 @@ class jit_emitter : public ov::snippets::Emitter { jit_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, ov::element::Type exec_prc = ov::element::f32, - emitter_in_out_map in_out_type = emitter_in_out_map::vec_to_vec) : - Emitter(), h(host), host_isa_(host_isa), exec_prc_(exec_prc), - in_out_type_(in_out_type), p_table(0), l_table (new Xbyak_aarch64::Label()) { - } + emitter_in_out_map in_out_type = emitter_in_out_map::vec_to_vec) + : Emitter(), + h(host), + host_isa_(host_isa), + exec_prc_(exec_prc), + in_out_type_(in_out_type), + p_table(0), + l_table(new Xbyak_aarch64::Label()) {} jit_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, const std::shared_ptr& n, ov::element::Type exec_prc = ov::element::f32, - emitter_in_out_map in_out_type = emitter_in_out_map::vec_to_vec) : - Emitter(), h(host), host_isa_(host_isa), exec_prc_(exec_prc), - in_out_type_(in_out_type), p_table(0), l_table (new Xbyak_aarch64::Label()) { - } - - void emit_code( - const std::vector &in_idxs, - const std::vector &out_idxs, - const std::vector &pool_vec_idxs = {}, - const std::vector &pool_gpr_idxs = {}) const override; + emitter_in_out_map in_out_type = emitter_in_out_map::vec_to_vec) + : Emitter(), + h(host), + host_isa_(host_isa), + exec_prc_(exec_prc), + in_out_type_(in_out_type), + p_table(0), + l_table(new Xbyak_aarch64::Label()) {} + + void emit_code(const std::vector& in_idxs, + const std::vector& out_idxs, + const std::vector& pool_vec_idxs = {}, + const std::vector& pool_gpr_idxs = {}) const override; void emit_data() const override; @@ 
-60,7 +66,8 @@ class jit_emitter : public ov::snippets::Emitter { * Precisions are ordered, the first bigger bitness precision with the same type will be selected. * Empty collection means the emitter supports any input precisions. */ - static std::set> get_supported_precisions(const std::shared_ptr& node = nullptr); + static std::set> get_supported_precisions( + const std::shared_ptr& node = nullptr); protected: size_t get_max_vecs_count() const; @@ -78,12 +85,14 @@ class jit_emitter : public ov::snippets::Emitter { virtual void prepare_table(); virtual void register_table_entries() {} - void load_table_addr() const { h->adr(p_table, *l_table.get()); } + void load_table_addr() const { + h->adr(p_table, *l_table.get()); + } // we accept only 32bit hexadecimal table values to avoid any rounding using table_entry_val_t = uint32_t; - using table_entry_offset_t = size_t; // offsets are in bytes wrt p_table - using table_entry_bcast_t = bool; // true => bcast value + using table_entry_offset_t = size_t; // offsets are in bytes wrt p_table + using table_entry_bcast_t = bool; // true => bcast value struct table_entry_t { table_entry_val_t val; @@ -98,7 +107,7 @@ class jit_emitter : public ov::snippets::Emitter { mutable Xbyak_aarch64::XReg p_table; mutable std::shared_ptr l_table; - virtual void emit_impl(const std::vector &in_idxs, const std::vector &out_idxs) const = 0; + virtual void emit_impl(const std::vector& in_idxs, const std::vector& out_idxs) const = 0; virtual void emitter_preamble(const std::vector& in_idxs, const std::vector& out_idxs, @@ -128,14 +137,14 @@ class jit_emitter : public ov::snippets::Emitter { } void push_arg_entry_of(const std::string key, const table_entry_val_t val, const bool broadcast) { - mapped_table_entry_t te {0, val, broadcast}; + mapped_table_entry_t te{0, val, broadcast}; entry_map_.insert(std::make_pair(key, te)); } - void push_entries_of(const table_t &t) { + void push_entries_of(const table_t& t) { for (auto it = t.begin(); it != t.end(); it++) { auto key = (*it).first; - auto te = (*it).second; // copy values from table + auto te = (*it).second; // copy values from table push_arg_entry_of(key, te.val, te.bcast); } } @@ -150,9 +159,9 @@ class jit_emitter : public ov::snippets::Emitter { size_t table_off(const std::string& key, const size_t key_off_val_shift = 0) const { // assumption: all table entries sharing the same key also // share their broadcast property - const auto it = entry_map_.find(key); // search an entry for a key + const auto it = entry_map_.find(key); // search an entry for a key assert(it != entry_map_.end()); - const auto &te = (*it).second; + const auto& te = (*it).second; const auto scale = te.bcast ? 
get_vec_length() : sizeof(table_entry_val_t); return te.off + key_off_val_shift * scale; } @@ -176,6 +185,6 @@ class jit_emitter : public ov::snippets::Emitter { const std::unordered_set& ignore_vec_regs = {}) const; }; -} // namespace aarch64 -} // namespace intel_cpu -} // namespace ov +} // namespace aarch64 +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/emitters/plugin/aarch64/jit_load_store_emitters.cpp b/src/plugins/intel_cpu/src/emitters/plugin/aarch64/jit_load_store_emitters.cpp index ca18bc5d4b575d..3ca77bdac53baf 100644 --- a/src/plugins/intel_cpu/src/emitters/plugin/aarch64/jit_load_store_emitters.cpp +++ b/src/plugins/intel_cpu/src/emitters/plugin/aarch64/jit_load_store_emitters.cpp @@ -3,6 +3,7 @@ // #include "jit_load_store_emitters.hpp" + #include "cpu/aarch64/cpu_isa_traits.hpp" #include "emitters/utils.hpp" @@ -15,14 +16,23 @@ namespace aarch64 { using jit_generator = dnnl::impl::cpu::aarch64::jit_generator; using cpu_isa_t = dnnl::impl::cpu::aarch64::cpu_isa_t; -jit_load_emitter::jit_load_emitter(dnnl::impl::cpu::aarch64::jit_generator *host, dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, - ov::element::Type src_prc, ov::element::Type dst_prc, int load_num, int byte_offset, - ov::element::Type exec_prc, emitter_in_out_map in_out_type) -: jit_emitter(host, host_isa, exec_prc, in_out_type), name_("unknown"), load_num_(load_num), byte_offset_(byte_offset), prc_(src_prc) { +jit_load_emitter::jit_load_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, + dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, + ov::element::Type src_prc, + ov::element::Type dst_prc, + int load_num, + int byte_offset, + ov::element::Type exec_prc, + emitter_in_out_map in_out_type) + : jit_emitter(host, host_isa, exec_prc, in_out_type), + name_("unknown"), + load_num_(load_num), + byte_offset_(byte_offset), + prc_(src_prc) { OV_CPU_JIT_EMITTER_ASSERT(src_prc == dst_prc, "Unsupported precision pair."); } -void jit_load_emitter::emit_impl(const std::vector &in_idxs, const std::vector &out_idxs) const { +void jit_load_emitter::emit_impl(const std::vector& in_idxs, const std::vector& out_idxs) const { if (host_isa_ == dnnl::impl::cpu::aarch64::asimd) { emit_isa(in_idxs, out_idxs); } else { @@ -31,7 +41,7 @@ void jit_load_emitter::emit_impl(const std::vector &in_idxs, const std:: } template -void jit_load_emitter::load_qbyte(const std::vector &in_idxs, const std::vector &out_idxs) const { +void jit_load_emitter::load_qbyte(const std::vector& in_idxs, const std::vector& out_idxs) const { using TReg = typename dnnl::impl::cpu::aarch64::cpu_isa_traits::TReg; XReg src = XReg(in_idxs[0]); TReg dst = TReg(out_idxs[0]); @@ -39,31 +49,31 @@ void jit_load_emitter::load_qbyte(const std::vector &in_idxs, const std: DReg dst_d = DReg(out_idxs[0]); switch (load_num_) { - case 0: - break; - case 1: - h->ldr(dst_s, ptr(src, byte_offset_)); - break; - case 2: - h->ldr(dst_d, ptr(src, byte_offset_)); - break; - case 3: { - XReg prc = XReg(aux_gpr_idxs[0]); - h->ldr(dst_d, ptr(src, byte_offset_)); - h->add_imm(prc, src, byte_offset_ + 2 * sizeof(float), h->X_DEFAULT_ADDR); - h->ld1(dst.s[2], ptr(prc)); - break; - } - case 4: - h->uni_ldr(dst, src, byte_offset_); - break; - default: - OV_CPU_JIT_EMITTER_THROW("Unexpected number of elements to load."); + case 0: + break; + case 1: + h->ldr(dst_s, ptr(src, byte_offset_)); + break; + case 2: + h->ldr(dst_d, ptr(src, byte_offset_)); + break; + case 3: { + XReg prc = XReg(aux_gpr_idxs[0]); + h->ldr(dst_d, ptr(src, byte_offset_)); + 
h->add_imm(prc, src, byte_offset_ + 2 * sizeof(float), h->X_DEFAULT_ADDR); + h->ld1(dst.s[2], ptr(prc)); + break; + } + case 4: + h->uni_ldr(dst, src, byte_offset_); + break; + default: + OV_CPU_JIT_EMITTER_THROW("Unexpected number of elements to load."); } } template -void jit_load_emitter::load_dbyte(const std::vector &in_idxs, const std::vector &out_idxs) const { +void jit_load_emitter::load_dbyte(const std::vector& in_idxs, const std::vector& out_idxs) const { using TReg = typename dnnl::impl::cpu::aarch64::cpu_isa_traits::TReg; XReg src = XReg(in_idxs[0]); TReg dst = TReg(out_idxs[0]); @@ -72,31 +82,31 @@ void jit_load_emitter::load_dbyte(const std::vector &in_idxs, const std: DReg dst_d = DReg(out_idxs[0]); switch (load_num_) { - case 0: - break; - case 1: - h->ldr(dst_h, ptr(src, byte_offset_)); - break; - case 2: - h->ldr(dst_s, ptr(src, byte_offset_)); - break; - case 3: { - XReg prc = XReg(aux_gpr_idxs[0]); - h->ldr(dst_s, ptr(src, byte_offset_)); - h->add_imm(prc, src, byte_offset_ + 2 * sizeof(uint16_t), h->X_DEFAULT_ADDR); - h->ld1(dst.h[2], ptr(prc)); - break; - } - case 4: - h->ldr(dst_d, ptr(src, byte_offset_)); - break; - default: - OV_CPU_JIT_EMITTER_THROW("Unexpected number of elements to load."); + case 0: + break; + case 1: + h->ldr(dst_h, ptr(src, byte_offset_)); + break; + case 2: + h->ldr(dst_s, ptr(src, byte_offset_)); + break; + case 3: { + XReg prc = XReg(aux_gpr_idxs[0]); + h->ldr(dst_s, ptr(src, byte_offset_)); + h->add_imm(prc, src, byte_offset_ + 2 * sizeof(uint16_t), h->X_DEFAULT_ADDR); + h->ld1(dst.h[2], ptr(prc)); + break; + } + case 4: + h->ldr(dst_d, ptr(src, byte_offset_)); + break; + default: + OV_CPU_JIT_EMITTER_THROW("Unexpected number of elements to load."); } } template -void jit_load_emitter::load_byte(const std::vector &in_idxs, const std::vector &out_idxs) const { +void jit_load_emitter::load_byte(const std::vector& in_idxs, const std::vector& out_idxs) const { using TReg = typename dnnl::impl::cpu::aarch64::cpu_isa_traits::TReg; XReg src = XReg(in_idxs[0]); TReg dst = TReg(out_idxs[0]); @@ -105,49 +115,50 @@ void jit_load_emitter::load_byte(const std::vector &in_idxs, const std:: SReg dst_s = SReg(out_idxs[0]); switch (load_num_) { - case 0: - break; - case 1: - h->ldr(dst_b, ptr(src, byte_offset_)); - break; - case 2: - h->ldr(dst_h, ptr(src, byte_offset_)); - break; - case 3: { - XReg prc = XReg(aux_gpr_idxs[0]); - h->ldr(dst_h, ptr(src, byte_offset_)); - h->add_imm(prc, src, byte_offset_ + 2 * sizeof(int8_t), h->X_DEFAULT_ADDR); - h->ld1(dst.b[2], ptr(prc)); - break; - } - case 4: - h->ldr(dst_s, ptr(src, byte_offset_)); - break; - default: - OV_CPU_JIT_EMITTER_THROW("Unexpected number of elements to load."); + case 0: + break; + case 1: + h->ldr(dst_b, ptr(src, byte_offset_)); + break; + case 2: + h->ldr(dst_h, ptr(src, byte_offset_)); + break; + case 3: { + XReg prc = XReg(aux_gpr_idxs[0]); + h->ldr(dst_h, ptr(src, byte_offset_)); + h->add_imm(prc, src, byte_offset_ + 2 * sizeof(int8_t), h->X_DEFAULT_ADDR); + h->ld1(dst.b[2], ptr(prc)); + break; + } + case 4: + h->ldr(dst_s, ptr(src, byte_offset_)); + break; + default: + OV_CPU_JIT_EMITTER_THROW("Unexpected number of elements to load."); } } template -void jit_load_emitter::emit_isa(const std::vector &in_idxs, const std::vector &out_idxs) const { - OV_CPU_JIT_EMITTER_ASSERT(one_of(prc_, ov::element::f32, ov::element::i32, ov::element::f16, ov::element::i8, ov::element::u8), - "Unsupported precision."); +void jit_load_emitter::emit_isa(const std::vector& in_idxs, const std::vector& 
out_idxs) const { + OV_CPU_JIT_EMITTER_ASSERT( + one_of(prc_, ov::element::f32, ov::element::i32, ov::element::f16, ov::element::i8, ov::element::u8), + "Unsupported precision."); OV_CPU_JIT_EMITTER_ASSERT(load_num_ <= 4, "Unexpected number of elements to load."); switch (prc_) { - case ov::element::f32: - case ov::element::i32: - load_qbyte(in_idxs, out_idxs); - break; - case ov::element::f16: - load_dbyte(in_idxs, out_idxs); - break; - case ov::element::i8: - case ov::element::u8: - load_byte(in_idxs, out_idxs); - break; - default: - OV_CPU_JIT_EMITTER_THROW("Unsupported precision: ", prc_.get_type_name()); + case ov::element::f32: + case ov::element::i32: + load_qbyte(in_idxs, out_idxs); + break; + case ov::element::f16: + load_dbyte(in_idxs, out_idxs); + break; + case ov::element::i8: + case ov::element::u8: + load_byte(in_idxs, out_idxs); + break; + default: + OV_CPU_JIT_EMITTER_THROW("Unsupported precision: ", prc_.get_type_name()); } } @@ -158,14 +169,24 @@ size_t jit_load_emitter::get_aux_gprs_count() const { return 0; } -jit_store_emitter::jit_store_emitter(dnnl::impl::cpu::aarch64::jit_generator *host, dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, - ov::element::Type src_prc, ov::element::Type dst_prc, int store_num, int byte_offset, - arithmetic_mode mode, ov::element::Type exec_prc, emitter_in_out_map in_out_type) - : jit_emitter(host, host_isa, exec_prc, in_out_type), name_("unknown"), store_num_(store_num), byte_offset_(byte_offset), prc_(dst_prc) { +jit_store_emitter::jit_store_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, + dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, + ov::element::Type src_prc, + ov::element::Type dst_prc, + int store_num, + int byte_offset, + arithmetic_mode mode, + ov::element::Type exec_prc, + emitter_in_out_map in_out_type) + : jit_emitter(host, host_isa, exec_prc, in_out_type), + name_("unknown"), + store_num_(store_num), + byte_offset_(byte_offset), + prc_(dst_prc) { OV_CPU_JIT_EMITTER_ASSERT(src_prc == dst_prc, "Unsupported precision pair."); } -void jit_store_emitter::emit_impl(const std::vector &in_idxs, const std::vector &out_idxs) const { +void jit_store_emitter::emit_impl(const std::vector& in_idxs, const std::vector& out_idxs) const { if (host_isa_ == dnnl::impl::cpu::aarch64::asimd) { emit_isa(in_idxs, out_idxs); } else { @@ -174,7 +195,7 @@ void jit_store_emitter::emit_impl(const std::vector &in_idxs, const std: } template -void jit_store_emitter::store_qbyte(const std::vector &in_idxs, const std::vector &out_idxs) const { +void jit_store_emitter::store_qbyte(const std::vector& in_idxs, const std::vector& out_idxs) const { using TReg = typename dnnl::impl::cpu::aarch64::cpu_isa_traits::TReg; TReg src = TReg(in_idxs[0]); SReg src_s = SReg(in_idxs[0]); @@ -183,31 +204,31 @@ void jit_store_emitter::store_qbyte(const std::vector &in_idxs, const st XReg dst = XReg(out_idxs[0]); switch (store_num_) { - case 0: - break; - case 1: - h->str(src_s, ptr(dst, byte_offset_)); - break; - case 2: - h->str(src_d, ptr(dst, byte_offset_)); - break; - case 3: { - XReg prc = XReg(aux_gpr_idxs[0]); - h->str(src_d, ptr(dst, byte_offset_)); - h->add_imm(prc, dst, byte_offset_ + 2 * sizeof(float), h->X_DEFAULT_ADDR); - h->st1(src.s[2], ptr(prc)); - break; - } - case 4: - h->str(src_q, ptr(dst, byte_offset_)); - break; - default: - OV_CPU_JIT_EMITTER_THROW("Unexpected number of elements to store."); + case 0: + break; + case 1: + h->str(src_s, ptr(dst, byte_offset_)); + break; + case 2: + h->str(src_d, ptr(dst, byte_offset_)); + break; + case 3: { + XReg 
prc = XReg(aux_gpr_idxs[0]); + h->str(src_d, ptr(dst, byte_offset_)); + h->add_imm(prc, dst, byte_offset_ + 2 * sizeof(float), h->X_DEFAULT_ADDR); + h->st1(src.s[2], ptr(prc)); + break; + } + case 4: + h->str(src_q, ptr(dst, byte_offset_)); + break; + default: + OV_CPU_JIT_EMITTER_THROW("Unexpected number of elements to store."); } } template -void jit_store_emitter::store_dbyte(const std::vector &in_idxs, const std::vector &out_idxs) const { +void jit_store_emitter::store_dbyte(const std::vector& in_idxs, const std::vector& out_idxs) const { using TReg = typename dnnl::impl::cpu::aarch64::cpu_isa_traits::TReg; TReg src = TReg(in_idxs[0]); HReg src_h = HReg(in_idxs[0]); @@ -216,31 +237,31 @@ void jit_store_emitter::store_dbyte(const std::vector &in_idxs, const st XReg dst = XReg(out_idxs[0]); switch (store_num_) { - case 0: - break; - case 1: - h->str(src_h, ptr(dst, byte_offset_)); - break; - case 2: - h->str(src_s, ptr(dst, byte_offset_)); - break; - case 3: { - XReg prc = XReg(aux_gpr_idxs[0]); - h->str(src_s, ptr(dst, byte_offset_)); - h->add_imm(prc, dst, byte_offset_ + 2 * sizeof(uint16_t), h->X_DEFAULT_ADDR); - h->st1(src.h[2], ptr(prc)); - break; - } - case 4: - h->str(src_d, ptr(dst, byte_offset_)); - break; - default: - OV_CPU_JIT_EMITTER_THROW("Unexpected number of elements to store."); + case 0: + break; + case 1: + h->str(src_h, ptr(dst, byte_offset_)); + break; + case 2: + h->str(src_s, ptr(dst, byte_offset_)); + break; + case 3: { + XReg prc = XReg(aux_gpr_idxs[0]); + h->str(src_s, ptr(dst, byte_offset_)); + h->add_imm(prc, dst, byte_offset_ + 2 * sizeof(uint16_t), h->X_DEFAULT_ADDR); + h->st1(src.h[2], ptr(prc)); + break; + } + case 4: + h->str(src_d, ptr(dst, byte_offset_)); + break; + default: + OV_CPU_JIT_EMITTER_THROW("Unexpected number of elements to store."); } } template -void jit_store_emitter::store_byte(const std::vector &in_idxs, const std::vector &out_idxs) const { +void jit_store_emitter::store_byte(const std::vector& in_idxs, const std::vector& out_idxs) const { using TReg = typename dnnl::impl::cpu::aarch64::cpu_isa_traits::TReg; TReg src = TReg(in_idxs[0]); BReg src_b = BReg(in_idxs[0]); @@ -249,49 +270,50 @@ void jit_store_emitter::store_byte(const std::vector &in_idxs, const std XReg dst = XReg(out_idxs[0]); switch (store_num_) { - case 0: - break; - case 1: - h->str(src_b, ptr(dst, byte_offset_)); - break; - case 2: - h->str(src_h, ptr(dst, byte_offset_)); - break; - case 3: { - XReg prc = XReg(aux_gpr_idxs[0]); - h->str(src_h, ptr(dst, byte_offset_)); - h->add_imm(prc, dst, byte_offset_ + 2 * sizeof(int8_t), h->X_DEFAULT_ADDR); - h->st1(src.b[2], ptr(prc)); - break; - } - case 4: - h->str(src_s, ptr(dst, byte_offset_)); - break; - default: - OV_CPU_JIT_EMITTER_THROW("Unexpected number of elements to store."); + case 0: + break; + case 1: + h->str(src_b, ptr(dst, byte_offset_)); + break; + case 2: + h->str(src_h, ptr(dst, byte_offset_)); + break; + case 3: { + XReg prc = XReg(aux_gpr_idxs[0]); + h->str(src_h, ptr(dst, byte_offset_)); + h->add_imm(prc, dst, byte_offset_ + 2 * sizeof(int8_t), h->X_DEFAULT_ADDR); + h->st1(src.b[2], ptr(prc)); + break; + } + case 4: + h->str(src_s, ptr(dst, byte_offset_)); + break; + default: + OV_CPU_JIT_EMITTER_THROW("Unexpected number of elements to store."); } } template -void jit_store_emitter::emit_isa(const std::vector &in_idxs, const std::vector &out_idxs) const { - OV_CPU_JIT_EMITTER_ASSERT(one_of(prc_, ov::element::f32, ov::element::i32, ov::element::f16, ov::element::i8, ov::element::u8), - "Unsupported 
precision."); +void jit_store_emitter::emit_isa(const std::vector& in_idxs, const std::vector& out_idxs) const { + OV_CPU_JIT_EMITTER_ASSERT( + one_of(prc_, ov::element::f32, ov::element::i32, ov::element::f16, ov::element::i8, ov::element::u8), + "Unsupported precision."); OV_CPU_JIT_EMITTER_ASSERT(store_num_ <= 4, "Unexpected number of elements to store."); switch (prc_) { - case ov::element::f32: - case ov::element::i32: - store_qbyte(in_idxs, out_idxs); - break; - case ov::element::f16: - store_dbyte(in_idxs, out_idxs); - break; - case ov::element::i8: - case ov::element::u8: - store_byte(in_idxs, out_idxs); - break; - default: - OV_CPU_JIT_EMITTER_THROW("Unsupported precision: ", prc_.get_type_name()); + case ov::element::f32: + case ov::element::i32: + store_qbyte(in_idxs, out_idxs); + break; + case ov::element::f16: + store_dbyte(in_idxs, out_idxs); + break; + case ov::element::i8: + case ov::element::u8: + store_byte(in_idxs, out_idxs); + break; + default: + OV_CPU_JIT_EMITTER_THROW("Unsupported precision: ", prc_.get_type_name()); } } @@ -302,6 +324,6 @@ size_t jit_store_emitter::get_aux_gprs_count() const { return 0; } -} // namespace aarch64 -} // namespace intel_cpu -} // namespace ov +} // namespace aarch64 +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/emitters/plugin/aarch64/jit_load_store_emitters.hpp b/src/plugins/intel_cpu/src/emitters/plugin/aarch64/jit_load_store_emitters.hpp index 887522ed1055e1..8c0983189f3083 100644 --- a/src/plugins/intel_cpu/src/emitters/plugin/aarch64/jit_load_store_emitters.hpp +++ b/src/plugins/intel_cpu/src/emitters/plugin/aarch64/jit_load_store_emitters.hpp @@ -4,38 +4,41 @@ #pragma once -#include "jit_emitter.hpp" #include "cpu/aarch64/jit_generator.hpp" +#include "jit_emitter.hpp" namespace ov { namespace intel_cpu { namespace aarch64 { // Arithmetic modes for data type conversion in store_emitter -enum class arithmetic_mode { - saturation, - truncation -}; +enum class arithmetic_mode { saturation, truncation }; class jit_load_emitter : public jit_emitter { public: - jit_load_emitter(dnnl::impl::cpu::aarch64::jit_generator *host, dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, - ov::element::Type src_prc, ov::element::Type dst_prc, int load_num, int byte_offset, + jit_load_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, + dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, + ov::element::Type src_prc, + ov::element::Type dst_prc, + int load_num, + int byte_offset, ov::element::Type exec_prc = ov::element::f32, emitter_in_out_map in_out_type = emitter_in_out_map::gpr_to_vec); - void emit_impl(const std::vector &in_idxs, const std::vector &out_idxs) const override; - size_t get_inputs_count() const override { return 1; }; + void emit_impl(const std::vector& in_idxs, const std::vector& out_idxs) const override; + size_t get_inputs_count() const override { + return 1; + }; private: template - void emit_isa(const std::vector &in_idxs, const std::vector &out_idxs) const; + void emit_isa(const std::vector& in_idxs, const std::vector& out_idxs) const; template - void load_qbyte(const std::vector &in_idxs, const std::vector &out_idxs) const; + void load_qbyte(const std::vector& in_idxs, const std::vector& out_idxs) const; template - void load_dbyte(const std::vector &in_idxs, const std::vector &out_idxs) const; + void load_dbyte(const std::vector& in_idxs, const std::vector& out_idxs) const; template - void load_byte(const std::vector &in_idxs, const std::vector &out_idxs) const; + void load_byte(const std::vector& 
in_idxs, const std::vector& out_idxs) const; size_t get_aux_gprs_count() const override; std::string name_; @@ -46,23 +49,30 @@ class jit_load_emitter : public jit_emitter { class jit_store_emitter : public jit_emitter { public: - jit_store_emitter(dnnl::impl::cpu::aarch64::jit_generator *host, dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, - ov::element::Type src_prc, ov::element::Type dst_prc, int store_num, int byte_offset_, - arithmetic_mode mode = arithmetic_mode::saturation, ov::element::Type exec_prc = ov::element::f32, + jit_store_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, + dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, + ov::element::Type src_prc, + ov::element::Type dst_prc, + int store_num, + int byte_offset_, + arithmetic_mode mode = arithmetic_mode::saturation, + ov::element::Type exec_prc = ov::element::f32, emitter_in_out_map in_out_type = emitter_in_out_map::vec_to_gpr); - void emit_impl(const std::vector &in_idxs, const std::vector &out_idxs) const override; - size_t get_inputs_count() const override { return 1; } + void emit_impl(const std::vector& in_idxs, const std::vector& out_idxs) const override; + size_t get_inputs_count() const override { + return 1; + } private: template - void emit_isa(const std::vector &in_idxs, const std::vector &out_idxs) const; + void emit_isa(const std::vector& in_idxs, const std::vector& out_idxs) const; template - void store_qbyte(const std::vector &in_idxs, const std::vector &out_idxs) const; + void store_qbyte(const std::vector& in_idxs, const std::vector& out_idxs) const; template - void store_dbyte(const std::vector &in_idxs, const std::vector &out_idxs) const; + void store_dbyte(const std::vector& in_idxs, const std::vector& out_idxs) const; template - void store_byte(const std::vector &in_idxs, const std::vector &out_idxs) const; + void store_byte(const std::vector& in_idxs, const std::vector& out_idxs) const; size_t get_aux_gprs_count() const override; std::string name_; @@ -71,6 +81,6 @@ class jit_store_emitter : public jit_emitter { ov::element::Type prc_; }; -} // namespace aarch64 -} // namespace intel_cpu -} // namespace ov +} // namespace aarch64 +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/emitters/plugin/x64/debug_capabilities.cpp b/src/plugins/intel_cpu/src/emitters/plugin/x64/debug_capabilities.cpp index 01af9dbde7fe01..c2c6ddf6f271fc 100644 --- a/src/plugins/intel_cpu/src/emitters/plugin/x64/debug_capabilities.cpp +++ b/src/plugins/intel_cpu/src/emitters/plugin/x64/debug_capabilities.cpp @@ -4,9 +4,10 @@ #ifdef CPU_DEBUG_CAPS -#include "debug_capabilities.hpp" -#include -#include +# include "debug_capabilities.hpp" + +# include +# include namespace ov { namespace intel_cpu { @@ -14,25 +15,26 @@ namespace intel_cpu { using namespace Xbyak; using namespace dnnl::impl::cpu::x64; -template void RegPrinter::print(jit_generator &h, Xmm reg, const char *name); -template void RegPrinter::print(jit_generator &h, Xmm reg, const char *name); -template void RegPrinter::print(jit_generator &h, Ymm reg, const char *name); -template void RegPrinter::print(jit_generator &h, Ymm reg, const char *name); -template void RegPrinter::print(jit_generator &h, Zmm reg, const char *name); -template void RegPrinter::print(jit_generator &h, Zmm reg, const char *name); -template void RegPrinter::print(jit_generator &h, Reg64 reg, const char *name); -template void RegPrinter::print(jit_generator &h, Reg64 reg, const char *name); -template void RegPrinter::print(jit_generator &h, Reg32 reg, const char 
*name); -template void RegPrinter::print(jit_generator &h, Reg32 reg, const char *name); -template void RegPrinter::print(jit_generator &h, Reg16 reg, const char *name); -template void RegPrinter::print(jit_generator &h, Reg16 reg, const char *name); -template void RegPrinter::print(jit_generator &h, Reg8 reg, const char *name); -template void RegPrinter::print(jit_generator &h, Reg8 reg, const char *name); +template void RegPrinter::print(jit_generator& h, Xmm reg, const char* name); +template void RegPrinter::print(jit_generator& h, Xmm reg, const char* name); +template void RegPrinter::print(jit_generator& h, Ymm reg, const char* name); +template void RegPrinter::print(jit_generator& h, Ymm reg, const char* name); +template void RegPrinter::print(jit_generator& h, Zmm reg, const char* name); +template void RegPrinter::print(jit_generator& h, Zmm reg, const char* name); +template void RegPrinter::print(jit_generator& h, Reg64 reg, const char* name); +template void RegPrinter::print(jit_generator& h, Reg64 reg, const char* name); +template void RegPrinter::print(jit_generator& h, Reg32 reg, const char* name); +template void RegPrinter::print(jit_generator& h, Reg32 reg, const char* name); +template void RegPrinter::print(jit_generator& h, Reg16 reg, const char* name); +template void RegPrinter::print(jit_generator& h, Reg16 reg, const char* name); +template void RegPrinter::print(jit_generator& h, Reg8 reg, const char* name); +template void RegPrinter::print(jit_generator& h, Reg8 reg, const char* name); template -void RegPrinter::print_reg_prc(const char *name, const char *ori_name, T *ptr) { +void RegPrinter::print_reg_prc(const char* name, const char* ori_name, T* ptr) { std::stringstream ss; - if (name) ss << name << " | "; + if (name) + ss << name << " | "; ss << ori_name << ": "; if (std::is_floating_point::value) { ss << *ptr; @@ -48,9 +50,10 @@ void RegPrinter::print_reg_prc(const char *name, const char *ori_name, T *ptr) { } template -void RegPrinter::print_vmm_prc(const char *name, const char *ori_name, PRC_T *ptr) { +void RegPrinter::print_vmm_prc(const char* name, const char* ori_name, PRC_T* ptr) { std::stringstream ss; - if (name) ss << name << " | "; + if (name) + ss << name << " | "; ss << ori_name << ": {" << ptr[0]; for (size_t i = 1; i < vlen / sizeof(float); i++) { ss << ", " << ptr[i]; @@ -58,15 +61,15 @@ void RegPrinter::print_vmm_prc(const char *name, const char *ori_name, PRC_T *pt ss << "}" << std::endl; std::cout << ss.str(); } -template void RegPrinter::print_vmm_prc(const char *name, const char *ori_name, float *ptr); -template void RegPrinter::print_vmm_prc(const char *name, const char *ori_name, float *ptr); -template void RegPrinter::print_vmm_prc(const char *name, const char *ori_name, float *ptr); -template void RegPrinter::print_vmm_prc(const char *name, const char *ori_name, int *ptr); -template void RegPrinter::print_vmm_prc(const char *name, const char *ori_name, int *ptr); -template void RegPrinter::print_vmm_prc(const char *name, const char *ori_name, int *ptr); +template void RegPrinter::print_vmm_prc(const char* name, const char* ori_name, float* ptr); +template void RegPrinter::print_vmm_prc(const char* name, const char* ori_name, float* ptr); +template void RegPrinter::print_vmm_prc(const char* name, const char* ori_name, float* ptr); +template void RegPrinter::print_vmm_prc(const char* name, const char* ori_name, int* ptr); +template void RegPrinter::print_vmm_prc(const char* name, const char* ori_name, int* ptr); +template void 
RegPrinter::print_vmm_prc(const char* name, const char* ori_name, int* ptr); template -struct vmm_traits{}; +struct vmm_traits {}; template <> struct vmm_traits { @@ -87,7 +90,7 @@ struct vmm_traits { }; template -void RegPrinter::save_vmm(jit_generator &h) { +void RegPrinter::save_vmm(jit_generator& h) { h.sub(h.rsp, vmm_traits::vmm_len * vmm_traits::vmm_cnt); for (size_t i = 0; i < vmm_traits::vmm_cnt; i++) { h.uni_vmovups(h.ptr[h.rsp + i * vmm_traits::vmm_len], T(i)); @@ -95,52 +98,52 @@ void RegPrinter::save_vmm(jit_generator &h) { } template -void RegPrinter::restore_vmm(jit_generator &h) { +void RegPrinter::restore_vmm(jit_generator& h) { for (size_t i = 0; i < vmm_traits::vmm_cnt; i++) { h.uni_vmovups(T(i), h.ptr[h.rsp + i * vmm_traits::vmm_len]); } h.add(h.rsp, vmm_traits::vmm_len * vmm_traits::vmm_cnt); } -void RegPrinter::save_reg(jit_generator &h) { +void RegPrinter::save_reg(jit_generator& h) { h.sub(h.rsp, reg_len * reg_cnt); for (size_t i = 0; i < reg_cnt; i++) { h.mov(h.ptr[h.rsp + i * reg_len], Reg64(i)); } } -void RegPrinter::restore_reg(jit_generator &h) { +void RegPrinter::restore_reg(jit_generator& h) { for (size_t i = 0; i < reg_cnt; i++) { h.mov(Reg64(i), h.ptr[h.rsp + i * reg_len]); } h.add(h.rsp, reg_len * reg_cnt); } -void RegPrinter::preamble(jit_generator &h) { +void RegPrinter::preamble(jit_generator& h) { save_reg(h); - mayiuse(cpu_isa_t::avx512_core) ? save_vmm(h) : (mayiuse(cpu_isa_t::avx2) ? - save_vmm(h) : save_vmm(h)); + mayiuse(cpu_isa_t::avx512_core) ? save_vmm(h) + : (mayiuse(cpu_isa_t::avx2) ? save_vmm(h) : save_vmm(h)); } -void RegPrinter::postamble(jit_generator &h) { - mayiuse(cpu_isa_t::avx512_core) ? restore_vmm(h) : (mayiuse(cpu_isa_t::avx2) ? - restore_vmm(h) : restore_vmm(h)); +void RegPrinter::postamble(jit_generator& h) { + mayiuse(cpu_isa_t::avx512_core) ? restore_vmm(h) + : (mayiuse(cpu_isa_t::avx2) ? 
restore_vmm(h) : restore_vmm(h)); restore_reg(h); } // ABI requires 16-bype stack alignment before a call -void RegPrinter::align_rsp(jit_generator &h) { +void RegPrinter::align_rsp(jit_generator& h) { constexpr int alignment = 16; h.mov(h.r15, h.rsp); h.and_(h.rsp, ~(alignment - 1)); } -void RegPrinter::restore_rsp(jit_generator &h) { +void RegPrinter::restore_rsp(jit_generator& h) { h.mov(h.rsp, h.r15); } template -void RegPrinter::print_vmm(jit_generator &h, REG_T vmm, const char *name) { +void RegPrinter::print_vmm(jit_generator& h, REG_T vmm, const char* name) { preamble(h); h.push(h.rax); @@ -181,7 +184,7 @@ void RegPrinter::print_vmm(jit_generator &h, REG_T vmm, const char *name) { } template -void RegPrinter::print_reg(jit_generator &h, REG_T reg, const char *name) { +void RegPrinter::print_reg(jit_generator& h, REG_T reg, const char* name) { preamble(h); h.push(h.rax); @@ -213,8 +216,7 @@ void RegPrinter::print_reg(jit_generator &h, REG_T reg, const char *name) { postamble(h); } -} // namespace intel_cpu -} // namespace ov - +} // namespace intel_cpu +} // namespace ov -#endif // CPU_DEBUG_CAPS +#endif // CPU_DEBUG_CAPS diff --git a/src/plugins/intel_cpu/src/emitters/plugin/x64/debug_capabilities.hpp b/src/plugins/intel_cpu/src/emitters/plugin/x64/debug_capabilities.hpp index fd7135b17bf5b9..dcac847dfd1e0f 100644 --- a/src/plugins/intel_cpu/src/emitters/plugin/x64/debug_capabilities.hpp +++ b/src/plugins/intel_cpu/src/emitters/plugin/x64/debug_capabilities.hpp @@ -6,7 +6,7 @@ #ifdef CPU_DEBUG_CAPS -#include "cpu/x64/jit_generator.hpp" +# include "cpu/x64/jit_generator.hpp" namespace ov { namespace intel_cpu { @@ -56,42 +56,44 @@ namespace intel_cpu { class RegPrinter { public: using jit_generator = dnnl::impl::cpu::x64::jit_generator; - template ::value, int>::type = 0> - static void print(jit_generator &h, REG_T reg, const char *name = nullptr) { + template ::value, int>::type = 0> + static void print(jit_generator& h, REG_T reg, const char* name = nullptr) { print_vmm(h, reg, name); } - template ::value, int>::type = 0> - static void print(jit_generator &h, REG_T reg, const char *name = nullptr) { + template ::value, int>::type = 0> + static void print(jit_generator& h, REG_T reg, const char* name = nullptr) { print_reg(h, reg, name); } private: RegPrinter() {} template - static void print_vmm(jit_generator &h, REG_T vmm, const char *name); + static void print_vmm(jit_generator& h, REG_T vmm, const char* name); template - static void print_reg(jit_generator &h, REG_T reg, const char *name); + static void print_reg(jit_generator& h, REG_T reg, const char* name); template - static void print_vmm_prc(const char *name, const char *ori_name, PRC_T *ptr); + static void print_vmm_prc(const char* name, const char* ori_name, PRC_T* ptr); template - static void print_reg_prc(const char *name, const char *ori_name, T *val); - static void preamble(jit_generator &h); - static void postamble(jit_generator &h); + static void print_reg_prc(const char* name, const char* ori_name, T* val); + static void preamble(jit_generator& h); + static void postamble(jit_generator& h); template - static void save_vmm(jit_generator &h); + static void save_vmm(jit_generator& h); template - static void restore_vmm(jit_generator &h); - static void save_reg(jit_generator &h); - static void restore_reg(jit_generator &h); - static void align_rsp(jit_generator &h); - static void restore_rsp(jit_generator &h); + static void restore_vmm(jit_generator& h); + static void save_reg(jit_generator& h); + static void 
restore_reg(jit_generator& h); + static void align_rsp(jit_generator& h); + static void restore_rsp(jit_generator& h); static constexpr size_t reg_len = 8; static constexpr size_t reg_cnt = 16; }; -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov -#endif // CPU_DEBUG_CAPS +#endif // CPU_DEBUG_CAPS diff --git a/src/plugins/intel_cpu/src/emitters/plugin/x64/jit_bf16_emitters.hpp b/src/plugins/intel_cpu/src/emitters/plugin/x64/jit_bf16_emitters.hpp index 43a2c2eb6b045f..2bfbaa68880aa8 100644 --- a/src/plugins/intel_cpu/src/emitters/plugin/x64/jit_bf16_emitters.hpp +++ b/src/plugins/intel_cpu/src/emitters/plugin/x64/jit_bf16_emitters.hpp @@ -11,14 +11,18 @@ namespace intel_cpu { class jit_uni_vcvtneps2bf16 : public jit_emitter { public: - jit_uni_vcvtneps2bf16(dnnl::impl::cpu::x64::jit_generator* host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, - ov::element::Type exec_prc = ov::element::bf16) : jit_emitter(host, host_isa, exec_prc) { + jit_uni_vcvtneps2bf16(dnnl::impl::cpu::x64::jit_generator* host, + dnnl::impl::cpu::x64::cpu_isa_t host_isa, + ov::element::Type exec_prc = ov::element::bf16) + : jit_emitter(host, host_isa, exec_prc) { if (!dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::avx512_core_bf16) && !dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::avx2_vnni_2)) prepare_table(); } - size_t get_inputs_num() const override { return 1; } + size_t get_inputs_num() const override { + return 1; + } private: void emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const override { @@ -36,7 +40,8 @@ class jit_uni_vcvtneps2bf16 : public jit_emitter { template void emit_isa(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const { using namespace Xbyak; - using Vmm = typename dnnl::impl::utils::conditional3::type; + using Vmm = typename dnnl::impl::utils:: + conditional3::type; Vmm in = Vmm(in_vec_idxs[0]); @@ -79,7 +84,7 @@ class jit_uni_vcvtneps2bf16 : public jit_emitter { h->uni_vpackusdw(aux, aux, aux); if (host_isa_ == dnnl::impl::cpu::x64::cpu_isa_t::avx2) { - h->vpermq(Ymm(aux.getIdx()), Ymm(aux.getIdx()), 0xD8); //11 01 10 00 + h->vpermq(Ymm(aux.getIdx()), Ymm(aux.getIdx()), 0xD8); // 11 01 10 00 h->vextracti128(out, Ymm(aux.getIdx()), 0); } else { h->uni_vmovups(out, aux); @@ -123,5 +128,5 @@ class jit_uni_vcvtneps2bf16 : public jit_emitter { } }; -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/emitters/plugin/x64/jit_conversion_emitters.cpp b/src/plugins/intel_cpu/src/emitters/plugin/x64/jit_conversion_emitters.cpp index 544960008c9158..2e90af39fb9cf1 100644 --- a/src/plugins/intel_cpu/src/emitters/plugin/x64/jit_conversion_emitters.cpp +++ b/src/plugins/intel_cpu/src/emitters/plugin/x64/jit_conversion_emitters.cpp @@ -6,7 +6,6 @@ #include "utils/bfloat16.hpp" - using namespace dnnl::impl::utils; using namespace dnnl::impl; using namespace dnnl::impl::cpu::x64; @@ -15,19 +14,23 @@ using namespace Xbyak; namespace ov { namespace intel_cpu { -jit_convert_emitter::jit_convert_emitter(jit_generator *host, cpu_isa_t host_isa, const std::shared_ptr& node, ov::element::Type exec_prc) -: jit_emitter(host, host_isa, exec_prc) { +jit_convert_emitter::jit_convert_emitter(jit_generator* host, + cpu_isa_t host_isa, + const std::shared_ptr& node, + ov::element::Type exec_prc) + : jit_emitter(host, host_isa, exec_prc) { input_type = node->get_input_element_type(0); output_type = node->get_output_element_type(0); if (output_type == 
ov::element::bf16) - uni_vcvtneps2bf16.reset(new jit_uni_vcvtneps2bf16(host, host_isa)); + uni_vcvtneps2bf16.reset(new jit_uni_vcvtneps2bf16(host, host_isa)); } void jit_convert_emitter::validate_types() const { auto is_supported_type = [this](const ov::element::Type& type) { - return any_of(supported_types.begin(), supported_types.end(), - [&type](const ov::element::Type& supported_type) { return supported_type == type; } ); + return any_of(supported_types.begin(), supported_types.end(), [&type](const ov::element::Type& supported_type) { + return supported_type == type; + }); }; if (!is_supported_type(input_type)) @@ -36,7 +39,9 @@ void jit_convert_emitter::validate_types() const { OV_CPU_JIT_EMITTER_THROW("Unsupported output type: ", output_type.get_type_name()); } -size_t jit_convert_emitter::get_inputs_num() const { return 1; } +size_t jit_convert_emitter::get_inputs_num() const { + return 1; +} void jit_convert_emitter::emit_data() const { jit_emitter::emit_data(); @@ -45,19 +50,22 @@ void jit_convert_emitter::emit_data() const { } template -void jit_convert_emitter::float2bfloat(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_convert_emitter::float2bfloat(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { using Vmm = typename conditional3::type; Vmm vmm_src = Vmm(in_vec_idxs[0]); - Vmm vmm_dst = Vmm(out_vec_idxs[0]); + Vmm vmm_dst = Vmm(out_vec_idxs[0]); if (!uni_vcvtneps2bf16) OV_CPU_JIT_EMITTER_THROW("Converter from float to bf16 isn't initialized!"); uni_vcvtneps2bf16->emit_code({static_cast(vmm_src.getIdx())}, {static_cast(vmm_dst.getIdx())}); } -jit_convert_truncation_emitter::jit_convert_truncation_emitter(jit_generator *host, cpu_isa_t host_isa, - const std::shared_ptr& node, ov::element::Type exec_prc) - : jit_convert_emitter(host, host_isa, node, exec_prc) { +jit_convert_truncation_emitter::jit_convert_truncation_emitter(jit_generator* host, + cpu_isa_t host_isa, + const std::shared_ptr& node, + ov::element::Type exec_prc) + : jit_convert_emitter(host, host_isa, node, exec_prc) { prepare_table(); } @@ -66,7 +74,8 @@ bool jit_convert_truncation_emitter::is_i8_and_u8_case() const { one_of(output_type, ov::element::i8, ov::element::u8); } -void jit_convert_truncation_emitter::emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_convert_truncation_emitter::emit_impl(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { validate_types(); if (host_isa_ == cpu::x64::sse41) { emit_isa(in_vec_idxs, out_vec_idxs); @@ -80,10 +89,11 @@ void jit_convert_truncation_emitter::emit_impl(const std::vector &in_vec } template -void jit_convert_truncation_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_convert_truncation_emitter::emit_isa(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { using Vmm = typename conditional3::type; Vmm vmm_src = Vmm(in_vec_idxs[0]); - Vmm vmm_dst = Vmm(out_vec_idxs[0]); + Vmm vmm_dst = Vmm(out_vec_idxs[0]); Xmm xmm_dst = Xmm(out_vec_idxs[0]); Ymm ymm_dst = Ymm(out_vec_idxs[0]); @@ -97,95 +107,95 @@ void jit_convert_truncation_emitter::emit_isa(const std::vector &in_vec_ } switch (input_type) { - case ov::element::f32: - if (one_of(output_type, ov::element::i32, ov::element::i8, ov::element::u8)) - h->uni_vcvttps2dq(vmm_dst, vmm_src); - break; - case ov::element::i32: - if (one_of(output_type, ov::element::f32, ov::element::bf16, ov::element::f16)) - 
h->uni_vcvtdq2ps(vmm_dst, vmm_src); - break; - case ov::element::bf16: - h->vpmovzxwd(vmm_dst, vmm_src); - h->uni_vpslld(vmm_dst, vmm_dst, 16); - if (one_of(output_type, ov::element::i32, ov::element::i8, ov::element::u8)) - h->uni_vcvttps2dq(vmm_dst, vmm_dst); - break; - case ov::element::f16: - if (isa == dnnl::impl::cpu::x64::avx512_core) - h->vcvtph2ps(vmm_dst, Ymm(vmm_src.getIdx())); - else - h->vcvtph2ps(vmm_dst, - Xmm(vmm_src.getIdx())); // for avx2_vnni_2? - if (one_of(output_type, ov::element::i32, ov::element::i8, ov::element::u8)) - h->uni_vcvttps2dq(vmm_dst, vmm_dst); - break; - case ov::element::i8: - h->uni_vpmovsxbd(vmm_dst, vmm_src); - break; - case ov::element::u8: - h->uni_vpmovzxbd(vmm_dst, vmm_src); - break; - default: - OV_CPU_JIT_EMITTER_THROW("Unsupported input data type"); + case ov::element::f32: + if (one_of(output_type, ov::element::i32, ov::element::i8, ov::element::u8)) + h->uni_vcvttps2dq(vmm_dst, vmm_src); + break; + case ov::element::i32: + if (one_of(output_type, ov::element::f32, ov::element::bf16, ov::element::f16)) + h->uni_vcvtdq2ps(vmm_dst, vmm_src); + break; + case ov::element::bf16: + h->vpmovzxwd(vmm_dst, vmm_src); + h->uni_vpslld(vmm_dst, vmm_dst, 16); + if (one_of(output_type, ov::element::i32, ov::element::i8, ov::element::u8)) + h->uni_vcvttps2dq(vmm_dst, vmm_dst); + break; + case ov::element::f16: + if (isa == dnnl::impl::cpu::x64::avx512_core) + h->vcvtph2ps(vmm_dst, Ymm(vmm_src.getIdx())); + else + h->vcvtph2ps(vmm_dst, + Xmm(vmm_src.getIdx())); // for avx2_vnni_2? + if (one_of(output_type, ov::element::i32, ov::element::i8, ov::element::u8)) + h->uni_vcvttps2dq(vmm_dst, vmm_dst); + break; + case ov::element::i8: + h->uni_vpmovsxbd(vmm_dst, vmm_src); + break; + case ov::element::u8: + h->uni_vpmovzxbd(vmm_dst, vmm_src); + break; + default: + OV_CPU_JIT_EMITTER_THROW("Unsupported input data type"); } switch (output_type) { - case ov::element::f32: - if (!one_of(input_type, ov::element::i32, ov::element::bf16, ov::element::f16)) { + case ov::element::f32: + if (!one_of(input_type, ov::element::i32, ov::element::bf16, ov::element::f16)) { + h->uni_vcvtdq2ps(vmm_dst, vmm_dst); + } + break; + case ov::element::i32: + break; + case ov::element::bf16: + if (input_type == ov::element::f32) { + float2bfloat({static_cast(vmm_src.getIdx())}, {static_cast(vmm_dst.getIdx())}); + } else { + if (one_of(input_type, ov::element::i8, ov::element::u8)) { h->uni_vcvtdq2ps(vmm_dst, vmm_dst); } - break; - case ov::element::i32: - break; - case ov::element::bf16: - if (input_type == ov::element::f32) { - float2bfloat({static_cast(vmm_src.getIdx())}, {static_cast(vmm_dst.getIdx())}); - } else { - if (one_of(input_type, ov::element::i8, ov::element::u8)) { - h->uni_vcvtdq2ps(vmm_dst, vmm_dst); - } - float2bfloat({static_cast(vmm_dst.getIdx())}, {static_cast(vmm_dst.getIdx())}); - } - break; - case ov::element::f16: - if (input_type == ov::element::f32) { - if (isa == dnnl::impl::cpu::x64::avx512_core) - h->vcvtps2ph(ymm_dst, vmm_src, 0x4); - else - h->vcvtps2ph(xmm_dst, vmm_src, 0x4); - } else { - if (one_of(input_type, ov::element::i8, ov::element::u8)) { - h->uni_vcvtdq2ps(vmm_dst, vmm_dst); - } - if (isa == dnnl::impl::cpu::x64::avx512_core) - h->vcvtps2ph(ymm_dst, vmm_dst, 0x4); - else - h->vcvtps2ph(xmm_dst, vmm_dst, 0x4); - } - break; - case ov::element::i8: - case ov::element::u8: - if (input_type == ov::element::i32) { - dword2int8({static_cast(vmm_src.getIdx())}, {static_cast(vmm_dst.getIdx())}); - } else { - dword2int8({static_cast(vmm_dst.getIdx())}, 
{static_cast(vmm_dst.getIdx())}); + float2bfloat({static_cast(vmm_dst.getIdx())}, {static_cast(vmm_dst.getIdx())}); + } + break; + case ov::element::f16: + if (input_type == ov::element::f32) { + if (isa == dnnl::impl::cpu::x64::avx512_core) + h->vcvtps2ph(ymm_dst, vmm_src, 0x4); + else + h->vcvtps2ph(xmm_dst, vmm_src, 0x4); + } else { + if (one_of(input_type, ov::element::i8, ov::element::u8)) { + h->uni_vcvtdq2ps(vmm_dst, vmm_dst); } - break; - default: - OV_CPU_JIT_EMITTER_THROW("Unsupported output data type"); + if (isa == dnnl::impl::cpu::x64::avx512_core) + h->vcvtps2ph(ymm_dst, vmm_dst, 0x4); + else + h->vcvtps2ph(xmm_dst, vmm_dst, 0x4); + } + break; + case ov::element::i8: + case ov::element::u8: + if (input_type == ov::element::i32) { + dword2int8({static_cast(vmm_src.getIdx())}, {static_cast(vmm_dst.getIdx())}); + } else { + dword2int8({static_cast(vmm_dst.getIdx())}, {static_cast(vmm_dst.getIdx())}); + } + break; + default: + OV_CPU_JIT_EMITTER_THROW("Unsupported output data type"); } } void jit_convert_truncation_emitter::register_table_entries() { - if (host_isa_ == dnnl::impl::cpu::x64::avx2 && - one_of(output_type, ov::element::i8, ov::element::u8) && + if (host_isa_ == dnnl::impl::cpu::x64::avx2 && one_of(output_type, ov::element::i8, ov::element::u8) && !is_i8_and_u8_case()) push_arg_entry_of("mask_byte", 0x000000ff, true); } template -void jit_convert_truncation_emitter::dword2int8(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_convert_truncation_emitter::dword2int8(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { using Vmm = typename conditional3::type; Vmm vmm_src = Vmm(in_vec_idxs[0]); @@ -204,12 +214,14 @@ void jit_convert_truncation_emitter::dword2int8(const std::vector &in_ve } } -jit_convert_saturation_emitter::jit_convert_saturation_emitter(jit_generator *host, cpu_isa_t host_isa, - const std::shared_ptr& node, ov::element::Type exec_prc) - : jit_convert_emitter(host, host_isa, node, exec_prc) { -} +jit_convert_saturation_emitter::jit_convert_saturation_emitter(jit_generator* host, + cpu_isa_t host_isa, + const std::shared_ptr& node, + ov::element::Type exec_prc) + : jit_convert_emitter(host, host_isa, node, exec_prc) {} -void jit_convert_saturation_emitter::emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_convert_saturation_emitter::emit_impl(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { validate_types(); if (host_isa_ == cpu::x64::sse41) { emit_isa(in_vec_idxs, out_vec_idxs); @@ -223,10 +235,11 @@ void jit_convert_saturation_emitter::emit_impl(const std::vector &in_vec } template -void jit_convert_saturation_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_convert_saturation_emitter::emit_isa(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { using Vmm = typename conditional3::type; Vmm vmm_src = Vmm(in_vec_idxs[0]); - Vmm vmm_dst = Vmm(out_vec_idxs[0]); + Vmm vmm_dst = Vmm(out_vec_idxs[0]); Xmm xmm_dst = Xmm(out_vec_idxs[0]); Ymm ymm_dst = Ymm(out_vec_idxs[0]); @@ -237,88 +250,94 @@ void jit_convert_saturation_emitter::emit_isa(const std::vector &in_vec_ } switch (input_type) { - case ov::element::f32: - if (one_of(output_type, ov::element::i32, ov::element::i8, ov::element::u8)) - h->uni_vcvtps2dq(vmm_dst, vmm_src); - break; - case ov::element::i32: - if (one_of(output_type, ov::element::f32, ov::element::bf16, ov::element::f16)) - 
h->uni_vcvtdq2ps(vmm_dst, vmm_src); - break; - case ov::element::bf16: - h->vpmovzxwd(vmm_dst, vmm_src); - h->uni_vpslld(vmm_dst, vmm_dst, 16); - if (one_of(output_type, ov::element::i32, ov::element::i8, ov::element::u8)) - h->uni_vcvttps2dq(vmm_dst, vmm_dst); - break; - case ov::element::f16: - if (isa == dnnl::impl::cpu::x64::avx512_core) - h->vcvtph2ps(vmm_dst, Ymm(vmm_src.getIdx())); - else - h->vcvtph2ps(vmm_dst, - Xmm(vmm_src.getIdx())); // for avx2_vnni_2? - if (one_of(output_type, ov::element::i32, ov::element::i8, ov::element::u8)) - h->uni_vcvttps2dq(vmm_dst, vmm_dst); - break; - case ov::element::i8: - h->uni_vpmovsxbd(vmm_dst, vmm_src); - break; - case ov::element::u8: - h->uni_vpmovzxbd(vmm_dst, vmm_src); - break; - default: - OV_CPU_JIT_EMITTER_THROW("Unsupported input data type"); + case ov::element::f32: + if (one_of(output_type, ov::element::i32, ov::element::i8, ov::element::u8)) + h->uni_vcvtps2dq(vmm_dst, vmm_src); + break; + case ov::element::i32: + if (one_of(output_type, ov::element::f32, ov::element::bf16, ov::element::f16)) + h->uni_vcvtdq2ps(vmm_dst, vmm_src); + break; + case ov::element::bf16: + h->vpmovzxwd(vmm_dst, vmm_src); + h->uni_vpslld(vmm_dst, vmm_dst, 16); + if (one_of(output_type, ov::element::i32, ov::element::i8, ov::element::u8)) + h->uni_vcvttps2dq(vmm_dst, vmm_dst); + break; + case ov::element::f16: + if (isa == dnnl::impl::cpu::x64::avx512_core) + h->vcvtph2ps(vmm_dst, Ymm(vmm_src.getIdx())); + else + h->vcvtph2ps(vmm_dst, + Xmm(vmm_src.getIdx())); // for avx2_vnni_2? + if (one_of(output_type, ov::element::i32, ov::element::i8, ov::element::u8)) + h->uni_vcvttps2dq(vmm_dst, vmm_dst); + break; + case ov::element::i8: + h->uni_vpmovsxbd(vmm_dst, vmm_src); + break; + case ov::element::u8: + h->uni_vpmovzxbd(vmm_dst, vmm_src); + break; + default: + OV_CPU_JIT_EMITTER_THROW("Unsupported input data type"); } switch (output_type) { - case ov::element::f32: - if (!one_of(input_type, ov::element::i32, ov::element::bf16, ov::element::f16)) { + case ov::element::f32: + if (!one_of(input_type, ov::element::i32, ov::element::bf16, ov::element::f16)) { + h->uni_vcvtdq2ps(vmm_dst, vmm_dst); + } + break; + case ov::element::i32: + break; + case ov::element::bf16: + if (input_type == ov::element::f32) { + float2bfloat({static_cast(vmm_src.getIdx())}, {static_cast(vmm_dst.getIdx())}); + } else { + if (one_of(input_type, ov::element::i8, ov::element::u8)) { h->uni_vcvtdq2ps(vmm_dst, vmm_dst); } - break; - case ov::element::i32: - break; - case ov::element::bf16: - if (input_type == ov::element::f32) { - float2bfloat({static_cast(vmm_src.getIdx())}, {static_cast(vmm_dst.getIdx())}); - } else { - if (one_of(input_type, ov::element::i8, ov::element::u8)) { - h->uni_vcvtdq2ps(vmm_dst, vmm_dst); - } - float2bfloat({static_cast(vmm_dst.getIdx())}, {static_cast(vmm_dst.getIdx())}); - } - break; - case ov::element::f16: - if (input_type == ov::element::f32) { - if (isa == dnnl::impl::cpu::x64::avx512_core) - h->vcvtps2ph(ymm_dst, vmm_src, 0x4); - else - h->vcvtps2ph(xmm_dst, vmm_src, 0x4); - } else { - if (one_of(input_type, ov::element::i8, ov::element::u8)) { - h->uni_vcvtdq2ps(vmm_dst, vmm_dst); - } - if (isa == dnnl::impl::cpu::x64::avx512_core) - h->vcvtps2ph(ymm_dst, vmm_dst, 0x4); - else - h->vcvtps2ph(xmm_dst, vmm_dst, 0x4); - } - break; - case ov::element::i8: - case ov::element::u8: - if (input_type == ov::element::i32) { - dword2int8({static_cast(vmm_src.getIdx())}, {static_cast(vmm_dst.getIdx())}, output_type.is_signed()); - } else { - 
dword2int8({static_cast(vmm_dst.getIdx())}, {static_cast(vmm_dst.getIdx())}, output_type.is_signed()); + float2bfloat({static_cast(vmm_dst.getIdx())}, {static_cast(vmm_dst.getIdx())}); + } + break; + case ov::element::f16: + if (input_type == ov::element::f32) { + if (isa == dnnl::impl::cpu::x64::avx512_core) + h->vcvtps2ph(ymm_dst, vmm_src, 0x4); + else + h->vcvtps2ph(xmm_dst, vmm_src, 0x4); + } else { + if (one_of(input_type, ov::element::i8, ov::element::u8)) { + h->uni_vcvtdq2ps(vmm_dst, vmm_dst); } - break; - default: - OV_CPU_JIT_EMITTER_THROW("Unsupported output data type"); + if (isa == dnnl::impl::cpu::x64::avx512_core) + h->vcvtps2ph(ymm_dst, vmm_dst, 0x4); + else + h->vcvtps2ph(xmm_dst, vmm_dst, 0x4); + } + break; + case ov::element::i8: + case ov::element::u8: + if (input_type == ov::element::i32) { + dword2int8({static_cast(vmm_src.getIdx())}, + {static_cast(vmm_dst.getIdx())}, + output_type.is_signed()); + } else { + dword2int8({static_cast(vmm_dst.getIdx())}, + {static_cast(vmm_dst.getIdx())}, + output_type.is_signed()); + } + break; + default: + OV_CPU_JIT_EMITTER_THROW("Unsupported output data type"); } } template -void jit_convert_saturation_emitter::dword2int8(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs, bool is_signed) const { +void jit_convert_saturation_emitter::dword2int8(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs, + bool is_signed) const { using Vmm = typename conditional3::type; Vmm vmm_src = Vmm(in_vec_idxs[0]); @@ -330,7 +349,7 @@ void jit_convert_saturation_emitter::dword2int8(const std::vector &in_ve if (is_signed) { h->vpmovsdb(xmm_dst, vmm_src); } else { - Vmm vmm_zero = Vmm(aux_vec_idxs[0]); + Vmm vmm_zero = Vmm(aux_vec_idxs[0]); h->vpxord(vmm_zero, vmm_zero, vmm_zero); h->vpmaxsd(vmm_dst, vmm_src, vmm_zero); h->vpmovusdb(xmm_dst, vmm_dst); @@ -353,8 +372,8 @@ void jit_convert_saturation_emitter::dword2int8(const std::vector &in_ve size_t jit_convert_saturation_emitter::aux_vecs_count() const { // 1 register is for dword2int8 unsigned - return output_type == ov::element::u8 && host_isa_ == dnnl::impl::cpu::x64::avx512_core? 1 : 0; + return output_type == ov::element::u8 && host_isa_ == dnnl::impl::cpu::x64::avx512_core ? 
1 : 0; } -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/emitters/plugin/x64/jit_conversion_emitters.hpp b/src/plugins/intel_cpu/src/emitters/plugin/x64/jit_conversion_emitters.hpp index ee451ed358dd1a..29b85079573bee 100644 --- a/src/plugins/intel_cpu/src/emitters/plugin/x64/jit_conversion_emitters.hpp +++ b/src/plugins/intel_cpu/src/emitters/plugin/x64/jit_conversion_emitters.hpp @@ -4,16 +4,18 @@ #pragma once -#include "jit_emitter.hpp" #include "jit_bf16_emitters.hpp" +#include "jit_emitter.hpp" namespace ov { namespace intel_cpu { class jit_convert_emitter : public jit_emitter { public: - jit_convert_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, - const std::shared_ptr& n, ov::element::Type exec_prc = ov::element::f32); + jit_convert_emitter(dnnl::impl::cpu::x64::jit_generator* host, + dnnl::impl::cpu::x64::cpu_isa_t host_isa, + const std::shared_ptr& n, + ov::element::Type exec_prc = ov::element::f32); size_t get_inputs_num() const override; @@ -22,19 +24,13 @@ class jit_convert_emitter : public jit_emitter { void validate_types() const; template - void float2bfloat(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const; + void float2bfloat(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const; ov::element::Type input_type; ov::element::Type output_type; - const ov::element::TypeVector supported_types = { - ov::element::f32, - ov::element::i32, - ov::element::bf16, - ov::element::f16, - ov::element::i8, - ov::element::u8 - }; + const ov::element::TypeVector supported_types = + {ov::element::f32, ov::element::i32, ov::element::bf16, ov::element::f16, ov::element::i8, ov::element::u8}; std::shared_ptr uni_vcvtneps2bf16 = nullptr; }; @@ -45,16 +41,18 @@ class jit_convert_emitter : public jit_emitter { // 129 -> -127 class jit_convert_truncation_emitter : public jit_convert_emitter { public: - jit_convert_truncation_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, - const std::shared_ptr& n, ov::element::Type exec_prc = ov::element::f32); + jit_convert_truncation_emitter(dnnl::impl::cpu::x64::jit_generator* host, + dnnl::impl::cpu::x64::cpu_isa_t host_isa, + const std::shared_ptr& n, + ov::element::Type exec_prc = ov::element::f32); private: void emit_impl(const std::vector& in, const std::vector& out) const override; template - void emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const; + void emit_isa(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const; template - void dword2int8(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const; + void dword2int8(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const; bool is_i8_and_u8_case() const; void register_table_entries() override; @@ -66,19 +64,23 @@ class jit_convert_truncation_emitter : public jit_convert_emitter { // 129 -> 127 class jit_convert_saturation_emitter : public jit_convert_emitter { public: - jit_convert_saturation_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, - const std::shared_ptr& n, ov::element::Type exec_prc = ov::element::f32); + jit_convert_saturation_emitter(dnnl::impl::cpu::x64::jit_generator* host, + dnnl::impl::cpu::x64::cpu_isa_t host_isa, + const std::shared_ptr& n, + ov::element::Type exec_prc = ov::element::f32); private: void emit_impl(const std::vector& in, const std::vector& out) 
const override; template - void emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const; + void emit_isa(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const; template - void dword2int8(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs, bool is_signed) const; + void dword2int8(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs, + bool is_signed) const; size_t aux_vecs_count() const override; }; -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/emitters/plugin/x64/jit_dnnl_emitters.cpp b/src/plugins/intel_cpu/src/emitters/plugin/x64/jit_dnnl_emitters.cpp index 0b315cdd309715..51e801208b927c 100644 --- a/src/plugins/intel_cpu/src/emitters/plugin/x64/jit_dnnl_emitters.cpp +++ b/src/plugins/intel_cpu/src/emitters/plugin/x64/jit_dnnl_emitters.cpp @@ -3,6 +3,7 @@ // #include "jit_dnnl_emitters.hpp" + #include using namespace dnnl::impl::utils; @@ -17,9 +18,11 @@ std::set> jit_dnnl_emitter::get_supported_precisions( return {{element::f32}}; } -jit_dnnl_emitter::jit_dnnl_emitter(jit_generator *host, cpu_isa_t host_isa, const std::shared_ptr& node, ov::element::Type exec_prc) +jit_dnnl_emitter::jit_dnnl_emitter(jit_generator* host, + cpu_isa_t host_isa, + const std::shared_ptr& node, + ov::element::Type exec_prc) : jit_emitter(host, host_isa, exec_prc) { - kind = dnnl_eltwise_tanh; alpha = 0.f; beta = 0.f; @@ -27,33 +30,42 @@ jit_dnnl_emitter::jit_dnnl_emitter(jit_generator *host, cpu_isa_t host_isa, cons set_injector(); } -jit_dnnl_emitter::jit_dnnl_emitter(jit_generator *host, cpu_isa_t host_isa, - dnnl_alg_kind_t algKind, float alpha, float beta, +jit_dnnl_emitter::jit_dnnl_emitter(jit_generator* host, + cpu_isa_t host_isa, + dnnl_alg_kind_t algKind, + float alpha, + float beta, ov::element::Type exec_prc) - : jit_emitter(host, host_isa, exec_prc), kind(algKind), alpha(alpha), beta(beta) { - + : jit_emitter(host, host_isa, exec_prc), + kind(algKind), + alpha(alpha), + beta(beta) { set_injector(); } void jit_dnnl_emitter::set_injector() { if (host_isa_ == cpu::x64::sse41) { - eltwise_injector_sse42 = std::make_shared>( - h, kind, alpha, beta, 1.f); + eltwise_injector_sse42 = + std::make_shared>(h, kind, alpha, beta, 1.f); } else if (host_isa_ == cpu::x64::avx2) { - eltwise_injector_avx2 = std::make_shared>( - h, kind, alpha, beta, 1.f); + eltwise_injector_avx2 = + std::make_shared>(h, kind, alpha, beta, 1.f); } else if (host_isa_ == cpu::x64::avx512_core) { - eltwise_injector_avx512_core = std::make_shared>( - h, kind, alpha, beta, 1.f); + eltwise_injector_avx512_core = + std::make_shared>(h, kind, alpha, beta, 1.f); } else { OV_CPU_JIT_EMITTER_THROW("Unsupported ISA ", host_isa_); } } -size_t jit_dnnl_emitter::get_inputs_num() const { return 1; } +size_t jit_dnnl_emitter::get_inputs_num() const { + return 1; +} -void jit_dnnl_emitter::emit_code(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs, - const std::vector &pool_vec_idxs, const std::vector &pool_gpr_idxs) const { +void jit_dnnl_emitter::emit_code(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs, + const std::vector& pool_vec_idxs, + const std::vector& pool_gpr_idxs) const { if (host_isa_ == cpu::x64::sse41) { if (out_vec_idxs[0] != in_vec_idxs[0]) h->uni_vmovups(Xmm(out_vec_idxs[0]), Xmm(in_vec_idxs[0])); @@ -83,11 +95,13 @@ void jit_dnnl_emitter::emit_data() const { } } -jit_dnnl_aux_emitter::jit_dnnl_aux_emitter(jit_generator *host, cpu_isa_t host_isa, - 
dnnl_alg_kind_t algKind, float inpAlpha, float inpBeta, +jit_dnnl_aux_emitter::jit_dnnl_aux_emitter(jit_generator* host, + cpu_isa_t host_isa, + dnnl_alg_kind_t algKind, + float inpAlpha, + float inpBeta, ov::element::Type exec_prc) - : jit_dnnl_emitter(host, host_isa, algKind, inpAlpha, inpBeta, exec_prc) { -} + : jit_dnnl_emitter(host, host_isa, algKind, inpAlpha, inpBeta, exec_prc) {} -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/emitters/plugin/x64/jit_dnnl_emitters.hpp b/src/plugins/intel_cpu/src/emitters/plugin/x64/jit_dnnl_emitters.hpp index bdf04108370ed5..22e003ad261555 100644 --- a/src/plugins/intel_cpu/src/emitters/plugin/x64/jit_dnnl_emitters.hpp +++ b/src/plugins/intel_cpu/src/emitters/plugin/x64/jit_dnnl_emitters.hpp @@ -4,8 +4,8 @@ #pragma once -#include "cpu/x64/jit_generator.hpp" #include "cpu/x64/injectors/jit_uni_eltwise_injector.hpp" +#include "cpu/x64/jit_generator.hpp" #include "jit_emitter.hpp" namespace ov { @@ -13,30 +13,41 @@ namespace intel_cpu { class jit_dnnl_emitter : public jit_emitter { public: - void emit_code(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs, - const std::vector &pool_vec_idxs, const std::vector &pool_gpr_idxs) const override; + void emit_code(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs, + const std::vector& pool_vec_idxs, + const std::vector& pool_gpr_idxs) const override; void emit_data() const override; - void emit_impl(const std::vector &in_idxs, const std::vector &out_idxs) const override {}; + void emit_impl(const std::vector& in_idxs, const std::vector& out_idxs) const override{}; - static std::set> get_supported_precisions(const std::shared_ptr& node = nullptr); + static std::set> get_supported_precisions( + const std::shared_ptr& node = nullptr); protected: - jit_dnnl_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, - dnnl_alg_kind_t algKind, float inpAlpha, float inpBeta, - ov::element::Type exec_prc = ov::element::f32); - jit_dnnl_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n, - ov::element::Type exec_prc = ov::element::f32); + jit_dnnl_emitter(dnnl::impl::cpu::x64::jit_generator* host, + dnnl::impl::cpu::x64::cpu_isa_t host_isa, + dnnl_alg_kind_t algKind, + float inpAlpha, + float inpBeta, + ov::element::Type exec_prc = ov::element::f32); + jit_dnnl_emitter(dnnl::impl::cpu::x64::jit_generator* host, + dnnl::impl::cpu::x64::cpu_isa_t host_isa, + const std::shared_ptr& n, + ov::element::Type exec_prc = ov::element::f32); void set_injector(); - dnnl_alg_kind_t kind {dnnl_alg_kind_undef}; - float alpha {0.f}; - float beta {0.f}; + dnnl_alg_kind_t kind{dnnl_alg_kind_undef}; + float alpha{0.f}; + float beta{0.f}; - std::shared_ptr> eltwise_injector_sse42; - std::shared_ptr> eltwise_injector_avx2; - std::shared_ptr> eltwise_injector_avx512_core; + std::shared_ptr> + eltwise_injector_sse42; + std::shared_ptr> + eltwise_injector_avx2; + std::shared_ptr> + eltwise_injector_avx512_core; private: size_t get_inputs_num() const override; @@ -44,12 +55,15 @@ class jit_dnnl_emitter : public jit_emitter { class jit_dnnl_aux_emitter : public jit_dnnl_emitter { public: - jit_dnnl_aux_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, - dnnl_alg_kind_t algKind, float inpAlpha, float inpBeta, - ov::element::Type exec_prc = ov::element::f32); + 
jit_dnnl_aux_emitter(dnnl::impl::cpu::x64::jit_generator* host, + dnnl::impl::cpu::x64::cpu_isa_t host_isa, + dnnl_alg_kind_t algKind, + float inpAlpha, + float inpBeta, + ov::element::Type exec_prc = ov::element::f32); private: }; -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/emitters/plugin/x64/jit_dnnl_ext_emitters.hpp b/src/plugins/intel_cpu/src/emitters/plugin/x64/jit_dnnl_ext_emitters.hpp index 7a4d1e31277e3b..0b7396b6fcd830 100644 --- a/src/plugins/intel_cpu/src/emitters/plugin/x64/jit_dnnl_ext_emitters.hpp +++ b/src/plugins/intel_cpu/src/emitters/plugin/x64/jit_dnnl_ext_emitters.hpp @@ -4,9 +4,9 @@ #pragma once +#include "jit_dnnl_emitters.hpp" #include "openvino/opsets/opset5.hpp" #include "transformations/cpu_opset/common/op/swish_cpu.hpp" -#include "jit_dnnl_emitters.hpp" #include "utils/ngraph_utils.hpp" namespace ov { @@ -14,88 +14,102 @@ namespace intel_cpu { class jit_relu_emitter : public jit_dnnl_emitter { public: - jit_relu_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n, - ov::element::Type exec_prc = ov::element::f32) + jit_relu_emitter(dnnl::impl::cpu::x64::jit_generator* host, + dnnl::impl::cpu::x64::cpu_isa_t host_isa, + const std::shared_ptr& n, + ov::element::Type exec_prc = ov::element::f32) : jit_dnnl_emitter(host, host_isa, n, exec_prc) { - kind = dnnl_eltwise_relu; - alpha = 0.f; - beta = 0.f; + kind = dnnl_eltwise_relu; + alpha = 0.f; + beta = 0.f; - set_injector(); - } + set_injector(); + } }; class jit_sigmoid_emitter : public jit_dnnl_emitter { public: - jit_sigmoid_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n, - ov::element::Type exec_prc = ov::element::f32) + jit_sigmoid_emitter(dnnl::impl::cpu::x64::jit_generator* host, + dnnl::impl::cpu::x64::cpu_isa_t host_isa, + const std::shared_ptr& n, + ov::element::Type exec_prc = ov::element::f32) : jit_dnnl_emitter(host, host_isa, n, exec_prc) { - kind = dnnl_eltwise_logistic; - alpha = 0.f; - beta = 0.f; + kind = dnnl_eltwise_logistic; + alpha = 0.f; + beta = 0.f; - set_injector(); - } + set_injector(); + } }; class jit_tanh_emitter : public jit_dnnl_emitter { public: - jit_tanh_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n, - ov::element::Type exec_prc = ov::element::f32) + jit_tanh_emitter(dnnl::impl::cpu::x64::jit_generator* host, + dnnl::impl::cpu::x64::cpu_isa_t host_isa, + const std::shared_ptr& n, + ov::element::Type exec_prc = ov::element::f32) : jit_dnnl_emitter(host, host_isa, n, exec_prc) { - kind = dnnl_eltwise_tanh; - alpha = 0.f; - beta = 0.f; + kind = dnnl_eltwise_tanh; + alpha = 0.f; + beta = 0.f; - set_injector(); - } + set_injector(); + } }; class jit_elu_emitter : public jit_dnnl_emitter { public: - jit_elu_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n, - ov::element::Type exec_prc = ov::element::f32) + jit_elu_emitter(dnnl::impl::cpu::x64::jit_generator* host, + dnnl::impl::cpu::x64::cpu_isa_t host_isa, + const std::shared_ptr& n, + ov::element::Type exec_prc = ov::element::f32) : jit_dnnl_emitter(host, host_isa, n, exec_prc) { - kind = dnnl_eltwise_elu; - alpha = ov::as_type_ptr(n)->get_alpha(); - beta = 0.f; + kind = dnnl_eltwise_elu; + alpha = ov::as_type_ptr(n)->get_alpha(); + beta = 0.f; - set_injector(); - } + 
set_injector(); + } }; class jit_abs_emitter : public jit_dnnl_emitter { public: - jit_abs_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n, - ov::element::Type exec_prc = ov::element::f32) + jit_abs_emitter(dnnl::impl::cpu::x64::jit_generator* host, + dnnl::impl::cpu::x64::cpu_isa_t host_isa, + const std::shared_ptr& n, + ov::element::Type exec_prc = ov::element::f32) : jit_dnnl_emitter(host, host_isa, n, exec_prc) { - kind = dnnl_eltwise_abs; - alpha = 0.f; - beta = 0.f; + kind = dnnl_eltwise_abs; + alpha = 0.f; + beta = 0.f; - set_injector(); - } + set_injector(); + } }; class jit_clamp_emitter : public jit_dnnl_emitter { public: - jit_clamp_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n, - ov::element::Type exec_prc = ov::element::f32) + jit_clamp_emitter(dnnl::impl::cpu::x64::jit_generator* host, + dnnl::impl::cpu::x64::cpu_isa_t host_isa, + const std::shared_ptr& n, + ov::element::Type exec_prc = ov::element::f32) : jit_dnnl_emitter(host, host_isa, n, exec_prc) { - kind = dnnl_eltwise_clip; - auto op = ov::as_type_ptr(n); - alpha = op->get_min(); - beta = op->get_max(); + kind = dnnl_eltwise_clip; + auto op = ov::as_type_ptr(n); + alpha = op->get_min(); + beta = op->get_max(); - set_injector(); - } + set_injector(); + } }; class jit_swish_emitter : public jit_dnnl_emitter { public: - jit_swish_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n, - ov::element::Type exec_prc = ov::element::f32) - : jit_dnnl_emitter(host, host_isa, n, exec_prc) { + jit_swish_emitter(dnnl::impl::cpu::x64::jit_generator* host, + dnnl::impl::cpu::x64::cpu_isa_t host_isa, + const std::shared_ptr& n, + ov::element::Type exec_prc = ov::element::f32) + : jit_dnnl_emitter(host, host_isa, n, exec_prc) { kind = dnnl_eltwise_swish; auto op = ov::as_type_ptr(n); alpha = op->get_alpha(); @@ -107,9 +121,11 @@ class jit_swish_emitter : public jit_dnnl_emitter { class jit_hswish_emitter : public jit_dnnl_emitter { public: - jit_hswish_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n, - ov::element::Type exec_prc = ov::element::f32) - : jit_dnnl_emitter(host, host_isa, n, exec_prc) { + jit_hswish_emitter(dnnl::impl::cpu::x64::jit_generator* host, + dnnl::impl::cpu::x64::cpu_isa_t host_isa, + const std::shared_ptr& n, + ov::element::Type exec_prc = ov::element::f32) + : jit_dnnl_emitter(host, host_isa, n, exec_prc) { // since v3.0 oneDNN has flexible version of hardswish, ov still uses the one with hardcoded alpha and beta kind = dnnl_eltwise_hardswish; alpha = 1.f / 6.f; @@ -121,9 +137,11 @@ class jit_hswish_emitter : public jit_dnnl_emitter { class jit_gelu_v0_emitter : public jit_dnnl_emitter { public: - jit_gelu_v0_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n, + jit_gelu_v0_emitter(dnnl::impl::cpu::x64::jit_generator* host, + dnnl::impl::cpu::x64::cpu_isa_t host_isa, + const std::shared_ptr& n, ov::element::Type exec_prc = ov::element::f32) - : jit_dnnl_emitter(host, host_isa, n, exec_prc) { + : jit_dnnl_emitter(host, host_isa, n, exec_prc) { kind = dnnl_eltwise_gelu_erf; set_injector(); @@ -132,9 +150,11 @@ class jit_gelu_v0_emitter : public jit_dnnl_emitter { class jit_gelu_v7_emitter : public jit_dnnl_emitter { public: - 
jit_gelu_v7_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n, + jit_gelu_v7_emitter(dnnl::impl::cpu::x64::jit_generator* host, + dnnl::impl::cpu::x64::cpu_isa_t host_isa, + const std::shared_ptr& n, ov::element::Type exec_prc = ov::element::f32) - : jit_dnnl_emitter(host, host_isa, n, exec_prc) { + : jit_dnnl_emitter(host, host_isa, n, exec_prc) { auto gelu = getNgraphOpAs(n); ov::op::GeluApproximationMode approximationMode = gelu->get_approximation_mode(); if (approximationMode == ov::op::GeluApproximationMode::ERF) @@ -152,11 +172,11 @@ class jit_gelu_v7_emitter : public jit_dnnl_emitter { class jit_round_emitter : public jit_dnnl_emitter { public: - jit_round_emitter( - dnnl::impl::cpu::x64::jit_generator *host, - dnnl::impl::cpu::x64::cpu_isa_t host_isa, - const std::shared_ptr& n, - ov::element::Type exec_prc = ov::element::f32) : jit_dnnl_emitter(host, host_isa, n, exec_prc) { + jit_round_emitter(dnnl::impl::cpu::x64::jit_generator* host, + dnnl::impl::cpu::x64::cpu_isa_t host_isa, + const std::shared_ptr& n, + ov::element::Type exec_prc = ov::element::f32) + : jit_dnnl_emitter(host, host_isa, n, exec_prc) { const auto round = getNgraphOpAs(n); const auto mode = round->get_mode(); if ((mode != ov::opset5::Round::RoundMode::HALF_AWAY_FROM_ZERO) && @@ -165,12 +185,11 @@ class jit_round_emitter : public jit_dnnl_emitter { static_cast(mode)); } - kind = mode == ov::opset5::Round::RoundMode::HALF_AWAY_FROM_ZERO ? - dnnl_eltwise_round_half_away_from_zero : - dnnl_eltwise_round_half_to_even; + kind = mode == ov::opset5::Round::RoundMode::HALF_AWAY_FROM_ZERO ? dnnl_eltwise_round_half_away_from_zero + : dnnl_eltwise_round_half_to_even; set_injector(); } }; -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/emitters/plugin/x64/jit_eltwise_emitters.cpp b/src/plugins/intel_cpu/src/emitters/plugin/x64/jit_eltwise_emitters.cpp index 0331a3ee4908b9..7a091fc946c2d8 100644 --- a/src/plugins/intel_cpu/src/emitters/plugin/x64/jit_eltwise_emitters.cpp +++ b/src/plugins/intel_cpu/src/emitters/plugin/x64/jit_eltwise_emitters.cpp @@ -8,8 +8,8 @@ using namespace dnnl::impl::utils; using namespace dnnl::impl::cpu; using namespace Xbyak; -#define CONST_1_F 0x3f800000 // 1.f -#define INF_MASK 0x7F800000 +#define CONST_1_F 0x3f800000 // 1.f +#define INF_MASK 0x7F800000 #define INF_NEG_MASK 0xFF800000 namespace ov { @@ -22,23 +22,30 @@ ov::element::Type get_arithmetic_binary_exec_precision(const std::shared_ptr& node) -: jit_emitter(host, host_isa, get_arithmetic_binary_exec_precision(node)) {} -jit_add_emitter::jit_add_emitter(x64::jit_generator *host, x64::cpu_isa_t host_isa, ov::element::Type exec_prc) -: jit_emitter(host, host_isa, exec_prc) {} +jit_add_emitter::jit_add_emitter(x64::jit_generator* host, + x64::cpu_isa_t host_isa, + const std::shared_ptr& node) + : jit_emitter(host, host_isa, get_arithmetic_binary_exec_precision(node)) {} +jit_add_emitter::jit_add_emitter(x64::jit_generator* host, x64::cpu_isa_t host_isa, ov::element::Type exec_prc) + : jit_emitter(host, host_isa, exec_prc) {} -size_t jit_add_emitter::get_inputs_num() const { return 2; } +size_t jit_add_emitter::get_inputs_num() const { + return 2; +} -void jit_add_emitter::emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_add_emitter::emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const { if (host_isa_ == x64::sse41) { 
emit_isa(in_vec_idxs, out_vec_idxs); } else if (host_isa_ == x64::avx2) { @@ -51,7 +58,7 @@ void jit_add_emitter::emit_impl(const std::vector &in_vec_idxs, const st } template -void jit_add_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_add_emitter::emit_isa(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const { using Vmm = typename conditional3::type; Vmm vmm_src0 = Vmm(in_vec_idxs[0]); Vmm vmm_src1 = Vmm(in_vec_idxs[1]); @@ -59,9 +66,14 @@ void jit_add_emitter::emit_isa(const std::vector &in_vec_idxs, const std auto uni_vadd = [this](Vmm vmm_dst, Vmm vmm_src0, Vmm vmm_src1) { switch (exec_prc_) { - case ov::element::f32: h->uni_vaddps(vmm_dst, vmm_src0, vmm_src1); break; - case ov::element::i32: h->uni_vpaddd(vmm_dst, vmm_src0, vmm_src1); break; - default: OV_CPU_JIT_EMITTER_THROW("Unsupported precision"); + case ov::element::f32: + h->uni_vaddps(vmm_dst, vmm_src0, vmm_src1); + break; + case ov::element::i32: + h->uni_vpaddd(vmm_dst, vmm_src0, vmm_src1); + break; + default: + OV_CPU_JIT_EMITTER_THROW("Unsupported precision"); } }; @@ -78,14 +90,19 @@ std::set> jit_add_emitter::get_supported_precisions(c } /// MUL_ADD /// -jit_mul_add_emitter::jit_mul_add_emitter(x64::jit_generator *host, x64::cpu_isa_t host_isa, const std::shared_ptr& node) -: jit_emitter(host, host_isa, get_arithmetic_binary_exec_precision(node)) {} -jit_mul_add_emitter::jit_mul_add_emitter(x64::jit_generator *host, x64::cpu_isa_t host_isa, ov::element::Type exec_prc) -: jit_emitter(host, host_isa, exec_prc) {} +jit_mul_add_emitter::jit_mul_add_emitter(x64::jit_generator* host, + x64::cpu_isa_t host_isa, + const std::shared_ptr& node) + : jit_emitter(host, host_isa, get_arithmetic_binary_exec_precision(node)) {} +jit_mul_add_emitter::jit_mul_add_emitter(x64::jit_generator* host, x64::cpu_isa_t host_isa, ov::element::Type exec_prc) + : jit_emitter(host, host_isa, exec_prc) {} -size_t jit_mul_add_emitter::get_inputs_num() const { return 3; } +size_t jit_mul_add_emitter::get_inputs_num() const { + return 3; +} -void jit_mul_add_emitter::emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_mul_add_emitter::emit_impl(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { if (host_isa_ == x64::sse41) { emit_isa(in_vec_idxs, out_vec_idxs); } else if (host_isa_ == x64::avx2) { @@ -98,7 +115,8 @@ void jit_mul_add_emitter::emit_impl(const std::vector &in_vec_idxs, cons } template -void jit_mul_add_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_mul_add_emitter::emit_isa(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { using Vmm = typename conditional3::type; Vmm vmm_src0 = Vmm(in_vec_idxs[0]); Vmm vmm_src1 = Vmm(in_vec_idxs[1]); @@ -109,47 +127,49 @@ void jit_mul_add_emitter::emit_isa(const std::vector &in_vec_idxs, const auto uni_vfmadd231_xmm = [this](Xmm vmm_dst, Xmm vmm_src0, Xmm vmm_src1, Xmm vmm_src2) { h->uni_vmovups(vmm_dst, vmm_src0); switch (exec_prc_) { - case ov::element::f32: { - h->uni_vmulps(vmm_dst, vmm_dst, vmm_src1); - h->uni_vaddps(vmm_dst, vmm_dst, vmm_src2); - } break; - case ov::element::i32: { - h->uni_vpmulld(vmm_dst, vmm_dst, vmm_src1); - h->uni_vpaddd(vmm_dst, vmm_dst, vmm_src2); - } break; - default: OV_CPU_JIT_EMITTER_THROW("Unsupported precision"); + case ov::element::f32: { + h->uni_vmulps(vmm_dst, vmm_dst, vmm_src1); + h->uni_vaddps(vmm_dst, vmm_dst, vmm_src2); + } break; + case 
ov::element::i32: { + h->uni_vpmulld(vmm_dst, vmm_dst, vmm_src1); + h->uni_vpaddd(vmm_dst, vmm_dst, vmm_src2); + } break; + default: + OV_CPU_JIT_EMITTER_THROW("Unsupported precision"); } }; auto uni_vfmadd231_vmm = [this, vmm_aux0](Vmm vmm_dst, Vmm vmm_src0, Vmm vmm_src1, Vmm vmm_src2) { switch (exec_prc_) { - case ov::element::f32: { - Vmm vmm_mul0; - if (vmm_dst.getIdx() == vmm_src0.getIdx()) { - h->uni_vmovups(vmm_aux0, vmm_src0); - vmm_mul0 = vmm_aux0; - } else { - vmm_mul0 = vmm_src0; - } - - Vmm vmm_mul1; - if (vmm_dst.getIdx() == vmm_src1.getIdx()) { - h->uni_vmovups(vmm_aux0, vmm_src1); - vmm_mul1 = vmm_aux0; - } else { - vmm_mul1 = vmm_src1; - } - - if (vmm_dst.getIdx() != vmm_src2.getIdx()) - h->uni_vmovups(vmm_dst, vmm_src2); - - h->uni_vfmadd231ps(vmm_dst, vmm_mul0, vmm_mul1); - } break; - case ov::element::i32: { - h->uni_vpmulld(vmm_dst, vmm_src0, vmm_src1); - h->uni_vpaddd(vmm_dst, vmm_dst, vmm_src2); - } break; - default: OV_CPU_JIT_EMITTER_THROW("Unsupported precision"); + case ov::element::f32: { + Vmm vmm_mul0; + if (vmm_dst.getIdx() == vmm_src0.getIdx()) { + h->uni_vmovups(vmm_aux0, vmm_src0); + vmm_mul0 = vmm_aux0; + } else { + vmm_mul0 = vmm_src0; + } + + Vmm vmm_mul1; + if (vmm_dst.getIdx() == vmm_src1.getIdx()) { + h->uni_vmovups(vmm_aux0, vmm_src1); + vmm_mul1 = vmm_aux0; + } else { + vmm_mul1 = vmm_src1; + } + + if (vmm_dst.getIdx() != vmm_src2.getIdx()) + h->uni_vmovups(vmm_dst, vmm_src2); + + h->uni_vfmadd231ps(vmm_dst, vmm_mul0, vmm_mul1); + } break; + case ov::element::i32: { + h->uni_vpmulld(vmm_dst, vmm_src0, vmm_src1); + h->uni_vpaddd(vmm_dst, vmm_dst, vmm_src2); + } break; + default: + OV_CPU_JIT_EMITTER_THROW("Unsupported precision"); } }; @@ -164,19 +184,27 @@ size_t jit_mul_add_emitter::aux_vecs_count() const { return 1; } -std::set> jit_mul_add_emitter::get_supported_precisions(const std::shared_ptr& node) { +std::set> jit_mul_add_emitter::get_supported_precisions( + const std::shared_ptr& node) { return {{element::f32, element::f32, element::f32}, {element::i32, element::i32, element::i32}}; } /// SUB /// -jit_subtract_emitter::jit_subtract_emitter(x64::jit_generator *host, x64::cpu_isa_t host_isa, const std::shared_ptr& node) -: jit_emitter(host, host_isa, get_arithmetic_binary_exec_precision(node)) {} -jit_subtract_emitter::jit_subtract_emitter(x64::jit_generator *host, x64::cpu_isa_t host_isa, ov::element::Type exec_prc) -: jit_emitter(host, host_isa, exec_prc) {} +jit_subtract_emitter::jit_subtract_emitter(x64::jit_generator* host, + x64::cpu_isa_t host_isa, + const std::shared_ptr& node) + : jit_emitter(host, host_isa, get_arithmetic_binary_exec_precision(node)) {} +jit_subtract_emitter::jit_subtract_emitter(x64::jit_generator* host, + x64::cpu_isa_t host_isa, + ov::element::Type exec_prc) + : jit_emitter(host, host_isa, exec_prc) {} -size_t jit_subtract_emitter::get_inputs_num() const { return 2; } +size_t jit_subtract_emitter::get_inputs_num() const { + return 2; +} -void jit_subtract_emitter::emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_subtract_emitter::emit_impl(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { if (host_isa_ == x64::sse41) { emit_isa(in_vec_idxs, out_vec_idxs); } else if (host_isa_ == x64::avx2) { @@ -189,7 +217,8 @@ void jit_subtract_emitter::emit_impl(const std::vector &in_vec_idxs, con } template -void jit_subtract_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_subtract_emitter::emit_isa(const 
std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { using Vmm = typename conditional3::type; Vmm vmm_src0 = Vmm(in_vec_idxs[0]); Vmm vmm_src1 = Vmm(in_vec_idxs[1]); @@ -197,9 +226,14 @@ void jit_subtract_emitter::emit_isa(const std::vector &in_vec_idxs, cons auto uni_vsub = [this](Vmm vmm_dst, Vmm vmm_src0, Vmm vmm_src1) { switch (exec_prc_) { - case ov::element::f32: h->uni_vsubps(vmm_dst, vmm_src0, vmm_src1); break; - case ov::element::i32: h->uni_vpsubd(vmm_dst, vmm_src0, vmm_src1); break; - default: OV_CPU_JIT_EMITTER_THROW("Unsupported precision"); + case ov::element::f32: + h->uni_vsubps(vmm_dst, vmm_src0, vmm_src1); + break; + case ov::element::i32: + h->uni_vpsubd(vmm_dst, vmm_src0, vmm_src1); + break; + default: + OV_CPU_JIT_EMITTER_THROW("Unsupported precision"); } }; @@ -211,19 +245,27 @@ void jit_subtract_emitter::emit_isa(const std::vector &in_vec_idxs, cons } } -std::set> jit_subtract_emitter::get_supported_precisions(const std::shared_ptr& node) { +std::set> jit_subtract_emitter::get_supported_precisions( + const std::shared_ptr& node) { return {{element::f32, element::f32}, {element::i32, element::i32}}; } /// MULTIPLY /// -jit_multiply_emitter::jit_multiply_emitter(x64::jit_generator *host, x64::cpu_isa_t host_isa, const std::shared_ptr& node) -: jit_emitter(host, host_isa, get_arithmetic_binary_exec_precision(node)) {} -jit_multiply_emitter::jit_multiply_emitter(x64::jit_generator *host, x64::cpu_isa_t host_isa, ov::element::Type exec_prc) -: jit_emitter(host, host_isa, exec_prc) {} +jit_multiply_emitter::jit_multiply_emitter(x64::jit_generator* host, + x64::cpu_isa_t host_isa, + const std::shared_ptr& node) + : jit_emitter(host, host_isa, get_arithmetic_binary_exec_precision(node)) {} +jit_multiply_emitter::jit_multiply_emitter(x64::jit_generator* host, + x64::cpu_isa_t host_isa, + ov::element::Type exec_prc) + : jit_emitter(host, host_isa, exec_prc) {} -size_t jit_multiply_emitter::get_inputs_num() const { return 2; } +size_t jit_multiply_emitter::get_inputs_num() const { + return 2; +} -void jit_multiply_emitter::emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_multiply_emitter::emit_impl(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { if (host_isa_ == x64::sse41) { emit_isa(in_vec_idxs, out_vec_idxs); } else if (host_isa_ == x64::avx2) { @@ -236,7 +278,8 @@ void jit_multiply_emitter::emit_impl(const std::vector &in_vec_idxs, con } template -void jit_multiply_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_multiply_emitter::emit_isa(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { using Vmm = typename conditional3::type; Vmm vmm_src0 = Vmm(in_vec_idxs[0]); Vmm vmm_src1 = Vmm(in_vec_idxs[1]); @@ -244,9 +287,14 @@ void jit_multiply_emitter::emit_isa(const std::vector &in_vec_idxs, cons auto uni_vmul = [this](Vmm vmm_dst, Vmm vmm_src0, Vmm vmm_src1) { switch (exec_prc_) { - case ov::element::f32: h->uni_vmulps(vmm_dst, vmm_src0, vmm_src1); break; - case ov::element::i32: h->uni_vpmulld(vmm_dst, vmm_src0, vmm_src1); break; - default: OV_CPU_JIT_EMITTER_THROW("Unsupported precision"); + case ov::element::f32: + h->uni_vmulps(vmm_dst, vmm_src0, vmm_src1); + break; + case ov::element::i32: + h->uni_vpmulld(vmm_dst, vmm_src0, vmm_src1); + break; + default: + OV_CPU_JIT_EMITTER_THROW("Unsupported precision"); } }; @@ -258,19 +306,26 @@ void jit_multiply_emitter::emit_isa(const std::vector &in_vec_idxs, cons } } 
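
For context on the mul_add rewrite above: uni_vfmadd231ps(dst, a, b) computes dst = a * b + dst, so the emitter has to stage src2 in dst first and must preserve a multiplicand in vmm_aux0 whenever dst aliases src0 or src1. A minimal scalar model of that sequencing (illustrative only; register indices here are hypothetical):

// Scalar model of the FMA aliasing hazard handled by uni_vfmadd231_vmm.
// regs[] stands in for vector registers.
#include <cassert>

// dst = src0 * src1 + src2, sequenced the way the emitter must do it.
void fma231_safe(float* regs, int dst, int src0, int src1, int src2, int aux) {
    // vfmadd231(dst, a, b) semantics: regs[dst] = regs[a] * regs[b] + regs[dst]
    int a = src0, b = src1;
    if (dst == src0) { regs[aux] = regs[src0]; a = aux; }  // save multiplicand before dst is overwritten
    if (dst == src1) { regs[aux] = regs[src1]; b = aux; }
    if (dst != src2) regs[dst] = regs[src2];               // stage the addend in dst
    regs[dst] = regs[a] * regs[b] + regs[dst];             // the fused multiply-add itself
}

int main() {
    float regs[8] = {2.f, 3.f, 4.f};
    fma231_safe(regs, /*dst=*/0, /*src0=*/0, /*src1=*/1, /*src2=*/2, /*aux=*/4);  // dst aliases src0
    assert(regs[0] == 2.f * 3.f + 4.f);
    return 0;
}
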
-std::set> jit_multiply_emitter::get_supported_precisions(const std::shared_ptr& node) { +std::set> jit_multiply_emitter::get_supported_precisions( + const std::shared_ptr& node) { return {{element::f32, element::f32}, {element::i32, element::i32}}; } /// DIVIDE /// -jit_divide_emitter::jit_divide_emitter(x64::jit_generator *host, x64::cpu_isa_t host_isa, const std::shared_ptr& node, ov::element::Type exec_prc) -: jit_emitter(host, host_isa, get_arithmetic_binary_exec_precision(node)) {} -jit_divide_emitter::jit_divide_emitter(x64::jit_generator *host, x64::cpu_isa_t host_isa, ov::element::Type exec_prc) -: jit_emitter(host, host_isa, exec_prc) {} +jit_divide_emitter::jit_divide_emitter(x64::jit_generator* host, + x64::cpu_isa_t host_isa, + const std::shared_ptr& node, + ov::element::Type exec_prc) + : jit_emitter(host, host_isa, get_arithmetic_binary_exec_precision(node)) {} +jit_divide_emitter::jit_divide_emitter(x64::jit_generator* host, x64::cpu_isa_t host_isa, ov::element::Type exec_prc) + : jit_emitter(host, host_isa, exec_prc) {} -size_t jit_divide_emitter::get_inputs_num() const { return 2; } +size_t jit_divide_emitter::get_inputs_num() const { + return 2; +} -void jit_divide_emitter::emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_divide_emitter::emit_impl(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { if (host_isa_ == x64::sse41) { emit_isa(in_vec_idxs, out_vec_idxs); } else if (host_isa_ == x64::avx2) { @@ -283,7 +338,8 @@ void jit_divide_emitter::emit_impl(const std::vector &in_vec_idxs, const } template -void jit_divide_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_divide_emitter::emit_isa(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { using Vmm = typename conditional3::type; Vmm vmm_src0 = Vmm(in_vec_idxs[0]); Vmm vmm_src1 = Vmm(in_vec_idxs[1]); @@ -291,23 +347,24 @@ void jit_divide_emitter::emit_isa(const std::vector &in_vec_idxs, const auto uni_vdiv = [this](Vmm vmm_dst, Vmm vmm_src0, Vmm vmm_src1) { switch (exec_prc_) { - case ov::element::f32: { - h->uni_vdivps(vmm_dst, vmm_src0, vmm_src1); - break; - } - case ov::element::i32: { - Vmm vmm_aux0 = Vmm(aux_vec_idxs[0]); - - // The opset doesn't contain vector instruction for integer divide operation - // As WA we emulate its behavior via fp divide followed by rounding to zero - h->uni_vcvtdq2ps(vmm_dst, vmm_src0); - h->uni_vcvtdq2ps(vmm_aux0, vmm_src1); - h->uni_vdivps(vmm_dst, vmm_dst, vmm_aux0); - h->uni_vroundps(vmm_dst, vmm_dst, 3); // rounding to zero - h->uni_vcvtps2dq(vmm_dst, vmm_dst); - break; - } - default: OV_CPU_JIT_EMITTER_THROW("Unsupported precision"); + case ov::element::f32: { + h->uni_vdivps(vmm_dst, vmm_src0, vmm_src1); + break; + } + case ov::element::i32: { + Vmm vmm_aux0 = Vmm(aux_vec_idxs[0]); + + // The opset doesn't contain vector instruction for integer divide operation + // As WA we emulate its behavior via fp divide followed by rounding to zero + h->uni_vcvtdq2ps(vmm_dst, vmm_src0); + h->uni_vcvtdq2ps(vmm_aux0, vmm_src1); + h->uni_vdivps(vmm_dst, vmm_dst, vmm_aux0); + h->uni_vroundps(vmm_dst, vmm_dst, 3); // rounding to zero + h->uni_vcvtps2dq(vmm_dst, vmm_dst); + break; + } + default: + OV_CPU_JIT_EMITTER_THROW("Unsupported precision"); } }; @@ -319,7 +376,8 @@ void jit_divide_emitter::emit_isa(const std::vector &in_vec_idxs, const } } -std::set> jit_divide_emitter::get_supported_precisions(const std::shared_ptr& node) { +std::set> 
jit_divide_emitter::get_supported_precisions( + const std::shared_ptr& node) { return {{element::f32, element::f32}, {element::i32, element::i32}}; } @@ -328,18 +386,25 @@ size_t jit_divide_emitter::aux_vecs_count() const { } /// FLOOR /// -jit_floor_emitter::jit_floor_emitter(x64::jit_generator *host, x64::cpu_isa_t host_isa, const std::shared_ptr& node, ov::element::Type exec_prc) -: jit_emitter(host, host_isa, exec_prc) {} -jit_floor_emitter::jit_floor_emitter(x64::jit_generator *host, x64::cpu_isa_t host_isa, ov::element::Type exec_prc) -: jit_emitter(host, host_isa, exec_prc) {} +jit_floor_emitter::jit_floor_emitter(x64::jit_generator* host, + x64::cpu_isa_t host_isa, + const std::shared_ptr& node, + ov::element::Type exec_prc) + : jit_emitter(host, host_isa, exec_prc) {} +jit_floor_emitter::jit_floor_emitter(x64::jit_generator* host, x64::cpu_isa_t host_isa, ov::element::Type exec_prc) + : jit_emitter(host, host_isa, exec_prc) {} -size_t jit_floor_emitter::get_inputs_num() const { return 1; } +size_t jit_floor_emitter::get_inputs_num() const { + return 1; +} -std::set> jit_floor_emitter::get_supported_precisions(const std::shared_ptr& node) { +std::set> jit_floor_emitter::get_supported_precisions( + const std::shared_ptr& node) { return {{element::f32}}; } -void jit_floor_emitter::emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const { +void jit_floor_emitter::emit_impl(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { if (host_isa_ == x64::sse41) { emit_isa(in_vec_idxs, out_vec_idxs); } else if (host_isa_ == x64::avx2) { @@ -352,7 +417,8 @@ void jit_floor_emitter::emit_impl(const std::vector& in_vec_idxs, const } template -void jit_floor_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_floor_emitter::emit_isa(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { using Vmm = typename conditional3::type; Vmm vmm_src = Vmm(in_vec_idxs[0]); Vmm vmm_dst = Vmm(out_vec_idxs[0]); @@ -360,14 +426,20 @@ void jit_floor_emitter::emit_isa(const std::vector &in_vec_idxs, const s } /// CEILING /// -jit_ceiling_emitter::jit_ceiling_emitter(x64::jit_generator *host, x64::cpu_isa_t host_isa, const std::shared_ptr& node, ov::element::Type exec_prc) -: jit_emitter(host, host_isa, exec_prc) {} -jit_ceiling_emitter::jit_ceiling_emitter(x64::jit_generator *host, x64::cpu_isa_t host_isa, ov::element::Type exec_prc) +jit_ceiling_emitter::jit_ceiling_emitter(x64::jit_generator* host, + x64::cpu_isa_t host_isa, + const std::shared_ptr& node, + ov::element::Type exec_prc) + : jit_emitter(host, host_isa, exec_prc) {} +jit_ceiling_emitter::jit_ceiling_emitter(x64::jit_generator* host, x64::cpu_isa_t host_isa, ov::element::Type exec_prc) : jit_emitter(host, host_isa, exec_prc) {} -size_t jit_ceiling_emitter::get_inputs_num() const { return 1; } +size_t jit_ceiling_emitter::get_inputs_num() const { + return 1; +} -std::set> jit_ceiling_emitter::get_supported_precisions(const std::shared_ptr& node) { +std::set> jit_ceiling_emitter::get_supported_precisions( + const std::shared_ptr& node) { return {{element::f32}}; } @@ -385,7 +457,8 @@ void jit_ceiling_emitter::emit_impl(const std::vector& in_vec_idxs, } template -void jit_ceiling_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_ceiling_emitter::emit_isa(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { using Vmm = typename conditional3::type; Vmm vmm_src = 
Vmm(in_vec_idxs[0]); Vmm vmm_dst = Vmm(out_vec_idxs[0]); @@ -403,13 +476,17 @@ jit_floor_mod_emitter::jit_floor_mod_emitter(x64::jit_generator* host, ov::element::Type exec_prc) : jit_emitter(host, host_isa, exec_prc) {} -size_t jit_floor_mod_emitter::get_inputs_num() const { return 2; } +size_t jit_floor_mod_emitter::get_inputs_num() const { + return 2; +} -std::set> jit_floor_mod_emitter::get_supported_precisions(const std::shared_ptr& node) { +std::set> jit_floor_mod_emitter::get_supported_precisions( + const std::shared_ptr& node) { return {{element::f32, element::f32}}; } -void jit_floor_mod_emitter::emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const { +void jit_floor_mod_emitter::emit_impl(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { if (host_isa_ == x64::sse41) { emit_isa(in_vec_idxs, out_vec_idxs); } else if (host_isa_ == x64::avx2) { @@ -422,7 +499,8 @@ void jit_floor_mod_emitter::emit_impl(const std::vector& in_vec_idxs, co } template -void jit_floor_mod_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_floor_mod_emitter::emit_isa(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { using Vmm = typename conditional3::type; Vmm vmm_src0 = Vmm(in_vec_idxs[0]); Vmm vmm_src1 = Vmm(in_vec_idxs[1]); @@ -434,14 +512,14 @@ void jit_floor_mod_emitter::emit_isa(const std::vector &in_vec_idxs, con h->uni_vmovups(vmm_dst, vmm_src0); h->uni_vmovups(vmm_aux0, vmm_src0); h->uni_vdivps(vmm_aux0, vmm_aux0, vmm_src1); - h->uni_vroundps(vmm_aux0, vmm_aux0, 1); // rounding down + h->uni_vroundps(vmm_aux0, vmm_aux0, 1); // rounding down h->uni_vmulps(vmm_aux0, vmm_aux0, vmm_src1); h->uni_vsubps(vmm_dst, vmm_dst, vmm_aux0); } else { if (vmm_dst.getIdx() != vmm_src0.getIdx()) h->uni_vmovups(vmm_dst, vmm_src0); h->uni_vdivps(vmm_aux0, vmm_src0, vmm_src1); - h->uni_vroundps(vmm_aux0, vmm_aux0, 1); // rounding down + h->uni_vroundps(vmm_aux0, vmm_aux0, 1); // rounding down h->uni_vmulps(vmm_aux0, vmm_aux0, vmm_src1); h->uni_vsubps(vmm_dst, vmm_dst, vmm_aux0); } @@ -452,12 +530,17 @@ size_t jit_floor_mod_emitter::aux_vecs_count() const { } /// MOD /// -jit_mod_emitter::jit_mod_emitter(x64::jit_generator *host, x64::cpu_isa_t host_isa, const std::shared_ptr& node, ov::element::Type exec_prc) -: jit_emitter(host, host_isa, exec_prc) {} -jit_mod_emitter::jit_mod_emitter(x64::jit_generator *host, x64::cpu_isa_t host_isa, ov::element::Type exec_prc) -: jit_emitter(host, host_isa, exec_prc) {} +jit_mod_emitter::jit_mod_emitter(x64::jit_generator* host, + x64::cpu_isa_t host_isa, + const std::shared_ptr& node, + ov::element::Type exec_prc) + : jit_emitter(host, host_isa, exec_prc) {} +jit_mod_emitter::jit_mod_emitter(x64::jit_generator* host, x64::cpu_isa_t host_isa, ov::element::Type exec_prc) + : jit_emitter(host, host_isa, exec_prc) {} -size_t jit_mod_emitter::get_inputs_num() const { return 2; } +size_t jit_mod_emitter::get_inputs_num() const { + return 2; +} std::set> jit_mod_emitter::get_supported_precisions(const std::shared_ptr& node) { return {{element::f32, element::f32}}; @@ -476,7 +559,7 @@ void jit_mod_emitter::emit_impl(const std::vector& in_vec_idxs, const st } template -void jit_mod_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_mod_emitter::emit_isa(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const { using Vmm = typename conditional3::type; Vmm vmm_src0 = Vmm(in_vec_idxs[0]); Vmm vmm_src1 = 
Vmm(in_vec_idxs[1]); @@ -488,14 +571,14 @@ void jit_mod_emitter::emit_isa(const std::vector &in_vec_idxs, const std h->uni_vmovups(vmm_dst, vmm_src0); h->uni_vmovups(vmm_aux0, vmm_src0); h->uni_vdivps(vmm_aux0, vmm_aux0, vmm_src1); - h->uni_vroundps(vmm_aux0, vmm_aux0, 3); // truncate + h->uni_vroundps(vmm_aux0, vmm_aux0, 3); // truncate h->uni_vmulps(vmm_aux0, vmm_aux0, vmm_src1); h->uni_vsubps(vmm_dst, vmm_dst, vmm_aux0); } else { if (vmm_dst.getIdx() != vmm_src0.getIdx()) h->uni_vmovups(vmm_dst, vmm_src0); h->uni_vdivps(vmm_aux0, vmm_src0, vmm_src1); - h->uni_vroundps(vmm_aux0, vmm_aux0, 3); // truncate + h->uni_vroundps(vmm_aux0, vmm_aux0, 3); // truncate h->uni_vmulps(vmm_aux0, vmm_aux0, vmm_src1); h->uni_vsubps(vmm_dst, vmm_dst, vmm_aux0); } @@ -506,14 +589,19 @@ size_t jit_mod_emitter::aux_vecs_count() const { } /// MAXIMUM /// -jit_maximum_emitter::jit_maximum_emitter(x64::jit_generator *host, x64::cpu_isa_t host_isa, const std::shared_ptr& node) -: jit_emitter(host, host_isa, get_arithmetic_binary_exec_precision(node)) {} -jit_maximum_emitter::jit_maximum_emitter(x64::jit_generator *host, x64::cpu_isa_t host_isa, ov::element::Type exec_prc) -: jit_emitter(host, host_isa, exec_prc) {} +jit_maximum_emitter::jit_maximum_emitter(x64::jit_generator* host, + x64::cpu_isa_t host_isa, + const std::shared_ptr& node) + : jit_emitter(host, host_isa, get_arithmetic_binary_exec_precision(node)) {} +jit_maximum_emitter::jit_maximum_emitter(x64::jit_generator* host, x64::cpu_isa_t host_isa, ov::element::Type exec_prc) + : jit_emitter(host, host_isa, exec_prc) {} -size_t jit_maximum_emitter::get_inputs_num() const { return 2; } +size_t jit_maximum_emitter::get_inputs_num() const { + return 2; +} -void jit_maximum_emitter::emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_maximum_emitter::emit_impl(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { if (host_isa_ == x64::sse41) { emit_isa(in_vec_idxs, out_vec_idxs); } else if (host_isa_ == x64::avx2) { @@ -526,7 +614,8 @@ void jit_maximum_emitter::emit_impl(const std::vector &in_vec_idxs, cons } template -void jit_maximum_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_maximum_emitter::emit_isa(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { using Vmm = typename conditional3::type; Vmm vmm_src0 = Vmm(in_vec_idxs[0]); Vmm vmm_src1 = Vmm(in_vec_idxs[1]); @@ -534,9 +623,14 @@ void jit_maximum_emitter::emit_isa(const std::vector &in_vec_idxs, const auto uni_vmax = [this](Vmm vmm_dst, Vmm vmm_src0, Vmm vmm_src1) { switch (exec_prc_) { - case ov::element::f32: h->uni_vmaxps(vmm_dst, vmm_src0, vmm_src1); break; - case ov::element::i32: h->uni_vpmaxsd(vmm_dst, vmm_src0, vmm_src1); break; - default: OV_CPU_JIT_EMITTER_THROW("Unsupported precision"); + case ov::element::f32: + h->uni_vmaxps(vmm_dst, vmm_src0, vmm_src1); + break; + case ov::element::i32: + h->uni_vpmaxsd(vmm_dst, vmm_src0, vmm_src1); + break; + default: + OV_CPU_JIT_EMITTER_THROW("Unsupported precision"); } }; @@ -549,19 +643,25 @@ void jit_maximum_emitter::emit_isa(const std::vector &in_vec_idxs, const } } -std::set> jit_maximum_emitter::get_supported_precisions(const std::shared_ptr& node) { +std::set> jit_maximum_emitter::get_supported_precisions( + const std::shared_ptr& node) { return {{element::f32, element::f32}, {element::i32, element::i32}}; } /// MINIMUM /// -jit_minimum_emitter::jit_minimum_emitter(x64::jit_generator *host, 
x64::cpu_isa_t host_isa, const std::shared_ptr& node) -: jit_emitter(host, host_isa, get_arithmetic_binary_exec_precision(node)) {} -jit_minimum_emitter::jit_minimum_emitter(x64::jit_generator *host, x64::cpu_isa_t host_isa, ov::element::Type exec_prc) -: jit_emitter(host, host_isa, exec_prc) {} +jit_minimum_emitter::jit_minimum_emitter(x64::jit_generator* host, + x64::cpu_isa_t host_isa, + const std::shared_ptr& node) + : jit_emitter(host, host_isa, get_arithmetic_binary_exec_precision(node)) {} +jit_minimum_emitter::jit_minimum_emitter(x64::jit_generator* host, x64::cpu_isa_t host_isa, ov::element::Type exec_prc) + : jit_emitter(host, host_isa, exec_prc) {} -size_t jit_minimum_emitter::get_inputs_num() const { return 2; } +size_t jit_minimum_emitter::get_inputs_num() const { + return 2; +} -void jit_minimum_emitter::emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_minimum_emitter::emit_impl(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { if (host_isa_ == x64::sse41) { emit_isa(in_vec_idxs, out_vec_idxs); } else if (host_isa_ == x64::avx2) { @@ -574,7 +674,8 @@ void jit_minimum_emitter::emit_impl(const std::vector &in_vec_idxs, cons } template -void jit_minimum_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_minimum_emitter::emit_isa(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { using Vmm = typename conditional3::type; Vmm vmm_src0 = Vmm(in_vec_idxs[0]); Vmm vmm_src1 = Vmm(in_vec_idxs[1]); @@ -582,9 +683,14 @@ void jit_minimum_emitter::emit_isa(const std::vector &in_vec_idxs, const auto uni_vmin = [this](Vmm vmm_dst, Vmm vmm_src0, Vmm vmm_src1) { switch (exec_prc_) { - case ov::element::f32: h->uni_vminps(vmm_dst, vmm_src0, vmm_src1); break; - case ov::element::i32: h->uni_vpminsd(vmm_dst, vmm_src0, vmm_src1); break; - default: OV_CPU_JIT_EMITTER_THROW("Unsupported precision"); + case ov::element::f32: + h->uni_vminps(vmm_dst, vmm_src0, vmm_src1); + break; + case ov::element::i32: + h->uni_vpminsd(vmm_dst, vmm_src0, vmm_src1); + break; + default: + OV_CPU_JIT_EMITTER_THROW("Unsupported precision"); } }; @@ -597,20 +703,28 @@ void jit_minimum_emitter::emit_isa(const std::vector &in_vec_idxs, const } } -std::set> jit_minimum_emitter::get_supported_precisions(const std::shared_ptr& node) { +std::set> jit_minimum_emitter::get_supported_precisions( + const std::shared_ptr& node) { return {{element::f32, element::f32}, {element::i32, element::i32}}; } /// SQUARED_DIFFERENCE /// -jit_squared_difference_emitter::jit_squared_difference_emitter( - x64::jit_generator *host, x64::cpu_isa_t host_isa, const std::shared_ptr& node, ov::element::Type exec_prc) -: jit_emitter(host, host_isa, exec_prc) {} -jit_squared_difference_emitter::jit_squared_difference_emitter(x64::jit_generator *host, x64::cpu_isa_t host_isa, ov::element::Type exec_prc) -: jit_emitter(host, host_isa, exec_prc) {} +jit_squared_difference_emitter::jit_squared_difference_emitter(x64::jit_generator* host, + x64::cpu_isa_t host_isa, + const std::shared_ptr& node, + ov::element::Type exec_prc) + : jit_emitter(host, host_isa, exec_prc) {} +jit_squared_difference_emitter::jit_squared_difference_emitter(x64::jit_generator* host, + x64::cpu_isa_t host_isa, + ov::element::Type exec_prc) + : jit_emitter(host, host_isa, exec_prc) {} -size_t jit_squared_difference_emitter::get_inputs_num() const { return 2; } +size_t jit_squared_difference_emitter::get_inputs_num() const { + return 2; +} -void 
jit_squared_difference_emitter::emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_squared_difference_emitter::emit_impl(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { if (host_isa_ == x64::sse41) { emit_isa(in_vec_idxs, out_vec_idxs); } else if (host_isa_ == x64::avx2) { @@ -623,7 +737,8 @@ void jit_squared_difference_emitter::emit_impl(const std::vector &in_vec } template -void jit_squared_difference_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_squared_difference_emitter::emit_isa(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { using Vmm = typename conditional3::type; Vmm vmm_src0 = Vmm(in_vec_idxs[0]); Vmm vmm_src1 = Vmm(in_vec_idxs[1]); @@ -631,15 +746,16 @@ void jit_squared_difference_emitter::emit_isa(const std::vector &in_vec_ auto uni_vsqdiff = [this](Vmm vmm_dst, Vmm vmm_src0, Vmm vmm_src1) { switch (exec_prc_) { - case ov::element::f32: { - h->uni_vsubps(vmm_dst, vmm_src0, vmm_src1); - h->uni_vmulps(vmm_dst, vmm_dst, vmm_dst); - } break; - case ov::element::i32: { - h->uni_vpsubd(vmm_dst, vmm_src0, vmm_src1); - h->uni_vpmulld(vmm_dst, vmm_dst, vmm_dst); - } break; - default: OV_CPU_JIT_EMITTER_THROW("Unsupported precision"); + case ov::element::f32: { + h->uni_vsubps(vmm_dst, vmm_src0, vmm_src1); + h->uni_vmulps(vmm_dst, vmm_dst, vmm_dst); + } break; + case ov::element::i32: { + h->uni_vpsubd(vmm_dst, vmm_src0, vmm_src1); + h->uni_vpmulld(vmm_dst, vmm_dst, vmm_dst); + } break; + default: + OV_CPU_JIT_EMITTER_THROW("Unsupported precision"); } }; @@ -652,24 +768,33 @@ void jit_squared_difference_emitter::emit_isa(const std::vector &in_vec_ } } -std::set> jit_squared_difference_emitter::get_supported_precisions(const std::shared_ptr& node) { +std::set> jit_squared_difference_emitter::get_supported_precisions( + const std::shared_ptr& node) { return {{element::f32, element::f32}, {element::i32, element::i32}}; } /// POWER_DYNAMIC /// -jit_power_dynamic_emitter::jit_power_dynamic_emitter(x64::jit_generator *host, x64::cpu_isa_t host_isa, const std::shared_ptr& node, +jit_power_dynamic_emitter::jit_power_dynamic_emitter(x64::jit_generator* host, + x64::cpu_isa_t host_isa, + const std::shared_ptr& node, ov::element::Type exec_prc) : jit_emitter(host, host_isa, exec_prc) {} -jit_power_dynamic_emitter::jit_power_dynamic_emitter(x64::jit_generator *host, x64::cpu_isa_t host_isa, ov::element::Type exec_prc) +jit_power_dynamic_emitter::jit_power_dynamic_emitter(x64::jit_generator* host, + x64::cpu_isa_t host_isa, + ov::element::Type exec_prc) : jit_emitter(host, host_isa, exec_prc) {} -size_t jit_power_dynamic_emitter::get_inputs_num() const { return 2; } +size_t jit_power_dynamic_emitter::get_inputs_num() const { + return 2; +} -std::set> jit_power_dynamic_emitter::get_supported_precisions(const std::shared_ptr& node) { +std::set> jit_power_dynamic_emitter::get_supported_precisions( + const std::shared_ptr& node) { return {{element::f32, element::f32}}; } -void jit_power_dynamic_emitter::emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const { +void jit_power_dynamic_emitter::emit_impl(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { if (host_isa_ == x64::sse41) { emit_isa(in_vec_idxs, out_vec_idxs); } else if (host_isa_ == x64::avx2) { @@ -682,7 +807,8 @@ void jit_power_dynamic_emitter::emit_impl(const std::vector& in_vec_idxs } template -void jit_power_dynamic_emitter::emit_isa(const 
std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_power_dynamic_emitter::emit_isa(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { using Vmm = typename conditional3::type; Vmm vmm_src0 = Vmm(in_vec_idxs[0]); Vmm vmm_src1 = Vmm(in_vec_idxs[1]); @@ -692,8 +818,8 @@ void jit_power_dynamic_emitter::emit_isa(const std::vector &in_vec_idxs, // caller obligation to save gprs as callee may use them size_t gpr_size = 8; - Xbyak::Operand gprs_to_save[] = {h->r8, h->r9, h->r10, h->r11, h->rax, - h->rcx, h->rdx, h->rdi, h->rsi, h->rbp, h->rbx}; + Xbyak::Operand gprs_to_save[] = + {h->r8, h->r9, h->r10, h->r11, h->rax, h->rcx, h->rdx, h->rdi, h->rsi, h->rbp, h->rbx}; size_t n_gprs_to_save = sizeof(gprs_to_save) / sizeof(gprs_to_save[0]); h->sub(h->rsp, n_gprs_to_save * gpr_size); @@ -722,8 +848,8 @@ void jit_power_dynamic_emitter::emit_isa(const std::vector &in_vec_idxs, h->sub(h->rsp, (get_max_vecs_count() + 2) * get_vec_length()); for (size_t i = 2; i < get_max_vecs_count() + 2; ++i) h->uni_vmovups(h->ptr[h->rsp + i * get_vec_length()], Vmm(i - 2)); - h->uni_vmovups(h->ptr[h->rsp + 0 * get_vec_length()], vmm_src0); // src - h->uni_vmovups(h->ptr[h->rsp + 1 * get_vec_length()], vmm_src1); // beta + h->uni_vmovups(h->ptr[h->rsp + 0 * get_vec_length()], vmm_src0); // src + h->uni_vmovups(h->ptr[h->rsp + 1 * get_vec_length()], vmm_src1); // beta // save function address in gpr to pass in in call instruction h->mov(h->rbp, reinterpret_cast(powf)); @@ -735,7 +861,7 @@ void jit_power_dynamic_emitter::emit_isa(const std::vector &in_vec_idxs, // Take src, apply powf on it and replace value on a stack with dst. for (size_t i = 0; i < get_vec_length() / sizeof(float); ++i) { - const Address &source = h->ptr[h->rsp + h->rbx + i * sizeof(float)]; + const Address& source = h->ptr[h->rsp + h->rbx + i * sizeof(float)]; h->uni_vmovss(xmm0, source); h->uni_vmovss(xmm1, h->ptr[h->rsp + h->rbx + get_vec_length() + i * sizeof(float)]); h->call(h->rbp); @@ -767,24 +893,30 @@ void jit_power_dynamic_emitter::emit_isa(const std::vector &in_vec_idxs, h->add(h->rsp, n_gprs_to_save * gpr_size); } - /// EQUAL /// -jit_equal_emitter::jit_equal_emitter(x64::jit_generator *host, x64::cpu_isa_t host_isa, const std::shared_ptr& node, ov::element::Type exec_prc) -: jit_emitter(host, host_isa, exec_prc) { +jit_equal_emitter::jit_equal_emitter(x64::jit_generator* host, + x64::cpu_isa_t host_isa, + const std::shared_ptr& node, + ov::element::Type exec_prc) + : jit_emitter(host, host_isa, exec_prc) { prepare_table(); } -jit_equal_emitter::jit_equal_emitter(x64::jit_generator *host, x64::cpu_isa_t host_isa, ov::element::Type exec_prc) -: jit_emitter(host, host_isa, exec_prc) { +jit_equal_emitter::jit_equal_emitter(x64::jit_generator* host, x64::cpu_isa_t host_isa, ov::element::Type exec_prc) + : jit_emitter(host, host_isa, exec_prc) { prepare_table(); } -size_t jit_equal_emitter::get_inputs_num() const { return 2; } +size_t jit_equal_emitter::get_inputs_num() const { + return 2; +} -std::set> jit_equal_emitter::get_supported_precisions(const std::shared_ptr& node) { +std::set> jit_equal_emitter::get_supported_precisions( + const std::shared_ptr& node) { return {{element::f32, element::f32}}; } -void jit_equal_emitter::emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const { +void jit_equal_emitter::emit_impl(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { if (host_isa_ == x64::sse41) { emit_isa(in_vec_idxs, out_vec_idxs); } else if 
(host_isa_ == x64::avx2) { @@ -797,7 +929,8 @@ void jit_equal_emitter::emit_impl(const std::vector& in_vec_idxs, const } template -void jit_equal_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_equal_emitter::emit_isa(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { using Vmm = typename conditional3::type; Vmm vmm_src0 = Vmm(in_vec_idxs[0]); Vmm vmm_src1 = Vmm(in_vec_idxs[1]); @@ -846,13 +979,17 @@ jit_not_equal_emitter::jit_not_equal_emitter(x64::jit_generator* host, prepare_table(); } -size_t jit_not_equal_emitter::get_inputs_num() const { return 2; } +size_t jit_not_equal_emitter::get_inputs_num() const { + return 2; +} -std::set> jit_not_equal_emitter::get_supported_precisions(const std::shared_ptr& node) { +std::set> jit_not_equal_emitter::get_supported_precisions( + const std::shared_ptr& node) { return {{element::f32, element::f32}}; } -void jit_not_equal_emitter::emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const { +void jit_not_equal_emitter::emit_impl(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { if (host_isa_ == x64::sse41) { emit_isa(in_vec_idxs, out_vec_idxs); } else if (host_isa_ == x64::avx2) { @@ -865,7 +1002,8 @@ void jit_not_equal_emitter::emit_impl(const std::vector& in_vec_idxs, co } template -void jit_not_equal_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_not_equal_emitter::emit_isa(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { using Vmm = typename conditional3::type; Vmm vmm_src0 = Vmm(in_vec_idxs[0]); Vmm vmm_src1 = Vmm(in_vec_idxs[1]); @@ -900,22 +1038,29 @@ size_t jit_not_equal_emitter::aux_vecs_count() const { } /// GREATER /// -jit_greater_emitter::jit_greater_emitter(x64::jit_generator *host, x64::cpu_isa_t host_isa, const std::shared_ptr& node, ov::element::Type exec_prc) -: jit_emitter(host, host_isa, exec_prc) { +jit_greater_emitter::jit_greater_emitter(x64::jit_generator* host, + x64::cpu_isa_t host_isa, + const std::shared_ptr& node, + ov::element::Type exec_prc) + : jit_emitter(host, host_isa, exec_prc) { prepare_table(); } -jit_greater_emitter::jit_greater_emitter(x64::jit_generator *host, x64::cpu_isa_t host_isa, ov::element::Type exec_prc) -: jit_emitter(host, host_isa, exec_prc) { +jit_greater_emitter::jit_greater_emitter(x64::jit_generator* host, x64::cpu_isa_t host_isa, ov::element::Type exec_prc) + : jit_emitter(host, host_isa, exec_prc) { prepare_table(); } -size_t jit_greater_emitter::get_inputs_num() const { return 2; } +size_t jit_greater_emitter::get_inputs_num() const { + return 2; +} -std::set> jit_greater_emitter::get_supported_precisions(const std::shared_ptr& node) { +std::set> jit_greater_emitter::get_supported_precisions( + const std::shared_ptr& node) { return {{element::f32, element::f32}}; } -void jit_greater_emitter::emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const { +void jit_greater_emitter::emit_impl(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { if (host_isa_ == x64::sse41) { emit_isa(in_vec_idxs, out_vec_idxs); } else if (host_isa_ == x64::avx2) { @@ -928,7 +1073,8 @@ void jit_greater_emitter::emit_impl(const std::vector& in_vec_idxs, cons } template -void jit_greater_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_greater_emitter::emit_isa(const std::vector& in_vec_idxs, + const std::vector& 
out_vec_idxs) const { using Vmm = typename conditional3::type; Vmm vmm_src0 = Vmm(in_vec_idxs[0]); Vmm vmm_src1 = Vmm(in_vec_idxs[1]); @@ -963,23 +1109,31 @@ size_t jit_greater_emitter::aux_vecs_count() const { } /// GREATER_EQUAL /// -jit_greater_equal_emitter::jit_greater_equal_emitter(x64::jit_generator *host, x64::cpu_isa_t host_isa, const std::shared_ptr& node, +jit_greater_equal_emitter::jit_greater_equal_emitter(x64::jit_generator* host, + x64::cpu_isa_t host_isa, + const std::shared_ptr& node, ov::element::Type exec_prc) -: jit_emitter(host, host_isa, exec_prc) { + : jit_emitter(host, host_isa, exec_prc) { prepare_table(); } -jit_greater_equal_emitter::jit_greater_equal_emitter(x64::jit_generator *host, x64::cpu_isa_t host_isa, ov::element::Type exec_prc) -: jit_emitter(host, host_isa, exec_prc) { +jit_greater_equal_emitter::jit_greater_equal_emitter(x64::jit_generator* host, + x64::cpu_isa_t host_isa, + ov::element::Type exec_prc) + : jit_emitter(host, host_isa, exec_prc) { prepare_table(); } -size_t jit_greater_equal_emitter::get_inputs_num() const { return 2; } +size_t jit_greater_equal_emitter::get_inputs_num() const { + return 2; +} -std::set> jit_greater_equal_emitter::get_supported_precisions(const std::shared_ptr& node) { +std::set> jit_greater_equal_emitter::get_supported_precisions( + const std::shared_ptr& node) { return {{element::f32, element::f32}}; } -void jit_greater_equal_emitter::emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const { +void jit_greater_equal_emitter::emit_impl(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { if (host_isa_ == x64::sse41) { emit_isa(in_vec_idxs, out_vec_idxs); } else if (host_isa_ == x64::avx2) { @@ -992,7 +1146,8 @@ void jit_greater_equal_emitter::emit_impl(const std::vector& in_vec_idxs } template -void jit_greater_equal_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_greater_equal_emitter::emit_isa(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { using Vmm = typename conditional3::type; Vmm vmm_src0 = Vmm(in_vec_idxs[0]); Vmm vmm_src1 = Vmm(in_vec_idxs[1]); @@ -1027,22 +1182,28 @@ size_t jit_greater_equal_emitter::aux_vecs_count() const { } /// LESS /// -jit_less_emitter::jit_less_emitter(x64::jit_generator *host, x64::cpu_isa_t host_isa, const std::shared_ptr& node, ov::element::Type exec_prc) -: jit_emitter(host, host_isa, exec_prc) { +jit_less_emitter::jit_less_emitter(x64::jit_generator* host, + x64::cpu_isa_t host_isa, + const std::shared_ptr& node, + ov::element::Type exec_prc) + : jit_emitter(host, host_isa, exec_prc) { prepare_table(); } -jit_less_emitter::jit_less_emitter(x64::jit_generator *host, x64::cpu_isa_t host_isa, ov::element::Type exec_prc) -: jit_emitter(host, host_isa, exec_prc) { +jit_less_emitter::jit_less_emitter(x64::jit_generator* host, x64::cpu_isa_t host_isa, ov::element::Type exec_prc) + : jit_emitter(host, host_isa, exec_prc) { prepare_table(); } -size_t jit_less_emitter::get_inputs_num() const { return 2; } +size_t jit_less_emitter::get_inputs_num() const { + return 2; +} std::set> jit_less_emitter::get_supported_precisions(const std::shared_ptr& node) { return {{element::f32, element::f32}}; } -void jit_less_emitter::emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const { +void jit_less_emitter::emit_impl(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { if (host_isa_ == x64::sse41) { emit_isa(in_vec_idxs, 
out_vec_idxs); } else if (host_isa_ == x64::avx2) { @@ -1055,7 +1216,7 @@ void jit_less_emitter::emit_impl(const std::vector& in_vec_idxs, const s } template -void jit_less_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_less_emitter::emit_isa(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const { using Vmm = typename conditional3::type; Vmm vmm_src0 = Vmm(in_vec_idxs[0]); Vmm vmm_src1 = Vmm(in_vec_idxs[1]); @@ -1104,13 +1265,17 @@ jit_less_equal_emitter::jit_less_equal_emitter(x64::jit_generator* host, prepare_table(); } -size_t jit_less_equal_emitter::get_inputs_num() const { return 2; } +size_t jit_less_equal_emitter::get_inputs_num() const { + return 2; +} -std::set> jit_less_equal_emitter::get_supported_precisions(const std::shared_ptr& node) { +std::set> jit_less_equal_emitter::get_supported_precisions( + const std::shared_ptr& node) { return {{element::f32, element::f32}}; } -void jit_less_equal_emitter::emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const { +void jit_less_equal_emitter::emit_impl(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { if (host_isa_ == x64::sse41) { emit_isa(in_vec_idxs, out_vec_idxs); } else if (host_isa_ == x64::avx2) { @@ -1123,7 +1288,8 @@ void jit_less_equal_emitter::emit_impl(const std::vector& in_vec_idxs, c } template -void jit_less_equal_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_less_equal_emitter::emit_isa(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { using Vmm = typename conditional3::type; Vmm vmm_src0 = Vmm(in_vec_idxs[0]); Vmm vmm_src1 = Vmm(in_vec_idxs[1]); @@ -1173,13 +1339,17 @@ jit_logical_and_emitter::jit_logical_and_emitter(x64::jit_generator* host, prepare_table(); } -size_t jit_logical_and_emitter::get_inputs_num() const { return 2; } +size_t jit_logical_and_emitter::get_inputs_num() const { + return 2; +} -std::set> jit_logical_and_emitter::get_supported_precisions(const std::shared_ptr& node) { +std::set> jit_logical_and_emitter::get_supported_precisions( + const std::shared_ptr& node) { return {{element::f32, element::f32}}; } -void jit_logical_and_emitter::emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const { +void jit_logical_and_emitter::emit_impl(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { if (host_isa_ == x64::sse41) { emit_isa(in_vec_idxs, out_vec_idxs); } else if (host_isa_ == x64::avx2) { @@ -1192,7 +1362,8 @@ void jit_logical_and_emitter::emit_impl(const std::vector& in_vec_idxs, } template -void jit_logical_and_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_logical_and_emitter::emit_isa(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { using Vmm = typename conditional3::type; Vmm vmm_src0 = Vmm(in_vec_idxs[0]); Vmm vmm_src1 = Vmm(in_vec_idxs[1]); @@ -1261,13 +1432,17 @@ jit_logical_or_emitter::jit_logical_or_emitter(x64::jit_generator* host, prepare_table(); } -size_t jit_logical_or_emitter::get_inputs_num() const { return 2; } +size_t jit_logical_or_emitter::get_inputs_num() const { + return 2; +} -std::set> jit_logical_or_emitter::get_supported_precisions(const std::shared_ptr& node) { +std::set> jit_logical_or_emitter::get_supported_precisions( + const std::shared_ptr& node) { return {{element::f32, element::f32}}; } -void jit_logical_or_emitter::emit_impl(const 
std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const { +void jit_logical_or_emitter::emit_impl(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { if (host_isa_ == x64::sse41) { emit_isa(in_vec_idxs, out_vec_idxs); } else if (host_isa_ == x64::avx2) { @@ -1280,7 +1455,8 @@ void jit_logical_or_emitter::emit_impl(const std::vector& in_vec_idxs, c } template -void jit_logical_or_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_logical_or_emitter::emit_isa(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { using Vmm = typename conditional3::type; Vmm vmm_src0 = Vmm(in_vec_idxs[0]); Vmm vmm_src1 = Vmm(in_vec_idxs[1]); @@ -1349,13 +1525,17 @@ jit_logical_xor_emitter::jit_logical_xor_emitter(x64::jit_generator* host, prepare_table(); } -size_t jit_logical_xor_emitter::get_inputs_num() const { return 2; } +size_t jit_logical_xor_emitter::get_inputs_num() const { + return 2; +} -std::set> jit_logical_xor_emitter::get_supported_precisions(const std::shared_ptr& node) { +std::set> jit_logical_xor_emitter::get_supported_precisions( + const std::shared_ptr& node) { return {{element::f32, element::f32}}; } -void jit_logical_xor_emitter::emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const { +void jit_logical_xor_emitter::emit_impl(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { if (host_isa_ == x64::sse41) { emit_isa(in_vec_idxs, out_vec_idxs); } else if (host_isa_ == x64::avx2) { @@ -1368,7 +1548,8 @@ void jit_logical_xor_emitter::emit_impl(const std::vector& in_vec_idxs, } template -void jit_logical_xor_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_logical_xor_emitter::emit_isa(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { using Vmm = typename conditional3::type; Vmm vmm_src0 = Vmm(in_vec_idxs[0]); Vmm vmm_src1 = Vmm(in_vec_idxs[1]); @@ -1437,13 +1618,17 @@ jit_logical_not_emitter::jit_logical_not_emitter(x64::jit_generator* host, prepare_table(); } -size_t jit_logical_not_emitter::get_inputs_num() const { return 1; } +size_t jit_logical_not_emitter::get_inputs_num() const { + return 1; +} -std::set> jit_logical_not_emitter::get_supported_precisions(const std::shared_ptr& node) { +std::set> jit_logical_not_emitter::get_supported_precisions( + const std::shared_ptr& node) { return {{element::f32}}; } -void jit_logical_not_emitter::emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const { +void jit_logical_not_emitter::emit_impl(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { if (host_isa_ == x64::sse41) { emit_isa(in_vec_idxs, out_vec_idxs); } else if (host_isa_ == x64::avx2) { @@ -1456,7 +1641,8 @@ void jit_logical_not_emitter::emit_impl(const std::vector& in_vec_idxs, } template -void jit_logical_not_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_logical_not_emitter::emit_isa(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { using Vmm = typename conditional3::type; Vmm vmm_src0 = Vmm(in_vec_idxs[0]); Vmm vmm_dst = Vmm(out_vec_idxs[0]); @@ -1507,20 +1693,30 @@ jit_power_static_emitter::jit_power_static_emitter(x64::jit_generator* host, prepare_table(); } -jit_power_static_emitter::jit_power_static_emitter(x64::jit_generator *host, x64::cpu_isa_t host_isa, - float inpPower, float inpScale, float inpShift, 
+jit_power_static_emitter::jit_power_static_emitter(x64::jit_generator* host, + x64::cpu_isa_t host_isa, + float inpPower, + float inpScale, + float inpShift, ov::element::Type exec_prc) -: jit_emitter(host, host_isa, exec_prc), power(inpPower), scale(inpScale), shift(inpShift) { + : jit_emitter(host, host_isa, exec_prc), + power(inpPower), + scale(inpScale), + shift(inpShift) { prepare_table(); } -size_t jit_power_static_emitter::get_inputs_num() const { return 1; } +size_t jit_power_static_emitter::get_inputs_num() const { + return 1; +} -std::set> jit_power_static_emitter::get_supported_precisions(const std::shared_ptr& node) { +std::set> jit_power_static_emitter::get_supported_precisions( + const std::shared_ptr& node) { return {{element::f32}}; } -void jit_power_static_emitter::emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const { +void jit_power_static_emitter::emit_impl(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { if (host_isa_ == x64::sse41) { emit_isa(in_vec_idxs, out_vec_idxs); } else if (host_isa_ == x64::avx2) { @@ -1533,7 +1729,8 @@ void jit_power_static_emitter::emit_impl(const std::vector& in_vec_idxs, } template -void jit_power_static_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_power_static_emitter::emit_isa(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { using Vmm = typename conditional3::type; Vmm vmm_src0 = Vmm(in_vec_idxs[0]); Vmm vmm_dst = Vmm(out_vec_idxs[0]); @@ -1600,8 +1797,8 @@ void jit_power_static_emitter::emit_isa(const std::vector &in_vec_idxs, // caller obligation to save gprs as callee may use them size_t gpr_size = 8; - Xbyak::Operand gprs_to_save[] = {h->r8, h->r9, h->r10, h->r11, h->rax, - h->rcx, h->rdx, h->rdi, h->rsi, h->rbp, h->rbx}; + Xbyak::Operand gprs_to_save[] = + {h->r8, h->r9, h->r10, h->r11, h->rax, h->rcx, h->rdx, h->rdi, h->rsi, h->rbp, h->rbx}; size_t n_gprs_to_save = sizeof(gprs_to_save) / sizeof(gprs_to_save[0]); h->sub(h->rsp, n_gprs_to_save * gpr_size); @@ -1630,8 +1827,8 @@ void jit_power_static_emitter::emit_isa(const std::vector &in_vec_idxs, h->sub(h->rsp, (get_max_vecs_count() + 2) * get_vec_length()); for (size_t i = 2; i < get_max_vecs_count() + 2; ++i) h->uni_vmovups(h->ptr[h->rsp + i * get_vec_length()], Vmm(i - 2)); - h->uni_vmovups(h->ptr[h->rsp + 0 * get_vec_length()], vmm_dst); // src - h->uni_vmovups(h->ptr[h->rsp + 1 * get_vec_length()], vmm_aux0); // beta + h->uni_vmovups(h->ptr[h->rsp + 0 * get_vec_length()], vmm_dst); // src + h->uni_vmovups(h->ptr[h->rsp + 1 * get_vec_length()], vmm_aux0); // beta // save function address in gpr to pass in in call instruction h->mov(h->rbp, reinterpret_cast(powf)); @@ -1643,7 +1840,7 @@ void jit_power_static_emitter::emit_isa(const std::vector &in_vec_idxs, // Take src, apply powf on it and replace value on a stack with dst. 
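
A scalar model of what the emitted loop below computes, assuming the standard x86-64 ABI for powf — which is why every GPR and all vector registers are spilled around the call; the helper name and signature here are illustrative:

// Scalar model of the per-lane powf fallback: each lane is reloaded from the stack,
// passed to powf in xmm0/xmm1, and the result written back in place.
#include <cmath>
#include <cstddef>

void pow_lanes_ref(float* dst, const float* src, const float* power, std::size_t lanes) {
    for (std::size_t i = 0; i < lanes; ++i)
        dst[i] = std::pow(src[i], power[i]);  // one libm call per lane, as in the generated loop
}

Routing through libm trades speed for generality: one call per lane, but correct for arbitrary exponents.
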
for (size_t i = 0; i < get_vec_length() / sizeof(float); ++i) { - const Address &source = h->ptr[h->rsp + h->rbx + i * sizeof(float)]; + const Address& source = h->ptr[h->rsp + h->rbx + i * sizeof(float)]; h->uni_vmovss(xmm0, source); h->uni_vmovss(xmm1, h->ptr[h->rsp + h->rbx + get_vec_length() + i * sizeof(float)]); h->call(h->rbp); @@ -1680,7 +1877,7 @@ void jit_power_static_emitter::register_table_entries() { push_arg_entry_of("power", x64::float2int(power), true); push_arg_entry_of("scale", x64::float2int(scale), true); push_arg_entry_of("shift", x64::float2int(shift), true); - push_arg_entry_of("one", x64::float2int(1.f), true); + push_arg_entry_of("one", x64::float2int(1.f), true); } size_t jit_power_static_emitter::aux_vecs_count() const { @@ -1699,13 +1896,17 @@ jit_prelu_emitter::jit_prelu_emitter(x64::jit_generator* host, x64::cpu_isa_t ho : jit_emitter(host, host_isa, exec_prc) { prepare_table(); } -size_t jit_prelu_emitter::get_inputs_num() const { return 2; } +size_t jit_prelu_emitter::get_inputs_num() const { + return 2; +} -std::set> jit_prelu_emitter::get_supported_precisions(const std::shared_ptr& node) { +std::set> jit_prelu_emitter::get_supported_precisions( + const std::shared_ptr& node) { return {{element::f32, element::f32}}; } -void jit_prelu_emitter::emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const { +void jit_prelu_emitter::emit_impl(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { if (host_isa_ == x64::sse41) { emit_isa(in_vec_idxs, out_vec_idxs); } else if (host_isa_ == x64::avx2) { @@ -1718,7 +1919,8 @@ void jit_prelu_emitter::emit_impl(const std::vector& in_vec_idxs, const } template -void jit_prelu_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_prelu_emitter::emit_isa(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { using Vmm = typename conditional3::type; Vmm vmm_src0 = Vmm(in_vec_idxs[0]); Vmm vmm_src1 = Vmm(in_vec_idxs[1]); @@ -1761,13 +1963,16 @@ jit_sqrt_emitter::jit_sqrt_emitter(x64::jit_generator* host, jit_sqrt_emitter::jit_sqrt_emitter(x64::jit_generator* host, x64::cpu_isa_t host_isa, ov::element::Type exec_prc) : jit_emitter(host, host_isa, exec_prc) {} -size_t jit_sqrt_emitter::get_inputs_num() const { return 1; } +size_t jit_sqrt_emitter::get_inputs_num() const { + return 1; +} std::set> jit_sqrt_emitter::get_supported_precisions(const std::shared_ptr& node) { return {{element::f32}}; } -void jit_sqrt_emitter::emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const { +void jit_sqrt_emitter::emit_impl(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { if (host_isa_ == x64::sse41) { emit_isa(in_vec_idxs, out_vec_idxs); } else if (host_isa_ == x64::avx2) { @@ -1780,12 +1985,12 @@ void jit_sqrt_emitter::emit_impl(const std::vector& in_vec_idxs, const s } template -void jit_sqrt_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_sqrt_emitter::emit_isa(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const { using Vmm = typename conditional3::type; Vmm vmm_src0 = Vmm(in_vec_idxs[0]); Vmm vmm_dst = Vmm(out_vec_idxs[0]); - h->uni_vsqrtps(vmm_dst, vmm_src0); + h->uni_vsqrtps(vmm_dst, vmm_src0); } /// Negate /// @@ -1795,13 +2000,17 @@ jit_negative_emitter::jit_negative_emitter(x64::jit_generator* host, ov::element::Type exec_prc) : jit_emitter(host, host_isa, exec_prc) {} -size_t 
jit_negative_emitter::get_inputs_num() const { return 1; } +size_t jit_negative_emitter::get_inputs_num() const { + return 1; +} -std::set> jit_negative_emitter::get_supported_precisions(const std::shared_ptr& node) { +std::set> jit_negative_emitter::get_supported_precisions( + const std::shared_ptr& node) { return {{element::f32}}; } -void jit_negative_emitter::emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const { +void jit_negative_emitter::emit_impl(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { if (host_isa_ == x64::sse41) { emit_isa(in_vec_idxs, out_vec_idxs); } else if (host_isa_ == x64::avx2) { @@ -1814,33 +2023,38 @@ void jit_negative_emitter::emit_impl(const std::vector& in_vec_idxs, con } template -void jit_negative_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_negative_emitter::emit_isa(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { using Vmm = typename conditional3::type; Vmm vmm_src = Vmm(in_vec_idxs[0]); - Vmm vmm_dst = Vmm(out_vec_idxs[0]); + Vmm vmm_dst = Vmm(out_vec_idxs[0]); h->uni_vpxor(vmm_dst, vmm_dst, vmm_dst); h->uni_vsubps(vmm_dst, vmm_dst, vmm_src); } - /// EXP /// jit_exp_emitter::jit_exp_emitter(x64::jit_generator* host, x64::cpu_isa_t host_isa, ov::element::Type exec_prc) : jit_emitter(host, host_isa, exec_prc) { prepare_table(); } -jit_exp_emitter::jit_exp_emitter(x64::jit_generator* host, x64::cpu_isa_t host_isa, const std::shared_ptr& node, ov::element::Type exec_prc) +jit_exp_emitter::jit_exp_emitter(x64::jit_generator* host, + x64::cpu_isa_t host_isa, + const std::shared_ptr& node, + ov::element::Type exec_prc) : jit_emitter(host, host_isa, exec_prc) { prepare_table(); } -size_t jit_exp_emitter::get_inputs_num() const { return 1; } +size_t jit_exp_emitter::get_inputs_num() const { + return 1; +} std::set> jit_exp_emitter::get_supported_precisions(const std::shared_ptr& node) { return {{element::f32}}; } -void jit_exp_emitter::emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_exp_emitter::emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const { if (host_isa_ == x64::sse41) { emit_isa(in_vec_idxs, out_vec_idxs); } else if (host_isa_ == x64::avx2) { @@ -1853,7 +2067,7 @@ void jit_exp_emitter::emit_impl(const std::vector &in_vec_idxs, const st } template -void jit_exp_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_exp_emitter::emit_isa(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const { using Vmm = typename conditional3::type; Vmm vmm_src = Vmm(in_vec_idxs[0]); Vmm vmm_dst = Vmm(out_vec_idxs[0]); @@ -1862,7 +2076,7 @@ void jit_exp_emitter::emit_isa(const std::vector &in_vec_idxs, const std Vmm vmm_aux0 = Vmm(aux_vec_idxs[0 + static_cast(need_vmm_mask())]); Vmm vmm_aux1 = Vmm(aux_vec_idxs[1 + static_cast(need_vmm_mask())]); - auto compute_cmp_mask = [&](const Vmm &vmm_src, const Xbyak::Operand &compare_operand, int cmp_predicate) { + auto compute_cmp_mask = [&](const Vmm& vmm_src, const Xbyak::Operand& compare_operand, int cmp_predicate) { if (host_isa_ == x64::avx512_core) { h->vcmpps(k_mask, vmm_src, compare_operand, cmp_predicate); } else { @@ -1870,7 +2084,7 @@ void jit_exp_emitter::emit_isa(const std::vector &in_vec_idxs, const std } }; - auto blend_with_mask = [&](const Vmm &vmm_dst, const Xbyak::Operand &src) { + auto blend_with_mask = [&](const Vmm& vmm_dst, const 
Xbyak::Operand& src) { if (host_isa_ == x64::avx512_core) { h->vblendmps(vmm_dst | k_mask, vmm_dst, src); } else { @@ -1924,11 +2138,11 @@ void jit_exp_emitter::emit_isa(const std::vector &in_vec_idxs, const std } void jit_exp_emitter::register_table_entries() { - push_arg_entry_of("pol1", 0x3f7ffffb, true); // p1 = 0.999999701f - push_arg_entry_of("pol2", 0x3efffee3, true); // p2 = 0.499991506f - push_arg_entry_of("pol3", 0x3e2aad40, true); // p3 = 0.166676521f - push_arg_entry_of("pol4", 0x3d2b9d0d, true); // p4 = 0.0418978221f - push_arg_entry_of("pol5", 0x3c07cfce, true); // p5 = 0.00828929059f + push_arg_entry_of("pol1", 0x3f7ffffb, true); // p1 = 0.999999701f + push_arg_entry_of("pol2", 0x3efffee3, true); // p2 = 0.499991506f + push_arg_entry_of("pol3", 0x3e2aad40, true); // p3 = 0.166676521f + push_arg_entry_of("pol4", 0x3d2b9d0d, true); // p4 = 0.0418978221f + push_arg_entry_of("pol5", 0x3c07cfce, true); // p5 = 0.00828929059f push_arg_entry_of("one", CONST_1_F, true); push_arg_entry_of("half", 0x3f000000, true); @@ -1950,16 +2164,21 @@ jit_erf_emitter::jit_erf_emitter(x64::jit_generator* host, x64::cpu_isa_t host_i prepare_table(); } -jit_erf_emitter::jit_erf_emitter(x64::jit_generator* host, x64::cpu_isa_t host_isa, const std::shared_ptr& node, ov::element::Type exec_prc) +jit_erf_emitter::jit_erf_emitter(x64::jit_generator* host, + x64::cpu_isa_t host_isa, + const std::shared_ptr& node, + ov::element::Type exec_prc) : jit_erf_emitter(host, host_isa, exec_prc) {} -size_t jit_erf_emitter::get_inputs_num() const { return 1; } +size_t jit_erf_emitter::get_inputs_num() const { + return 1; +} std::set> jit_erf_emitter::get_supported_precisions(const std::shared_ptr& node) { return {{element::f32}}; } -void jit_erf_emitter::emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_erf_emitter::emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const { if (host_isa_ == x64::sse41) { emit_isa(in_vec_idxs, out_vec_idxs); } else if (host_isa_ == x64::avx2) { @@ -1972,7 +2191,7 @@ void jit_erf_emitter::emit_impl(const std::vector &in_vec_idxs, const st } template -void jit_erf_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_erf_emitter::emit_isa(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const { using Vmm = typename conditional3::type; Vmm vmm_src = Vmm(in_vec_idxs[0]); Vmm vmm_dst = Vmm(out_vec_idxs[0]); @@ -1991,8 +2210,11 @@ void jit_erf_emitter::emit_isa(const std::vector &in_vec_idxs, const std // pass the current `aux_vec_idxs` to `exp_emitter` excepting `vmm_aux3` auto exp_aux_vec_idxs = aux_vec_idxs; - exp_aux_vec_idxs.erase(std::find(exp_aux_vec_idxs.begin(), exp_aux_vec_idxs.end(), static_cast(vmm_aux3.getIdx()))); - m_exp_emitter->emit_code({static_cast(vmm_dst.getIdx())}, {static_cast(vmm_dst.getIdx())}, exp_aux_vec_idxs); + exp_aux_vec_idxs.erase( + std::find(exp_aux_vec_idxs.begin(), exp_aux_vec_idxs.end(), static_cast(vmm_aux3.getIdx()))); + m_exp_emitter->emit_code({static_cast(vmm_dst.getIdx())}, + {static_cast(vmm_dst.getIdx())}, + exp_aux_vec_idxs); h->uni_vxorps(vmm_dst, vmm_dst, table_val("sign_mask")); @@ -2027,16 +2249,16 @@ void jit_erf_emitter::emit_isa(const std::vector &in_vec_idxs, const std } void jit_erf_emitter::register_table_entries() { - push_arg_entry_of("approx_const", 0x3ea7ba05, true); // 0.3275911 + push_arg_entry_of("approx_const", 0x3ea7ba05, true); // 0.3275911 push_arg_entry_of("one", CONST_1_F, true); 
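// Editorial sketch, not part of the patch: a scalar reference for the exp approximation that the
// pol1..pol5 / half / one table entries above encode. It assumes the standard oneDNN-style scheme:
// write x = n*ln2 + r with n = floor(x*log2e + 0.5), then exp(x) = 2^n * (1 + r*P(r)), where P is the
// degree-5 polynomial with the tabulated coefficients. Overflow/underflow clamping and the avx512
// mask handling done in emit_isa are deliberately omitted here.
#include <cmath>
static float exp_reference(float x) {
    const float log2e = 1.442695040f;
    const float ln2   = 0.693147181f;
    float n = std::floor(x * log2e + 0.5f);                 // integer power of two
    float r = x - n * ln2;                                   // reduced argument, |r| <= ~ln2/2
    float p = 0.00828929059f;                                // pol5
    p = p * r + 0.0418978221f;                               // pol4
    p = p * r + 0.166676521f;                                // pol3
    p = p * r + 0.499991506f;                                // pol2
    p = p * r + 0.999999701f;                                // pol1
    return std::ldexp(p * r + 1.0f, static_cast<int>(n));    // (1 + r*P(r)) * 2^n
}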
push_arg_entry_of("sign_mask", 0x80000000, true); push_arg_entry_of("positive_mask", 0x7fffffff, true); - push_arg_entry_of("pol1", 0x3e827906, true); // p1 = 0.254829592f - push_arg_entry_of("pol2", 0xbe91a98e, true); // p2 = -0.284496736f - push_arg_entry_of("pol3", 0x3fb5f0e3, true); // p3 = 1.421413741f - push_arg_entry_of("pol4", 0xbfba00e3, true); // p4 = -1.453152027f - push_arg_entry_of("pol5", 0x3f87dc22, true); // p5 = 1.061405429f + push_arg_entry_of("pol1", 0x3e827906, true); // p1 = 0.254829592f + push_arg_entry_of("pol2", 0xbe91a98e, true); // p2 = -0.284496736f + push_arg_entry_of("pol3", 0x3fb5f0e3, true); // p3 = 1.421413741f + push_arg_entry_of("pol4", 0xbfba00e3, true); // p4 = -1.453152027f + push_arg_entry_of("pol5", 0x3f87dc22, true); // p5 = 1.061405429f } size_t jit_erf_emitter::aux_vecs_count() const { @@ -2063,13 +2285,17 @@ jit_soft_sign_emitter::jit_soft_sign_emitter(x64::jit_generator* host, prepare_table(); } -size_t jit_soft_sign_emitter::get_inputs_num() const { return 1; } +size_t jit_soft_sign_emitter::get_inputs_num() const { + return 1; +} -std::set> jit_soft_sign_emitter::get_supported_precisions(const std::shared_ptr& node) { +std::set> jit_soft_sign_emitter::get_supported_precisions( + const std::shared_ptr& node) { return {{element::f32}}; } -void jit_soft_sign_emitter::emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const { +void jit_soft_sign_emitter::emit_impl(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { if (host_isa_ == x64::sse41) { emit_isa(in_vec_idxs, out_vec_idxs); } else if (host_isa_ == x64::avx2) { @@ -2082,7 +2308,8 @@ void jit_soft_sign_emitter::emit_impl(const std::vector& in_vec_idxs, co } template -void jit_soft_sign_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_soft_sign_emitter::emit_isa(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { using Vmm = typename conditional3::type; Vmm vmm_src = Vmm(in_vec_idxs[0]); Vmm vmm_dst = Vmm(out_vec_idxs[0]); @@ -2100,10 +2327,11 @@ void jit_soft_sign_emitter::register_table_entries() { /// IS_FINITE /// template <> -void jit_is_finite_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_is_finite_emitter::emit_isa(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { auto vmm_src = Zmm(in_vec_idxs[0]); auto vmm_dst = Zmm(out_vec_idxs[0]); - auto &ones_mask = h->k1; + auto& ones_mask = h->k1; auto reg32_one = Reg32(aux_gpr_idxs[0]); h->mov(reg32_one, CONST_1_F); @@ -2113,13 +2341,14 @@ void jit_is_finite_emitter::emit_isa(const std::vector } template -void jit_is_finite_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_is_finite_emitter::emit_isa(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { using Vmm = typename conditional::type; auto vmm_src = Vmm(in_vec_idxs[0]); auto vmm_dst = Vmm(out_vec_idxs[0]); h->uni_vandps(vmm_src, vmm_src, table_val("inf")); - h->uni_vcmpps(vmm_src, vmm_src, table_val("inf"), 0B00000100); // NEq + h->uni_vcmpps(vmm_src, vmm_src, table_val("inf"), 0B00000100); // NEq if (isa == x64::avx2) { h->uni_vandps(vmm_dst, vmm_src, table_val("one")); @@ -2131,7 +2360,8 @@ void jit_is_finite_emitter::emit_isa(const std::vector &in_vec_idxs, con } } -void jit_is_finite_emitter::emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void 
jit_is_finite_emitter::emit_impl(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { if (host_isa_ == x64::avx512_core) { emit_isa(in_vec_idxs, out_vec_idxs); } else if (host_isa_ == x64::avx2) { @@ -2152,12 +2382,13 @@ void jit_is_finite_emitter::register_table_entries() { /// IS_INF /// template <> -void jit_is_inf_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_is_inf_emitter::emit_isa(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { Zmm vmm_src = Zmm(in_vec_idxs[0]); Zmm vmm_dst = Zmm(out_vec_idxs[0]); if (detect_negative || detect_positive) { - auto &ones_mask = h->k1; + auto& ones_mask = h->k1; auto reg32_one = Reg32(aux_gpr_idxs[0]); uint8_t imm = detect_negative ? 0B00010000 : 0B00000000; if (detect_positive) { @@ -2173,7 +2404,8 @@ void jit_is_inf_emitter::emit_isa(const std::vector &i } template -void jit_is_inf_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_is_inf_emitter::emit_isa(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { using Vmm = typename conditional::type; if (detect_negative || detect_positive) { @@ -2204,7 +2436,8 @@ void jit_is_inf_emitter::emit_isa(const std::vector &in_vec_idxs, const } } -void jit_is_inf_emitter::emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_is_inf_emitter::emit_impl(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { if (host_isa_ == x64::avx512_core) { emit_isa(in_vec_idxs, out_vec_idxs); } else if (host_isa_ == x64::avx2) { @@ -2226,10 +2459,11 @@ void jit_is_inf_emitter::register_table_entries() { /// IS_NAN /// template <> -void jit_is_nan_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_is_nan_emitter::emit_isa(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { auto vmm_src = Zmm(in_vec_idxs[0]); auto vmm_dst = Zmm(out_vec_idxs[0]); - auto &ones_mask = h->k1; + auto& ones_mask = h->k1; auto reg32_one = Reg32(aux_gpr_idxs[0]); h->mov(reg32_one, CONST_1_F); @@ -2238,7 +2472,8 @@ void jit_is_nan_emitter::emit_isa(const std::vector &i } template -void jit_is_nan_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_is_nan_emitter::emit_isa(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { using Vmm = typename conditional::type; auto vmm_src = Vmm(in_vec_idxs[0]); auto vmm_dst = Vmm(out_vec_idxs[0]); @@ -2254,7 +2489,8 @@ void jit_is_nan_emitter::emit_isa(const std::vector &in_vec_idxs, const } } -void jit_is_nan_emitter::emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_is_nan_emitter::emit_impl(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { if (host_isa_ == x64::avx512_core) { emit_isa(in_vec_idxs, out_vec_idxs); } else if (host_isa_ == x64::avx2) { @@ -2281,9 +2517,12 @@ jit_select_emitter::jit_select_emitter(x64::jit_generator* host, jit_select_emitter::jit_select_emitter(x64::jit_generator* host, x64::cpu_isa_t host_isa, ov::element::Type exec_prc) : jit_emitter(host, host_isa, exec_prc) {} -size_t jit_select_emitter::get_inputs_num() const { return 3; } +size_t jit_select_emitter::get_inputs_num() const { + return 3; +} -std::set> jit_select_emitter::get_supported_precisions(const std::shared_ptr& node) { +std::set> jit_select_emitter::get_supported_precisions( + const 
std::shared_ptr& node) { return {{element::f32, element::f32, element::f32}}; } @@ -2292,11 +2531,12 @@ size_t jit_select_emitter::aux_vecs_count() const { return 0; else if (host_isa_ == x64::avx2) // tmp vec for mask return 1; - else // mask should be xmm0 on sse41 + tmp vec for mask + else // mask should be xmm0 on sse41 + tmp vec for mask return 2; } -void jit_select_emitter::emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_select_emitter::emit_impl(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { if (host_isa_ == x64::sse41) { emit_isa(in_vec_idxs, out_vec_idxs); } else if (host_isa_ == x64::avx2) { @@ -2309,7 +2549,8 @@ void jit_select_emitter::emit_impl(const std::vector &in_vec_idxs, const } template -void jit_select_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_select_emitter::emit_isa(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { using Vmm = typename conditional3::type; Vmm vmm_cond = Vmm(in_vec_idxs[0]); Vmm vmm_src0 = Vmm(in_vec_idxs[1]); @@ -2346,20 +2587,22 @@ jit_bitwise_and_emitter::jit_bitwise_and_emitter(x64::jit_generator* host, ov::element::Type exec_prc) : jit_emitter(host, host_isa, exec_prc) {} -jit_bitwise_and_emitter::jit_bitwise_and_emitter(x64::jit_generator* host, x64::cpu_isa_t host_isa, ov::element::Type exec_prc) +jit_bitwise_and_emitter::jit_bitwise_and_emitter(x64::jit_generator* host, + x64::cpu_isa_t host_isa, + ov::element::Type exec_prc) : jit_emitter(host, host_isa, exec_prc) {} -size_t jit_bitwise_and_emitter::get_inputs_num() const { return 2; } +size_t jit_bitwise_and_emitter::get_inputs_num() const { + return 2; +} -std::set> jit_bitwise_and_emitter::get_supported_precisions(const std::shared_ptr& node) { - return { - {element::i8, element::i8}, - {element::u8, element::u8}, - {element::i32, element::i32} - }; +std::set> jit_bitwise_and_emitter::get_supported_precisions( + const std::shared_ptr& node) { + return {{element::i8, element::i8}, {element::u8, element::u8}, {element::i32, element::i32}}; } -void jit_bitwise_and_emitter::emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const { +void jit_bitwise_and_emitter::emit_impl(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { if (host_isa_ == x64::sse41) { emit_isa(in_vec_idxs, out_vec_idxs); } else if (host_isa_ == x64::avx2) { @@ -2372,7 +2615,8 @@ void jit_bitwise_and_emitter::emit_impl(const std::vector& in_vec_idxs, } template -void jit_bitwise_and_emitter::emit_isa(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const { +void jit_bitwise_and_emitter::emit_isa(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { using Vmm = typename conditional3::type; Vmm vmm_src0 = Vmm(in_vec_idxs[0]); Vmm vmm_src1 = Vmm(in_vec_idxs[1]); @@ -2399,24 +2643,28 @@ jit_bitwise_not_emitter::jit_bitwise_not_emitter(x64::jit_generator* host, prepare_table(); } -jit_bitwise_not_emitter::jit_bitwise_not_emitter(x64::jit_generator* host, x64::cpu_isa_t host_isa, ov::element::Type exec_prc) +jit_bitwise_not_emitter::jit_bitwise_not_emitter(x64::jit_generator* host, + x64::cpu_isa_t host_isa, + ov::element::Type exec_prc) : jit_emitter(host, host_isa, exec_prc) { prepare_table(); } -size_t jit_bitwise_not_emitter::get_inputs_num() const { return 1; } +size_t jit_bitwise_not_emitter::get_inputs_num() const { + return 1; +} -std::set> 
jit_bitwise_not_emitter::get_supported_precisions(const std::shared_ptr& node) { - return { - {element::i8}, - {element::u8}, - {element::i32} - }; +std::set> jit_bitwise_not_emitter::get_supported_precisions( + const std::shared_ptr& node) { + return {{element::i8}, {element::u8}, {element::i32}}; } -size_t jit_bitwise_not_emitter::aux_vecs_count() const { return 1; } +size_t jit_bitwise_not_emitter::aux_vecs_count() const { + return 1; +} -void jit_bitwise_not_emitter::emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const { +void jit_bitwise_not_emitter::emit_impl(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { if (host_isa_ == x64::sse41) { emit_isa(in_vec_idxs, out_vec_idxs); } else if (host_isa_ == x64::avx2) { @@ -2429,7 +2677,8 @@ void jit_bitwise_not_emitter::emit_impl(const std::vector& in_vec_idxs, } template -void jit_bitwise_not_emitter::emit_isa(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const { +void jit_bitwise_not_emitter::emit_isa(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { using Vmm = typename conditional3::type; Vmm vmm_src = Vmm(in_vec_idxs[0]); Vmm vmm_dst = Vmm(out_vec_idxs[0]); @@ -2457,20 +2706,22 @@ jit_bitwise_or_emitter::jit_bitwise_or_emitter(x64::jit_generator* host, ov::element::Type exec_prc) : jit_emitter(host, host_isa, exec_prc) {} -jit_bitwise_or_emitter::jit_bitwise_or_emitter(x64::jit_generator* host, x64::cpu_isa_t host_isa, ov::element::Type exec_prc) +jit_bitwise_or_emitter::jit_bitwise_or_emitter(x64::jit_generator* host, + x64::cpu_isa_t host_isa, + ov::element::Type exec_prc) : jit_emitter(host, host_isa, exec_prc) {} -size_t jit_bitwise_or_emitter::get_inputs_num() const { return 2; } +size_t jit_bitwise_or_emitter::get_inputs_num() const { + return 2; +} -std::set> jit_bitwise_or_emitter::get_supported_precisions(const std::shared_ptr& node) { - return { - {element::i8, element::i8}, - {element::u8, element::u8}, - {element::i32, element::i32} - }; +std::set> jit_bitwise_or_emitter::get_supported_precisions( + const std::shared_ptr& node) { + return {{element::i8, element::i8}, {element::u8, element::u8}, {element::i32, element::i32}}; } -void jit_bitwise_or_emitter::emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const { +void jit_bitwise_or_emitter::emit_impl(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { if (host_isa_ == x64::sse41) { emit_isa(in_vec_idxs, out_vec_idxs); } else if (host_isa_ == x64::avx2) { @@ -2483,7 +2734,8 @@ void jit_bitwise_or_emitter::emit_impl(const std::vector& in_vec_idxs, c } template -void jit_bitwise_or_emitter::emit_isa(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const { +void jit_bitwise_or_emitter::emit_isa(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { using Vmm = typename conditional3::type; Vmm vmm_src0 = Vmm(in_vec_idxs[0]); Vmm vmm_src1 = Vmm(in_vec_idxs[1]); @@ -2508,20 +2760,22 @@ jit_bitwise_xor_emitter::jit_bitwise_xor_emitter(x64::jit_generator* host, ov::element::Type exec_prc) : jit_emitter(host, host_isa, exec_prc) {} -jit_bitwise_xor_emitter::jit_bitwise_xor_emitter(x64::jit_generator* host, x64::cpu_isa_t host_isa, ov::element::Type exec_prc) +jit_bitwise_xor_emitter::jit_bitwise_xor_emitter(x64::jit_generator* host, + x64::cpu_isa_t host_isa, + ov::element::Type exec_prc) : jit_emitter(host, host_isa, exec_prc) {} -size_t jit_bitwise_xor_emitter::get_inputs_num() const { return 2; } 
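// Editorial sketch, not part of the patch: the element-wise semantics behind the bitwise emitters in
// this block. The supported-precision sets above ({i8,i8}, {u8,u8}, {i32,i32}) can all share the same
// full-width vector instructions because AND/OR/XOR act on raw bit patterns regardless of element
// width; NOT is conventionally realized as XOR with an all-ones constant (x86 SIMD has no vector NOT),
// which is an assumption here since the emitted instruction falls outside this hunk.
#include <cstdint>
static inline int32_t bitwise_and_ref(int32_t a, int32_t b) { return a & b; }
static inline int32_t bitwise_or_ref(int32_t a, int32_t b)  { return a | b; }
static inline int32_t bitwise_xor_ref(int32_t a, int32_t b) { return a ^ b; }
static inline int32_t bitwise_not_ref(int32_t a)            { return a ^ ~int32_t{0}; }  // == ~a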
+size_t jit_bitwise_xor_emitter::get_inputs_num() const { + return 2; +} -std::set> jit_bitwise_xor_emitter::get_supported_precisions(const std::shared_ptr& node) { - return { - {element::i8, element::i8}, - {element::u8, element::u8}, - {element::i32, element::i32} - }; +std::set> jit_bitwise_xor_emitter::get_supported_precisions( + const std::shared_ptr& node) { + return {{element::i8, element::i8}, {element::u8, element::u8}, {element::i32, element::i32}}; } -void jit_bitwise_xor_emitter::emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const { +void jit_bitwise_xor_emitter::emit_impl(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { if (host_isa_ == x64::sse41) { emit_isa(in_vec_idxs, out_vec_idxs); } else if (host_isa_ == x64::avx2) { @@ -2534,7 +2788,8 @@ void jit_bitwise_xor_emitter::emit_impl(const std::vector& in_vec_idxs, } template -void jit_bitwise_xor_emitter::emit_isa(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const { +void jit_bitwise_xor_emitter::emit_isa(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { using Vmm = typename conditional3::type; Vmm vmm_src0 = Vmm(in_vec_idxs[0]); Vmm vmm_src1 = Vmm(in_vec_idxs[1]); @@ -2543,5 +2798,5 @@ void jit_bitwise_xor_emitter::emit_isa(const std::vector& in_vec_idxs, c h->uni_vxorps(vmm_dst, vmm_src0, vmm_src1); } -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/emitters/plugin/x64/jit_eltwise_emitters.hpp b/src/plugins/intel_cpu/src/emitters/plugin/x64/jit_eltwise_emitters.hpp index c8c4b06d6f3347..84c65d44a12280 100644 --- a/src/plugins/intel_cpu/src/emitters/plugin/x64/jit_eltwise_emitters.hpp +++ b/src/plugins/intel_cpu/src/emitters/plugin/x64/jit_eltwise_emitters.hpp @@ -11,418 +11,488 @@ namespace intel_cpu { class jit_add_emitter : public jit_emitter { public: - jit_add_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, + jit_add_emitter(dnnl::impl::cpu::x64::jit_generator* host, + dnnl::impl::cpu::x64::cpu_isa_t host_isa, ov::element::Type exec_prc = ov::element::f32); - jit_add_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n); + jit_add_emitter(dnnl::impl::cpu::x64::jit_generator* host, + dnnl::impl::cpu::x64::cpu_isa_t host_isa, + const std::shared_ptr& n); size_t get_inputs_num() const override; - static std::set> get_supported_precisions(const std::shared_ptr& node = nullptr); + static std::set> get_supported_precisions( + const std::shared_ptr& node = nullptr); private: - void emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const override; + void emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const override; template - void emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const; + void emit_isa(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const; }; class jit_mul_add_emitter : public jit_emitter { public: - jit_mul_add_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, + jit_mul_add_emitter(dnnl::impl::cpu::x64::jit_generator* host, + dnnl::impl::cpu::x64::cpu_isa_t host_isa, ov::element::Type exec_prc = ov::element::f32); - jit_mul_add_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n); + 
jit_mul_add_emitter(dnnl::impl::cpu::x64::jit_generator* host, + dnnl::impl::cpu::x64::cpu_isa_t host_isa, + const std::shared_ptr& n); size_t get_inputs_num() const override; - static std::set> get_supported_precisions(const std::shared_ptr& node = nullptr); + static std::set> get_supported_precisions( + const std::shared_ptr& node = nullptr); private: - void emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const override; + void emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const override; template - void emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const; + void emit_isa(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const; size_t aux_vecs_count() const override; }; - class jit_subtract_emitter : public jit_emitter { public: - jit_subtract_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, + jit_subtract_emitter(dnnl::impl::cpu::x64::jit_generator* host, + dnnl::impl::cpu::x64::cpu_isa_t host_isa, ov::element::Type exec_prc = ov::element::f32); - jit_subtract_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n); + jit_subtract_emitter(dnnl::impl::cpu::x64::jit_generator* host, + dnnl::impl::cpu::x64::cpu_isa_t host_isa, + const std::shared_ptr& n); size_t get_inputs_num() const override; - static std::set> get_supported_precisions(const std::shared_ptr& node = nullptr); + static std::set> get_supported_precisions( + const std::shared_ptr& node = nullptr); private: - void emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const override; + void emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const override; template - void emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const; + void emit_isa(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const; }; - class jit_multiply_emitter : public jit_emitter { public: - jit_multiply_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, + jit_multiply_emitter(dnnl::impl::cpu::x64::jit_generator* host, + dnnl::impl::cpu::x64::cpu_isa_t host_isa, ov::element::Type exec_prc = ov::element::f32); - jit_multiply_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n); + jit_multiply_emitter(dnnl::impl::cpu::x64::jit_generator* host, + dnnl::impl::cpu::x64::cpu_isa_t host_isa, + const std::shared_ptr& n); size_t get_inputs_num() const override; - static std::set> get_supported_precisions(const std::shared_ptr& node = nullptr); + static std::set> get_supported_precisions( + const std::shared_ptr& node = nullptr); private: - void emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const override; + void emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const override; template - void emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const; + void emit_isa(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const; }; - class jit_divide_emitter : public jit_emitter { public: - jit_divide_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, + jit_divide_emitter(dnnl::impl::cpu::x64::jit_generator* host, + dnnl::impl::cpu::x64::cpu_isa_t host_isa, ov::element::Type exec_prc = ov::element::f32); - 
jit_divide_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n, + jit_divide_emitter(dnnl::impl::cpu::x64::jit_generator* host, + dnnl::impl::cpu::x64::cpu_isa_t host_isa, + const std::shared_ptr& n, ov::element::Type exec_prc = ov::element::f32); size_t get_inputs_num() const override; - static std::set> get_supported_precisions(const std::shared_ptr& node = nullptr); + static std::set> get_supported_precisions( + const std::shared_ptr& node = nullptr); private: - void emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const override; + void emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const override; template - void emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const; + void emit_isa(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const; size_t aux_vecs_count() const override; }; class jit_floor_emitter : public jit_emitter { public: - jit_floor_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, + jit_floor_emitter(dnnl::impl::cpu::x64::jit_generator* host, + dnnl::impl::cpu::x64::cpu_isa_t host_isa, ov::element::Type exec_prc = ov::element::f32); - jit_floor_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n, + jit_floor_emitter(dnnl::impl::cpu::x64::jit_generator* host, + dnnl::impl::cpu::x64::cpu_isa_t host_isa, + const std::shared_ptr& n, ov::element::Type exec_prc = ov::element::f32); size_t get_inputs_num() const override; - static std::set> get_supported_precisions(const std::shared_ptr& node = nullptr); + static std::set> get_supported_precisions( + const std::shared_ptr& node = nullptr); private: - void emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const override; + void emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const override; template - void emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const; + void emit_isa(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const; }; class jit_ceiling_emitter : public jit_emitter { public: - jit_ceiling_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, - ov::element::Type exec_prc = ov::element::f32); - jit_ceiling_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n, - ov::element::Type exec_prc = ov::element::f32); + jit_ceiling_emitter(dnnl::impl::cpu::x64::jit_generator* host, + dnnl::impl::cpu::x64::cpu_isa_t host_isa, + ov::element::Type exec_prc = ov::element::f32); + jit_ceiling_emitter(dnnl::impl::cpu::x64::jit_generator* host, + dnnl::impl::cpu::x64::cpu_isa_t host_isa, + const std::shared_ptr& n, + ov::element::Type exec_prc = ov::element::f32); size_t get_inputs_num() const override; - static std::set> get_supported_precisions(const std::shared_ptr& node = nullptr); + static std::set> get_supported_precisions( + const std::shared_ptr& node = nullptr); private: - void emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const override; + void emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const override; template - void emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const; + void emit_isa(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const; }; class 
jit_floor_mod_emitter : public jit_emitter { public: - jit_floor_mod_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, + jit_floor_mod_emitter(dnnl::impl::cpu::x64::jit_generator* host, + dnnl::impl::cpu::x64::cpu_isa_t host_isa, ov::element::Type exec_prc = ov::element::f32); - jit_floor_mod_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n, + jit_floor_mod_emitter(dnnl::impl::cpu::x64::jit_generator* host, + dnnl::impl::cpu::x64::cpu_isa_t host_isa, + const std::shared_ptr& n, ov::element::Type exec_prc = ov::element::f32); size_t get_inputs_num() const override; - static std::set> get_supported_precisions(const std::shared_ptr& node = nullptr); + static std::set> get_supported_precisions( + const std::shared_ptr& node = nullptr); private: - void emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const override; + void emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const override; template - void emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const; + void emit_isa(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const; size_t aux_vecs_count() const override; }; - class jit_mod_emitter : public jit_emitter { public: - jit_mod_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, + jit_mod_emitter(dnnl::impl::cpu::x64::jit_generator* host, + dnnl::impl::cpu::x64::cpu_isa_t host_isa, ov::element::Type exec_prc = ov::element::f32); - jit_mod_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n, + jit_mod_emitter(dnnl::impl::cpu::x64::jit_generator* host, + dnnl::impl::cpu::x64::cpu_isa_t host_isa, + const std::shared_ptr& n, ov::element::Type exec_prc = ov::element::f32); size_t get_inputs_num() const override; - static std::set> get_supported_precisions(const std::shared_ptr& node = nullptr); + static std::set> get_supported_precisions( + const std::shared_ptr& node = nullptr); private: - void emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const override; + void emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const override; template - void emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const; + void emit_isa(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const; size_t aux_vecs_count() const override; }; - class jit_maximum_emitter : public jit_emitter { public: - jit_maximum_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, + jit_maximum_emitter(dnnl::impl::cpu::x64::jit_generator* host, + dnnl::impl::cpu::x64::cpu_isa_t host_isa, ov::element::Type exec_prc = ov::element::f32); - jit_maximum_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n); + jit_maximum_emitter(dnnl::impl::cpu::x64::jit_generator* host, + dnnl::impl::cpu::x64::cpu_isa_t host_isa, + const std::shared_ptr& n); size_t get_inputs_num() const override; - static std::set> get_supported_precisions(const std::shared_ptr& node = nullptr); + static std::set> get_supported_precisions( + const std::shared_ptr& node = nullptr); private: - void emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const override; + void emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const 
override; template - void emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const; + void emit_isa(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const; }; - class jit_minimum_emitter : public jit_emitter { public: - jit_minimum_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, + jit_minimum_emitter(dnnl::impl::cpu::x64::jit_generator* host, + dnnl::impl::cpu::x64::cpu_isa_t host_isa, ov::element::Type exec_prc = ov::element::f32); - jit_minimum_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n); + jit_minimum_emitter(dnnl::impl::cpu::x64::jit_generator* host, + dnnl::impl::cpu::x64::cpu_isa_t host_isa, + const std::shared_ptr& n); size_t get_inputs_num() const override; - static std::set> get_supported_precisions(const std::shared_ptr& node = nullptr); + static std::set> get_supported_precisions( + const std::shared_ptr& node = nullptr); private: - void emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const override; + void emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const override; template - void emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const; + void emit_isa(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const; }; - class jit_squared_difference_emitter : public jit_emitter { public: - jit_squared_difference_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, + jit_squared_difference_emitter(dnnl::impl::cpu::x64::jit_generator* host, + dnnl::impl::cpu::x64::cpu_isa_t host_isa, ov::element::Type exec_prc = ov::element::f32); - jit_squared_difference_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, + jit_squared_difference_emitter(dnnl::impl::cpu::x64::jit_generator* host, + dnnl::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n, ov::element::Type exec_prc = ov::element::f32); size_t get_inputs_num() const override; - static std::set> get_supported_precisions(const std::shared_ptr& node = nullptr); + static std::set> get_supported_precisions( + const std::shared_ptr& node = nullptr); private: - void emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const override; + void emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const override; template - void emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const; + void emit_isa(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const; }; - class jit_power_dynamic_emitter : public jit_emitter { public: - jit_power_dynamic_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, + jit_power_dynamic_emitter(dnnl::impl::cpu::x64::jit_generator* host, + dnnl::impl::cpu::x64::cpu_isa_t host_isa, ov::element::Type exec_prc = ov::element::f32); - jit_power_dynamic_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n, + jit_power_dynamic_emitter(dnnl::impl::cpu::x64::jit_generator* host, + dnnl::impl::cpu::x64::cpu_isa_t host_isa, + const std::shared_ptr& n, ov::element::Type exec_prc = ov::element::f32); size_t get_inputs_num() const override; - static std::set> get_supported_precisions(const std::shared_ptr& node = nullptr); + static std::set> get_supported_precisions( + const std::shared_ptr& node = nullptr); 
private: - void emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const override; + void emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const override; template - void emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const; + void emit_isa(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const; }; - class jit_equal_emitter : public jit_emitter { public: - jit_equal_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, + jit_equal_emitter(dnnl::impl::cpu::x64::jit_generator* host, + dnnl::impl::cpu::x64::cpu_isa_t host_isa, ov::element::Type exec_prc = ov::element::f32); - jit_equal_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n, + jit_equal_emitter(dnnl::impl::cpu::x64::jit_generator* host, + dnnl::impl::cpu::x64::cpu_isa_t host_isa, + const std::shared_ptr& n, ov::element::Type exec_prc = ov::element::f32); size_t get_inputs_num() const override; - static std::set> get_supported_precisions(const std::shared_ptr& node = nullptr); + static std::set> get_supported_precisions( + const std::shared_ptr& node = nullptr); private: - void emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const override; + void emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const override; template - void emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const; + void emit_isa(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const; void register_table_entries() override; size_t aux_vecs_count() const override; }; - class jit_not_equal_emitter : public jit_emitter { public: - jit_not_equal_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, + jit_not_equal_emitter(dnnl::impl::cpu::x64::jit_generator* host, + dnnl::impl::cpu::x64::cpu_isa_t host_isa, ov::element::Type exec_prc = ov::element::f32); - jit_not_equal_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n, + jit_not_equal_emitter(dnnl::impl::cpu::x64::jit_generator* host, + dnnl::impl::cpu::x64::cpu_isa_t host_isa, + const std::shared_ptr& n, ov::element::Type exec_prc = ov::element::f32); size_t get_inputs_num() const override; - static std::set> get_supported_precisions(const std::shared_ptr& node = nullptr); + static std::set> get_supported_precisions( + const std::shared_ptr& node = nullptr); private: - void emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const override; + void emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const override; template - void emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const; + void emit_isa(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const; void register_table_entries() override; size_t aux_vecs_count() const override; }; - class jit_greater_emitter : public jit_emitter { public: - jit_greater_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, + jit_greater_emitter(dnnl::impl::cpu::x64::jit_generator* host, + dnnl::impl::cpu::x64::cpu_isa_t host_isa, ov::element::Type exec_prc = ov::element::f32); - jit_greater_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n, + 
jit_greater_emitter(dnnl::impl::cpu::x64::jit_generator* host, + dnnl::impl::cpu::x64::cpu_isa_t host_isa, + const std::shared_ptr& n, ov::element::Type exec_prc = ov::element::f32); size_t get_inputs_num() const override; - static std::set> get_supported_precisions(const std::shared_ptr& node = nullptr); + static std::set> get_supported_precisions( + const std::shared_ptr& node = nullptr); private: - void emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const override; + void emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const override; template - void emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const; + void emit_isa(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const; void register_table_entries() override; size_t aux_vecs_count() const override; }; - class jit_greater_equal_emitter : public jit_emitter { public: - jit_greater_equal_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, + jit_greater_equal_emitter(dnnl::impl::cpu::x64::jit_generator* host, + dnnl::impl::cpu::x64::cpu_isa_t host_isa, ov::element::Type exec_prc = ov::element::f32); - jit_greater_equal_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n, + jit_greater_equal_emitter(dnnl::impl::cpu::x64::jit_generator* host, + dnnl::impl::cpu::x64::cpu_isa_t host_isa, + const std::shared_ptr& n, ov::element::Type exec_prc = ov::element::f32); size_t get_inputs_num() const override; - static std::set> get_supported_precisions(const std::shared_ptr& node = nullptr); + static std::set> get_supported_precisions( + const std::shared_ptr& node = nullptr); private: - void emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const override; + void emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const override; template - void emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const; + void emit_isa(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const; void register_table_entries() override; size_t aux_vecs_count() const override; }; - class jit_less_emitter : public jit_emitter { public: - jit_less_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, + jit_less_emitter(dnnl::impl::cpu::x64::jit_generator* host, + dnnl::impl::cpu::x64::cpu_isa_t host_isa, ov::element::Type exec_prc = ov::element::f32); - jit_less_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n, + jit_less_emitter(dnnl::impl::cpu::x64::jit_generator* host, + dnnl::impl::cpu::x64::cpu_isa_t host_isa, + const std::shared_ptr& n, ov::element::Type exec_prc = ov::element::f32); size_t get_inputs_num() const override; - static std::set> get_supported_precisions(const std::shared_ptr& node = nullptr); + static std::set> get_supported_precisions( + const std::shared_ptr& node = nullptr); private: - void emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const override; + void emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const override; template - void emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const; + void emit_isa(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const; void register_table_entries() override; size_t aux_vecs_count() const override; }; - class 
jit_less_equal_emitter : public jit_emitter { public: - jit_less_equal_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, + jit_less_equal_emitter(dnnl::impl::cpu::x64::jit_generator* host, + dnnl::impl::cpu::x64::cpu_isa_t host_isa, ov::element::Type exec_prc = ov::element::f32); - jit_less_equal_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n, + jit_less_equal_emitter(dnnl::impl::cpu::x64::jit_generator* host, + dnnl::impl::cpu::x64::cpu_isa_t host_isa, + const std::shared_ptr& n, ov::element::Type exec_prc = ov::element::f32); size_t get_inputs_num() const override; - static std::set> get_supported_precisions(const std::shared_ptr& node = nullptr); + static std::set> get_supported_precisions( + const std::shared_ptr& node = nullptr); private: - void emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const override; + void emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const override; template - void emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const; + void emit_isa(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const; void register_table_entries() override; size_t aux_vecs_count() const override; }; - class jit_logical_and_emitter : public jit_emitter { public: - jit_logical_and_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, + jit_logical_and_emitter(dnnl::impl::cpu::x64::jit_generator* host, + dnnl::impl::cpu::x64::cpu_isa_t host_isa, ov::element::Type exec_prc = ov::element::f32); - jit_logical_and_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n, + jit_logical_and_emitter(dnnl::impl::cpu::x64::jit_generator* host, + dnnl::impl::cpu::x64::cpu_isa_t host_isa, + const std::shared_ptr& n, ov::element::Type exec_prc = ov::element::f32); size_t get_inputs_num() const override; - static std::set> get_supported_precisions(const std::shared_ptr& node = nullptr); + static std::set> get_supported_precisions( + const std::shared_ptr& node = nullptr); private: - void emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const override; + void emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const override; template - void emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const; + void emit_isa(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const; void register_table_entries() override; size_t aux_vecs_count() const override; }; - class jit_logical_or_emitter : public jit_emitter { public: - jit_logical_or_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, + jit_logical_or_emitter(dnnl::impl::cpu::x64::jit_generator* host, + dnnl::impl::cpu::x64::cpu_isa_t host_isa, ov::element::Type exec_prc = ov::element::f32); - jit_logical_or_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n, + jit_logical_or_emitter(dnnl::impl::cpu::x64::jit_generator* host, + dnnl::impl::cpu::x64::cpu_isa_t host_isa, + const std::shared_ptr& n, ov::element::Type exec_prc = ov::element::f32); size_t get_inputs_num() const override; - static std::set> get_supported_precisions(const std::shared_ptr& node = nullptr); + static std::set> get_supported_precisions( + const std::shared_ptr& node = nullptr); private: - 
void emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const override; + void emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const override; template - void emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const; + void emit_isa(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const; void register_table_entries() override; size_t aux_vecs_count() const override; }; - class jit_logical_xor_emitter : public jit_emitter { public: - jit_logical_xor_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, + jit_logical_xor_emitter(dnnl::impl::cpu::x64::jit_generator* host, + dnnl::impl::cpu::x64::cpu_isa_t host_isa, ov::element::Type exec_prc = ov::element::f32); - jit_logical_xor_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n, + jit_logical_xor_emitter(dnnl::impl::cpu::x64::jit_generator* host, + dnnl::impl::cpu::x64::cpu_isa_t host_isa, + const std::shared_ptr& n, ov::element::Type exec_prc = ov::element::f32); size_t get_inputs_num() const override; - static std::set> get_supported_precisions(const std::shared_ptr& node = nullptr); + static std::set> get_supported_precisions( + const std::shared_ptr& node = nullptr); private: - void emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const override; + void emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const override; template - void emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const; + void emit_isa(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const; void register_table_entries() override; size_t aux_vecs_count() const override; @@ -430,19 +500,23 @@ class jit_logical_xor_emitter : public jit_emitter { class jit_logical_not_emitter : public jit_emitter { public: - jit_logical_not_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, + jit_logical_not_emitter(dnnl::impl::cpu::x64::jit_generator* host, + dnnl::impl::cpu::x64::cpu_isa_t host_isa, ov::element::Type exec_prc = ov::element::f32); - jit_logical_not_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n, + jit_logical_not_emitter(dnnl::impl::cpu::x64::jit_generator* host, + dnnl::impl::cpu::x64::cpu_isa_t host_isa, + const std::shared_ptr& n, ov::element::Type exec_prc = ov::element::f32); size_t get_inputs_num() const override; - static std::set> get_supported_precisions(const std::shared_ptr& node = nullptr); + static std::set> get_supported_precisions( + const std::shared_ptr& node = nullptr); private: - void emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const override; + void emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const override; template - void emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const; + void emit_isa(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const; void register_table_entries() override; size_t aux_vecs_count() const override; @@ -450,21 +524,26 @@ class jit_logical_not_emitter : public jit_emitter { class jit_power_static_emitter : public jit_emitter { public: - jit_power_static_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, - float inpPower, float inpScale, float inpShift, + 
jit_power_static_emitter(dnnl::impl::cpu::x64::jit_generator* host, + dnnl::impl::cpu::x64::cpu_isa_t host_isa, + float inpPower, + float inpScale, + float inpShift, ov::element::Type exec_prc = ov::element::f32); - jit_power_static_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n, + jit_power_static_emitter(dnnl::impl::cpu::x64::jit_generator* host, + dnnl::impl::cpu::x64::cpu_isa_t host_isa, + const std::shared_ptr& n, ov::element::Type exec_prc = ov::element::f32); size_t get_inputs_num() const override; - static std::set> get_supported_precisions(const std::shared_ptr& node = nullptr); - + static std::set> get_supported_precisions( + const std::shared_ptr& node = nullptr); private: - void emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const override; + void emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const override; template - void emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const; + void emit_isa(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const; void register_table_entries() override; size_t aux_vecs_count() const override; @@ -476,73 +555,90 @@ class jit_power_static_emitter : public jit_emitter { class jit_prelu_emitter : public jit_emitter { public: - jit_prelu_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, + jit_prelu_emitter(dnnl::impl::cpu::x64::jit_generator* host, + dnnl::impl::cpu::x64::cpu_isa_t host_isa, ov::element::Type exec_prc = ov::element::f32); - jit_prelu_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n, + jit_prelu_emitter(dnnl::impl::cpu::x64::jit_generator* host, + dnnl::impl::cpu::x64::cpu_isa_t host_isa, + const std::shared_ptr& n, ov::element::Type exec_prc = ov::element::f32); size_t get_inputs_num() const override; - static std::set> get_supported_precisions(const std::shared_ptr& node = nullptr); + static std::set> get_supported_precisions( + const std::shared_ptr& node = nullptr); private: - void emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const override; + void emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const override; template - void emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const; + void emit_isa(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const; size_t aux_vecs_count() const override; }; class jit_sqrt_emitter : public jit_emitter { public: - jit_sqrt_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, - ov::element::Type exec_prc = ov::element::f32); - jit_sqrt_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n, - ov::element::Type exec_prc = ov::element::f32); + jit_sqrt_emitter(dnnl::impl::cpu::x64::jit_generator* host, + dnnl::impl::cpu::x64::cpu_isa_t host_isa, + ov::element::Type exec_prc = ov::element::f32); + jit_sqrt_emitter(dnnl::impl::cpu::x64::jit_generator* host, + dnnl::impl::cpu::x64::cpu_isa_t host_isa, + const std::shared_ptr& n, + ov::element::Type exec_prc = ov::element::f32); size_t get_inputs_num() const override; - static std::set> get_supported_precisions(const std::shared_ptr& node = nullptr); + static std::set> get_supported_precisions( + const std::shared_ptr& node = nullptr); private: - void emit_impl(const 
std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const override; + void emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const override; template - void emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const; + void emit_isa(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const; }; class jit_negative_emitter : public jit_emitter { public: - jit_negative_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n, - ov::element::Type exec_prc = ov::element::f32); + jit_negative_emitter(dnnl::impl::cpu::x64::jit_generator* host, + dnnl::impl::cpu::x64::cpu_isa_t host_isa, + const std::shared_ptr& n, + ov::element::Type exec_prc = ov::element::f32); size_t get_inputs_num() const override; - static std::set> get_supported_precisions(const std::shared_ptr& node = nullptr); + static std::set> get_supported_precisions( + const std::shared_ptr& node = nullptr); private: void emit_impl(const std::vector& in, const std::vector& out) const override; template - void emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const; + void emit_isa(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const; }; class jit_exp_emitter : public jit_emitter { public: - jit_exp_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, + jit_exp_emitter(dnnl::impl::cpu::x64::jit_generator* host, + dnnl::impl::cpu::x64::cpu_isa_t host_isa, ov::element::Type exec_prc = ov::element::f32); - jit_exp_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n, + jit_exp_emitter(dnnl::impl::cpu::x64::jit_generator* host, + dnnl::impl::cpu::x64::cpu_isa_t host_isa, + const std::shared_ptr& n, ov::element::Type exec_prc = ov::element::f32); size_t get_inputs_num() const override; - static std::set> get_supported_precisions(const std::shared_ptr& node = nullptr); + static std::set> get_supported_precisions( + const std::shared_ptr& node = nullptr); private: - void emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const override; + void emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const override; template - void emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const; + void emit_isa(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const; - bool need_vmm_mask() const { return host_isa_ != dnnl::impl::cpu::x64::avx512_core; } + bool need_vmm_mask() const { + return host_isa_ != dnnl::impl::cpu::x64::avx512_core; + } void register_table_entries() override; size_t aux_vecs_count() const override; @@ -550,103 +646,132 @@ class jit_exp_emitter : public jit_emitter { class jit_erf_emitter : public jit_emitter { public: - jit_erf_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, - ov::element::Type exec_prc = ov::element::f32); + jit_erf_emitter(dnnl::impl::cpu::x64::jit_generator* host, + dnnl::impl::cpu::x64::cpu_isa_t host_isa, + ov::element::Type exec_prc = ov::element::f32); - jit_erf_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n, + jit_erf_emitter(dnnl::impl::cpu::x64::jit_generator* host, + dnnl::impl::cpu::x64::cpu_isa_t host_isa, + const std::shared_ptr& n, ov::element::Type exec_prc = ov::element::f32); void emit_data() const override; size_t get_inputs_num() 
const override; - static std::set> get_supported_precisions(const std::shared_ptr& node = nullptr); + static std::set> get_supported_precisions( + const std::shared_ptr& node = nullptr); private: - void emit_impl( - const std::vector &in_vec_idxs, - const std::vector &out_vec_idxs) const override; + void emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const override; template - void emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const; + void emit_isa(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const; void register_table_entries() override; size_t aux_vecs_count() const override; - std::unique_ptr m_exp_emitter {nullptr}; + std::unique_ptr m_exp_emitter{nullptr}; }; class jit_soft_sign_emitter : public jit_emitter { public: - jit_soft_sign_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, + jit_soft_sign_emitter(dnnl::impl::cpu::x64::jit_generator* host, + dnnl::impl::cpu::x64::cpu_isa_t host_isa, ov::element::Type exec_prc = ov::element::f32); - jit_soft_sign_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n, + jit_soft_sign_emitter(dnnl::impl::cpu::x64::jit_generator* host, + dnnl::impl::cpu::x64::cpu_isa_t host_isa, + const std::shared_ptr& n, ov::element::Type exec_prc = ov::element::f32); size_t get_inputs_num() const override; - static std::set> get_supported_precisions(const std::shared_ptr& node = nullptr); + static std::set> get_supported_precisions( + const std::shared_ptr& node = nullptr); private: - void emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const override; + void emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const override; template - void emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const; + void emit_isa(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const; void register_table_entries() override; }; class jit_is_finite_emitter : public jit_emitter { public: - jit_is_finite_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t hostIsa, - ov::element::Type execPrc = ov::element::f32) : jit_emitter(host, hostIsa, execPrc) { + jit_is_finite_emitter(dnnl::impl::cpu::x64::jit_generator* host, + dnnl::impl::cpu::x64::cpu_isa_t hostIsa, + ov::element::Type execPrc = ov::element::f32) + : jit_emitter(host, hostIsa, execPrc) { prepare_table(); } - jit_is_finite_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t hostIsa, const std::shared_ptr& node, - ov::element::Type execPrc = ov::element::f32) : jit_emitter(host, hostIsa, execPrc) { + jit_is_finite_emitter(dnnl::impl::cpu::x64::jit_generator* host, + dnnl::impl::cpu::x64::cpu_isa_t hostIsa, + const std::shared_ptr& node, + ov::element::Type execPrc = ov::element::f32) + : jit_emitter(host, hostIsa, execPrc) { prepare_table(); } - size_t get_inputs_num() const override { return 1; }; - static std::set> get_supported_precisions(const std::shared_ptr& node = nullptr) { + size_t get_inputs_num() const override { + return 1; + }; + static std::set> get_supported_precisions( + const std::shared_ptr& node = nullptr) { return {{element::f32}}; } protected: - size_t aux_gprs_count() const override { return (entry_map_.empty() ? 0 : 1) + 1; } + size_t aux_gprs_count() const override { + return (entry_map_.empty() ? 
0 : 1) + 1; + } void register_table_entries() override; private: - void emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const override; + void emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const override; template - void emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const; + void emit_isa(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const; }; class jit_is_inf_emitter : public jit_emitter { public: - jit_is_inf_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t hostIsa, - ov::element::Type execPrc = ov::element::f32, bool detect_negative = true, bool detect_positive = true) - : jit_emitter(host, hostIsa, execPrc), detect_negative(detect_negative), detect_positive(detect_positive) { + jit_is_inf_emitter(dnnl::impl::cpu::x64::jit_generator* host, + dnnl::impl::cpu::x64::cpu_isa_t hostIsa, + ov::element::Type execPrc = ov::element::f32, + bool detect_negative = true, + bool detect_positive = true) + : jit_emitter(host, hostIsa, execPrc), + detect_negative(detect_negative), + detect_positive(detect_positive) { prepare_table(); } - jit_is_inf_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t hostIsa, const std::shared_ptr& node, - ov::element::Type execPrc = ov::element::f32): jit_emitter(host, hostIsa, execPrc) { + jit_is_inf_emitter(dnnl::impl::cpu::x64::jit_generator* host, + dnnl::impl::cpu::x64::cpu_isa_t hostIsa, + const std::shared_ptr& node, + ov::element::Type execPrc = ov::element::f32) + : jit_emitter(host, hostIsa, execPrc) { prepare_table(); } - size_t get_inputs_num() const override { return 1; }; - static std::set> get_supported_precisions(const std::shared_ptr& node = nullptr) { + size_t get_inputs_num() const override { + return 1; + }; + static std::set> get_supported_precisions( + const std::shared_ptr& node = nullptr) { return {{element::f32}}; } protected: - size_t aux_gprs_count() const override { return (entry_map_.empty() ? 0 : 1) + 1; } + size_t aux_gprs_count() const override { + return (entry_map_.empty() ? 
0 : 1) + 1; + } void register_table_entries() override; private: - void emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const override; + void emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const override; template - void emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const; + void emit_isa(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const; bool detect_negative; bool detect_positive; @@ -654,58 +779,76 @@ class jit_is_inf_emitter : public jit_emitter { class jit_is_nan_emitter : public jit_emitter { public: - jit_is_nan_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t hostIsa, - ov::element::Type execPrc = ov::element::f32) : jit_emitter(host, hostIsa, execPrc) { + jit_is_nan_emitter(dnnl::impl::cpu::x64::jit_generator* host, + dnnl::impl::cpu::x64::cpu_isa_t hostIsa, + ov::element::Type execPrc = ov::element::f32) + : jit_emitter(host, hostIsa, execPrc) { prepare_table(); } - jit_is_nan_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t hostIsa, const std::shared_ptr& node, - ov::element::Type execPrc = ov::element::f32) : jit_emitter(host, hostIsa, execPrc) { + jit_is_nan_emitter(dnnl::impl::cpu::x64::jit_generator* host, + dnnl::impl::cpu::x64::cpu_isa_t hostIsa, + const std::shared_ptr& node, + ov::element::Type execPrc = ov::element::f32) + : jit_emitter(host, hostIsa, execPrc) { prepare_table(); } - size_t get_inputs_num() const override { return 1; } - static std::set> get_supported_precisions(const std::shared_ptr& node = nullptr) { + size_t get_inputs_num() const override { + return 1; + } + static std::set> get_supported_precisions( + const std::shared_ptr& node = nullptr) { return {{element::f32}}; } protected: - size_t aux_gprs_count() const override { return (entry_map_.empty() ? 0 : 1) + 1; } + size_t aux_gprs_count() const override { + return (entry_map_.empty() ? 
0 : 1) + 1; + } void register_table_entries() override; private: - void emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const override; + void emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const override; template - void emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const; + void emit_isa(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const; }; class jit_select_emitter : public jit_emitter { public: - jit_select_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, + jit_select_emitter(dnnl::impl::cpu::x64::jit_generator* host, + dnnl::impl::cpu::x64::cpu_isa_t host_isa, ov::element::Type exec_prc = ov::element::f32); - jit_select_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n, + jit_select_emitter(dnnl::impl::cpu::x64::jit_generator* host, + dnnl::impl::cpu::x64::cpu_isa_t host_isa, + const std::shared_ptr& n, ov::element::Type exec_prc = ov::element::f32); size_t get_inputs_num() const override; - static std::set> get_supported_precisions(const std::shared_ptr& node = nullptr); + static std::set> get_supported_precisions( + const std::shared_ptr& node = nullptr); size_t aux_vecs_count() const override; private: - void emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const override; + void emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const override; template - void emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const; + void emit_isa(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const; }; class jit_bitwise_and_emitter : public jit_emitter { public: - jit_bitwise_and_emitter(dnnl::impl::cpu::x64::jit_generator* host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, - ov::element::Type exec_prc = ov::element::f32); - jit_bitwise_and_emitter(dnnl::impl::cpu::x64::jit_generator* host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n, - ov::element::Type exec_prc = ov::element::f32); + jit_bitwise_and_emitter(dnnl::impl::cpu::x64::jit_generator* host, + dnnl::impl::cpu::x64::cpu_isa_t host_isa, + ov::element::Type exec_prc = ov::element::f32); + jit_bitwise_and_emitter(dnnl::impl::cpu::x64::jit_generator* host, + dnnl::impl::cpu::x64::cpu_isa_t host_isa, + const std::shared_ptr& n, + ov::element::Type exec_prc = ov::element::f32); size_t get_inputs_num() const override; - static std::set> get_supported_precisions(const std::shared_ptr& node = nullptr); + static std::set> get_supported_precisions( + const std::shared_ptr& node = nullptr); private: void emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const override; @@ -716,13 +859,17 @@ class jit_bitwise_and_emitter : public jit_emitter { class jit_bitwise_not_emitter : public jit_emitter { public: - jit_bitwise_not_emitter(dnnl::impl::cpu::x64::jit_generator* host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, - ov::element::Type exec_prc = ov::element::f32); - jit_bitwise_not_emitter(dnnl::impl::cpu::x64::jit_generator* host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n, - ov::element::Type exec_prc = ov::element::f32); + jit_bitwise_not_emitter(dnnl::impl::cpu::x64::jit_generator* host, + dnnl::impl::cpu::x64::cpu_isa_t host_isa, + ov::element::Type exec_prc = ov::element::f32); + jit_bitwise_not_emitter(dnnl::impl::cpu::x64::jit_generator* host, + 
dnnl::impl::cpu::x64::cpu_isa_t host_isa, + const std::shared_ptr& n, + ov::element::Type exec_prc = ov::element::f32); size_t get_inputs_num() const override; - static std::set> get_supported_precisions(const std::shared_ptr& node = nullptr); + static std::set> get_supported_precisions( + const std::shared_ptr& node = nullptr); size_t aux_vecs_count() const override; private: @@ -735,13 +882,17 @@ class jit_bitwise_not_emitter : public jit_emitter { class jit_bitwise_or_emitter : public jit_emitter { public: - jit_bitwise_or_emitter(dnnl::impl::cpu::x64::jit_generator* host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, - ov::element::Type exec_prc = ov::element::f32); - jit_bitwise_or_emitter(dnnl::impl::cpu::x64::jit_generator* host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n, - ov::element::Type exec_prc = ov::element::f32); + jit_bitwise_or_emitter(dnnl::impl::cpu::x64::jit_generator* host, + dnnl::impl::cpu::x64::cpu_isa_t host_isa, + ov::element::Type exec_prc = ov::element::f32); + jit_bitwise_or_emitter(dnnl::impl::cpu::x64::jit_generator* host, + dnnl::impl::cpu::x64::cpu_isa_t host_isa, + const std::shared_ptr& n, + ov::element::Type exec_prc = ov::element::f32); size_t get_inputs_num() const override; - static std::set> get_supported_precisions(const std::shared_ptr& node = nullptr); + static std::set> get_supported_precisions( + const std::shared_ptr& node = nullptr); private: void emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const override; @@ -752,13 +903,17 @@ class jit_bitwise_or_emitter : public jit_emitter { class jit_bitwise_xor_emitter : public jit_emitter { public: - jit_bitwise_xor_emitter(dnnl::impl::cpu::x64::jit_generator* host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, - ov::element::Type exec_prc = ov::element::f32); - jit_bitwise_xor_emitter(dnnl::impl::cpu::x64::jit_generator* host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n, - ov::element::Type exec_prc = ov::element::f32); + jit_bitwise_xor_emitter(dnnl::impl::cpu::x64::jit_generator* host, + dnnl::impl::cpu::x64::cpu_isa_t host_isa, + ov::element::Type exec_prc = ov::element::f32); + jit_bitwise_xor_emitter(dnnl::impl::cpu::x64::jit_generator* host, + dnnl::impl::cpu::x64::cpu_isa_t host_isa, + const std::shared_ptr& n, + ov::element::Type exec_prc = ov::element::f32); size_t get_inputs_num() const override; - static std::set> get_supported_precisions(const std::shared_ptr& node = nullptr); + static std::set> get_supported_precisions( + const std::shared_ptr& node = nullptr); private: void emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const override; @@ -767,5 +922,5 @@ class jit_bitwise_xor_emitter : public jit_emitter { void emit_isa(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const; }; -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/emitters/plugin/x64/jit_emitter.cpp b/src/plugins/intel_cpu/src/emitters/plugin/x64/jit_emitter.cpp index acbb04ea01af80..7ee4d5184b311a 100644 --- a/src/plugins/intel_cpu/src/emitters/plugin/x64/jit_emitter.cpp +++ b/src/plugins/intel_cpu/src/emitters/plugin/x64/jit_emitter.cpp @@ -3,9 +3,11 @@ // #include "jit_emitter.hpp" + #include -#include "utils/general_utils.h" + #include "utils.hpp" +#include "utils/general_utils.h" using namespace dnnl::impl::cpu; using namespace dnnl::impl; @@ -19,11 +21,12 @@ size_t jit_emitter::get_max_vecs_count() const { } size_t 
jit_emitter::get_vec_length() const { - return one_of(host_isa_, cpu::x64::avx512_core, cpu::x64::avx512_core) ? 64 : - one_of(host_isa_, cpu::x64::avx2) ? 32 : 16; + return one_of(host_isa_, cpu::x64::avx512_core, cpu::x64::avx512_core) ? 64 + : one_of(host_isa_, cpu::x64::avx2) ? 32 + : 16; } -void jit_emitter::push_vec(const Xbyak::Address &addr, size_t vec_idx) const { +void jit_emitter::push_vec(const Xbyak::Address& addr, size_t vec_idx) const { if (host_isa_ == cpu::x64::sse41) { h->uni_vmovups(addr, Xmm(vec_idx)); } else if (host_isa_ == cpu::x64::avx2) { @@ -33,7 +36,7 @@ void jit_emitter::push_vec(const Xbyak::Address &addr, size_t vec_idx) const { } } -void jit_emitter::pop_vec(size_t vec_idx, const Xbyak::Address &addr) const { +void jit_emitter::pop_vec(size_t vec_idx, const Xbyak::Address& addr) const { if (host_isa_ == cpu::x64::sse41) { h->uni_vmovups(Xmm(vec_idx), addr); } else if (host_isa_ == cpu::x64::avx2) { @@ -60,11 +63,15 @@ std::set> jit_emitter::get_supported_precisions(const return {}; } -void jit_emitter::emitter_preamble(const std::vector &in_idxs, const std::vector &out_idxs, - const std::vector &pool_vec_idxs, const std::vector &pool_gpr_idxs) const { +void jit_emitter::emitter_preamble(const std::vector& in_idxs, + const std::vector& out_idxs, + const std::vector& pool_vec_idxs, + const std::vector& pool_gpr_idxs) const { using namespace Xbyak::util; - bool is_vec_input = (in_out_type_ == emitter_in_out_map::vec_to_vec) || (in_out_type_ == emitter_in_out_map::vec_to_gpr); - bool is_vec_output = (in_out_type_ == emitter_in_out_map::vec_to_vec) || (in_out_type_ == emitter_in_out_map::gpr_to_vec); + bool is_vec_input = + (in_out_type_ == emitter_in_out_map::vec_to_vec) || (in_out_type_ == emitter_in_out_map::vec_to_gpr); + bool is_vec_output = + (in_out_type_ == emitter_in_out_map::vec_to_vec) || (in_out_type_ == emitter_in_out_map::gpr_to_vec); for (auto idx : pool_vec_idxs) aux_vec_idxs.push_back(idx); @@ -73,9 +80,11 @@ void jit_emitter::emitter_preamble(const std::vector &in_idxs, const std if (host_isa_ == cpu::x64::sse41 && aux_vecs_count() > 0) { size_t idx = 0; if (is_vec_input) - OV_CPU_JIT_EMITTER_ASSERT(std::find(in_idxs.begin(), in_idxs.end(), idx) == in_idxs.end(), "Xmm(0) cannot be input register in SSE41"); + OV_CPU_JIT_EMITTER_ASSERT(std::find(in_idxs.begin(), in_idxs.end(), idx) == in_idxs.end(), + "Xmm(0) cannot be input register in SSE41"); if (is_vec_output) - OV_CPU_JIT_EMITTER_ASSERT(std::find(out_idxs.begin(), out_idxs.end(), idx) == out_idxs.end(), "Xmm(0) cannot be output register in SSE41"); + OV_CPU_JIT_EMITTER_ASSERT(std::find(out_idxs.begin(), out_idxs.end(), idx) == out_idxs.end(), + "Xmm(0) cannot be output register in SSE41"); if (std::find(aux_vec_idxs.begin(), aux_vec_idxs.end(), idx) == aux_vec_idxs.end()) { aux_vec_idxs.push_back(idx); preserved_vec_idxs.push_back(idx); @@ -93,16 +102,21 @@ void jit_emitter::emitter_preamble(const std::vector &in_idxs, const std } for (size_t idx = 0; idx < get_max_vecs_count(); idx++) { - if (aux_vec_idxs.size() >= aux_vecs_count()) break; + if (aux_vec_idxs.size() >= aux_vecs_count()) + break; if (is_vec_input) { - if (std::find(in_idxs.begin(), in_idxs.end(), idx) != in_idxs.end()) continue; + if (std::find(in_idxs.begin(), in_idxs.end(), idx) != in_idxs.end()) + continue; } if (is_vec_output) { - if (std::find(out_idxs.begin(), out_idxs.end(), idx) != out_idxs.end()) continue; + if (std::find(out_idxs.begin(), out_idxs.end(), idx) != out_idxs.end()) + continue; } - if 
(std::find(aux_vec_idxs.begin(), aux_vec_idxs.end(), idx) != aux_vec_idxs.end()) continue; - if (std::find(preserved_vec_idxs.begin(), preserved_vec_idxs.end(), idx) != preserved_vec_idxs.end()) continue; + if (std::find(aux_vec_idxs.begin(), aux_vec_idxs.end(), idx) != aux_vec_idxs.end()) + continue; + if (std::find(preserved_vec_idxs.begin(), preserved_vec_idxs.end(), idx) != preserved_vec_idxs.end()) + continue; aux_vec_idxs.push_back(idx); preserved_vec_idxs.push_back(idx); @@ -115,18 +129,24 @@ void jit_emitter::emitter_preamble(const std::vector &in_idxs, const std aux_gpr_idxs.push_back(idx); for (size_t gpr_idx = 0; gpr_idx <= Operand::R15; ++gpr_idx) { - size_t _idx = Operand::R15 - gpr_idx; // we allocate from the end + size_t _idx = Operand::R15 - gpr_idx; // we allocate from the end - if (aux_gpr_idxs.size() >= aux_gprs_count()) break; - if (_idx == Operand::RSP) continue; + if (aux_gpr_idxs.size() >= aux_gprs_count()) + break; + if (_idx == Operand::RSP) + continue; if (!is_vec_input) { - if (std::find(in_idxs.begin(), in_idxs.end(), _idx) != in_idxs.end()) continue; + if (std::find(in_idxs.begin(), in_idxs.end(), _idx) != in_idxs.end()) + continue; } if (!is_vec_output) { - if (std::find(out_idxs.begin(), out_idxs.end(), _idx) != out_idxs.end()) continue; + if (std::find(out_idxs.begin(), out_idxs.end(), _idx) != out_idxs.end()) + continue; } - if (std::find(aux_gpr_idxs.begin(), aux_gpr_idxs.end(), _idx) != aux_gpr_idxs.end()) continue; - if (std::find(preserved_gpr_idxs.begin(), preserved_gpr_idxs.end(), _idx) != preserved_gpr_idxs.end()) continue; + if (std::find(aux_gpr_idxs.begin(), aux_gpr_idxs.end(), _idx) != aux_gpr_idxs.end()) + continue; + if (std::find(preserved_gpr_idxs.begin(), preserved_gpr_idxs.end(), _idx) != preserved_gpr_idxs.end()) + continue; aux_gpr_idxs.push_back(_idx); preserved_gpr_idxs.push_back(_idx); @@ -154,7 +174,6 @@ void jit_emitter::emitter_preamble(const std::vector &in_idxs, const std load_table_addr(); } - void jit_emitter::emitter_postamble() const { using namespace Xbyak::util; @@ -183,7 +202,7 @@ void jit_emitter::emit_data() const { // Run through the map and insert values stored there for (auto it = entry_map_.begin(); it != entry_map_.end(); it++) { - const auto &te = (*it).second; // get map entry for a given key + const auto& te = (*it).second; // get map entry for a given key const auto len = te.bcast ? get_vec_length() : sizeof(table_entry_val_t); for (size_t d = 0; d < len; d += sizeof(table_entry_val_t)) h->dd(te.val); @@ -199,14 +218,16 @@ void jit_emitter::prepare_table() { // prepare_table. size_t off = 0; for (auto it = entry_map_.begin(); it != entry_map_.end(); it++) { - auto &te = (*it).second; + auto& te = (*it).second; te.off = off; off += te.bcast ? 
get_vec_length() : sizeof(table_entry_val_t); } } -void jit_emitter::emit_code(const std::vector &in_idxs, const std::vector &out_idxs, - const std::vector &pool_vec_idxs, const std::vector &pool_gpr_idxs) const { +void jit_emitter::emit_code(const std::vector& in_idxs, + const std::vector& out_idxs, + const std::vector& pool_vec_idxs, + const std::vector& pool_gpr_idxs) const { emitter_preamble(in_idxs, out_idxs, pool_vec_idxs, pool_gpr_idxs); emit_impl(in_idxs, out_idxs); @@ -214,5 +235,5 @@ void jit_emitter::emit_code(const std::vector &in_idxs, const std::vecto emitter_postamble(); } -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/emitters/plugin/x64/jit_emitter.hpp b/src/plugins/intel_cpu/src/emitters/plugin/x64/jit_emitter.hpp index c5729613f1bfe5..04ac2e6ea0684d 100644 --- a/src/plugins/intel_cpu/src/emitters/plugin/x64/jit_emitter.hpp +++ b/src/plugins/intel_cpu/src/emitters/plugin/x64/jit_emitter.hpp @@ -4,17 +4,17 @@ #pragma once -#include "cpu/x64/jit_generator.hpp" - -#include "snippets/snippets_isa.hpp" -#include "snippets/generator.hpp" -#include "emitters/utils.hpp" #include #include +#include "cpu/x64/jit_generator.hpp" +#include "emitters/utils.hpp" +#include "snippets/generator.hpp" +#include "snippets/snippets_isa.hpp" + #ifdef SNIPPETS_DEBUG_CAPS -#include "emitters/snippets/x64/verbose.hpp" +# include "emitters/snippets/x64/verbose.hpp" #endif namespace ov { @@ -34,14 +34,23 @@ struct emitter_params { class jit_emitter : public ov::snippets::Emitter { public: - jit_emitter(dnnl::impl::cpu::x64::jit_generator* host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, - ov::element::Type exec_prc = ov::element::f32, emitter_in_out_map in_out_type = emitter_in_out_map::vec_to_vec) - : Emitter(), h(host), host_isa_(host_isa), exec_prc_(exec_prc), l_table (new Xbyak::Label()), in_out_type_(in_out_type) { - k_mask = Xbyak::Opmask(1); // FIXME: in general case we need preserve k_mask state as well + jit_emitter(dnnl::impl::cpu::x64::jit_generator* host, + dnnl::impl::cpu::x64::cpu_isa_t host_isa, + ov::element::Type exec_prc = ov::element::f32, + emitter_in_out_map in_out_type = emitter_in_out_map::vec_to_vec) + : Emitter(), + h(host), + host_isa_(host_isa), + exec_prc_(exec_prc), + l_table(new Xbyak::Label()), + in_out_type_(in_out_type) { + k_mask = Xbyak::Opmask(1); // FIXME: in general case we need preserve k_mask state as well } - void emit_code(const std::vector &in_idxs, const std::vector &out_idxs, - const std::vector &pool_vec_idxs = {}, const std::vector &pool_gpr_idxs = {}) const override; + void emit_code(const std::vector& in_idxs, + const std::vector& out_idxs, + const std::vector& pool_vec_idxs = {}, + const std::vector& pool_gpr_idxs = {}) const override; void emit_data() const override; virtual size_t get_inputs_num() const = 0; @@ -53,10 +62,11 @@ class jit_emitter : public ov::snippets::Emitter { * Precisions are ordered, the first bigger bitness precision with the same type will be selected. * Empty collection means the emitter supports any input precisions. 
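 * For example (illustrative only, not taken from this patch): an emitter that accepts exactly two
 * f32 inputs could return {{element::f32, element::f32}} from this method, and leaving the
 * collection empty places no restriction on input precisions.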
*/ - static std::set> get_supported_precisions(const std::shared_ptr& node = nullptr); + static std::set> get_supported_precisions( + const std::shared_ptr& node = nullptr); #ifdef SNIPPETS_DEBUG_CAPS - const char *info() const { + const char* info() const { if (!info_.is_initialized()) info_.init(this); return info_.c_str(); @@ -77,12 +87,14 @@ class jit_emitter : public ov::snippets::Emitter { virtual void prepare_table(); virtual void register_table_entries() {} - void load_table_addr() const { h->mov(p_table, *l_table.get()); } + void load_table_addr() const { + h->mov(p_table, *l_table.get()); + } // we accept only 32bit hexadecimal table values to avoid any rounding using table_entry_val_t = uint32_t; - using table_entry_offset_t = size_t; // offsets are in bytes wrt p_table - using table_entry_bcast_t = bool; // true => bcast value + using table_entry_offset_t = size_t; // offsets are in bytes wrt p_table + using table_entry_bcast_t = bool; // true => bcast value struct table_entry_t { table_entry_val_t val; @@ -106,10 +118,12 @@ class jit_emitter : public ov::snippets::Emitter { _cmp_gt_os = dnnl::impl::cpu::x64::jit_generator::_cmp_nle_us, }; - virtual void emit_impl(const std::vector &in_idxs, const std::vector &out_idxs) const = 0; + virtual void emit_impl(const std::vector& in_idxs, const std::vector& out_idxs) const = 0; - virtual void emitter_preamble(const std::vector &in_idxs, const std::vector &out_idxs, - const std::vector &pool_vec_idxs, const std::vector &pool_gpr_idxs) const; + virtual void emitter_preamble(const std::vector& in_idxs, + const std::vector& out_idxs, + const std::vector& pool_vec_idxs, + const std::vector& pool_gpr_idxs) const; virtual void emitter_postamble() const; emitter_in_out_map in_out_type_; @@ -132,14 +146,14 @@ class jit_emitter : public ov::snippets::Emitter { mapped_table_t entry_map_; void push_arg_entry_of(const std::string key, const table_entry_val_t val, const bool broadcast) { - mapped_table_entry_t te {0, val, broadcast}; + mapped_table_entry_t te{0, val, broadcast}; entry_map_.insert(std::make_pair(key, te)); } - void push_entries_of(const table_t &t) { + void push_entries_of(const table_t& t) { for (auto it = t.begin(); it != t.end(); it++) { auto key = (*it).first; - auto te = (*it).second; // copy values from table + auto te = (*it).second; // copy values from table push_arg_entry_of(key, te.val, te.bcast); } } @@ -155,20 +169,20 @@ class jit_emitter : public ov::snippets::Emitter { mutable std::vector preserved_vec_idxs; mutable std::vector preserved_gpr_idxs; - void push_vec(const Xbyak::Address &addr, size_t vec_idx) const; - void pop_vec(size_t vec_idx, const Xbyak::Address &addr) const; + void push_vec(const Xbyak::Address& addr, size_t vec_idx) const; + void pop_vec(size_t vec_idx, const Xbyak::Address& addr) const; size_t table_off(std::string& key, size_t key_off_val_shift = 0) const { // assumption: all table entries sharing the same key also // share their broadcast property // TODO: enforce through data structure - const auto it = entry_map_.find(key); // search an entry for a key + const auto it = entry_map_.find(key); // search an entry for a key OV_CPU_JIT_EMITTER_ASSERT(it != entry_map_.end(), "Value has not been found in the table"); - const auto &te = (*it).second; + const auto& te = (*it).second; const auto scale = te.bcast ? 
get_vec_length() : sizeof(table_entry_val_t); return te.off + key_off_val_shift * scale; } }; -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/emitters/plugin/x64/jit_load_store_emitters.cpp b/src/plugins/intel_cpu/src/emitters/plugin/x64/jit_load_store_emitters.cpp index 893c18768a9511..513c1f70d22932 100644 --- a/src/plugins/intel_cpu/src/emitters/plugin/x64/jit_load_store_emitters.cpp +++ b/src/plugins/intel_cpu/src/emitters/plugin/x64/jit_load_store_emitters.cpp @@ -3,6 +3,7 @@ // #include "jit_load_store_emitters.hpp" + #include "utils/bfloat16.hpp" using namespace dnnl::impl; @@ -16,19 +17,20 @@ using namespace Xbyak::util; // An auxiliary vector reg(data_reg_new) is used as destination vector for source pollution instructions, // After updated, processed with new vector and no more need to update as source is preserved. // e.g. with STORE_KEEP_SOURCE(vextractf128, xmm, Xmm(aux_src_idx), ymm, 1); -// if ymm is already updated, h->vextractf128(xmm, ymm, 1) is used, which change ymm values as xmm and ymm have the same index. -// if ymm is not updated, h->vextractf128(Xmm(aux_src_idx), ymm, 1) is used, which keep ymm values unchanged as destination is another vector reg. +// if ymm is already updated, h->vextractf128(xmm, ymm, 1) is used, which change ymm values as xmm and ymm have the +// same index. if ymm is not updated, h->vextractf128(Xmm(aux_src_idx), ymm, 1) is used, which keep ymm values +// unchanged as destination is another vector reg. #define STORE_KEEP_SOURCE(instruction, data_reg, data_reg_new, ...) \ - if (data_reg_updated) { \ - h->instruction(data_reg, __VA_ARGS__); \ - } else { \ - h->instruction(data_reg_new, __VA_ARGS__); \ - data_idx = aux_src_idx; \ - xmm = Xbyak::Xmm(data_idx); \ - ymm = Xbyak::Ymm(data_idx); \ - zmm = Xbyak::Zmm(data_idx); \ - vmm = Vmm(data_idx); \ - data_reg_updated = true; \ + if (data_reg_updated) { \ + h->instruction(data_reg, __VA_ARGS__); \ + } else { \ + h->instruction(data_reg_new, __VA_ARGS__); \ + data_idx = aux_src_idx; \ + xmm = Xbyak::Xmm(data_idx); \ + ymm = Xbyak::Ymm(data_idx); \ + zmm = Xbyak::Zmm(data_idx); \ + vmm = Vmm(data_idx); \ + data_reg_updated = true; \ } namespace ov { @@ -39,7 +41,7 @@ namespace { constexpr int threshold_for_mask_emu_load = 14; // heuristic threshold number by byte between mask store and emulation with several simple partial store constexpr int threshold_for_mask_emu_store = 6; -} // namespace +} // namespace size_t load_emitter_params::hash() const { size_t seed = 0; @@ -61,46 +63,69 @@ size_t store_emitter_params::hash() const { return seed; } -static int get_aux_regs_as_temp(const int elem_count, const int data_size, bool is_pure_move, bool is_store_as_real16, - const int avx512_threshold_for_mask = 0, const bool is_fill = false) { +static int get_aux_regs_as_temp(const int elem_count, + const int data_size, + bool is_pure_move, + bool is_store_as_real16, + const int avx512_threshold_for_mask = 0, + const bool is_fill = false) { if (mayiuse(cpu::x64::avx512_core) && is_fill) return 1; // for pure move, there are direct no-mask instructions to move on full xmm/ymm/zmm, so aux_gpr is not needed. // for move+convert: - // there are direct no-mask instructions to load i8/u8/i16/u16/bf16/fp16 to full xmm/ymm/zmm as f32/i32, so aux_gpr is not needed. - // there are direct no-mask instructions to store i32 on full xmm/ymm/zmm to i8/u8/i16/u16, so aux_gpr is not needed. 
- // store f32 on full xmm/ymm/zmm to bf16/fp16, need convert to bf16/fp16 on vmm, then store vmm to memory, use store_dword_to_word/byte_base condition. - // store_num == 16, vector: 16 * f32 -> 16 * bf16 -> ymm(256bit) -> store - // store_num == 8, vector: 8 * f32 -> 8 * bf16 -> xmm(128bit) -> store - // store_num == 4, vector: 4 * f32 -> 4 * bf16 -> 64bit -> masked instruction with aux_gpr needed - // f32<->i32 is on full vmm, so aux_gpr is not needed. + // there are direct no-mask instructions to load i8/u8/i16/u16/bf16/fp16 to full xmm/ymm/zmm as f32/i32, so aux_gpr + // is not needed. there are direct no-mask instructions to store i32 on full xmm/ymm/zmm to i8/u8/i16/u16, so + // aux_gpr is not needed. store f32 on full xmm/ymm/zmm to bf16/fp16, need convert to bf16/fp16 on vmm, then store + // vmm to memory, use store_dword_to_word/byte_base condition. store_num == 16, vector: 16 * f32 -> 16 * bf16 -> + // ymm(256bit) -> store store_num == 8, vector: 8 * f32 -> 8 * bf16 -> xmm(128bit) -> store store_num == 4, + // vector: 4 * f32 -> 4 * bf16 -> 64bit -> masked instruction with aux_gpr needed f32<->i32 is on full vmm, + // so aux_gpr is not needed. const int byte_size = elem_count * data_size; - if ((is_pure_move && one_of(byte_size, 16, 32, 64)) || (!is_pure_move && one_of(elem_count, 4, 8, 16) && !is_store_as_real16)) + if ((is_pure_move && one_of(byte_size, 16, 32, 64)) || + (!is_pure_move && one_of(elem_count, 4, 8, 16) && !is_store_as_real16)) return 0; - if ((mayiuse(cpu::x64::avx512_core) && (byte_size > avx512_threshold_for_mask)) || (one_of(byte_size % 16, 1, 2, 3))) + if ((mayiuse(cpu::x64::avx512_core) && (byte_size > avx512_threshold_for_mask)) || + (one_of(byte_size % 16, 1, 2, 3))) return 1; return 0; } /// LOAD /// -jit_load_emitter::jit_load_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, - ov::element::Type src_prc, ov::element::Type dst_prc, int load_num, ov::element::Type exec_prc, - bool is_fill, std::string fill_value, emitter_in_out_map in_out_type) -: jit_emitter(host, host_isa, exec_prc, in_out_type), name_("unknown"), load_num_(load_num), src_prc_(src_prc), - dst_prc_(dst_prc), is_fill_(is_fill), fill_value_(fill_value) { +jit_load_emitter::jit_load_emitter(dnnl::impl::cpu::x64::jit_generator* host, + dnnl::impl::cpu::x64::cpu_isa_t host_isa, + ov::element::Type src_prc, + ov::element::Type dst_prc, + int load_num, + ov::element::Type exec_prc, + bool is_fill, + std::string fill_value, + emitter_in_out_map in_out_type) + : jit_emitter(host, host_isa, exec_prc, in_out_type), + name_("unknown"), + load_num_(load_num), + src_prc_(src_prc), + dst_prc_(dst_prc), + is_fill_(is_fill), + fill_value_(fill_value) { prepare_table(); load_size_ = load_num * src_prc.size(); v_len_elt_ = get_vec_length() / exec_prc.size(); } -size_t jit_load_emitter::get_inputs_num() const { return 1; } +size_t jit_load_emitter::get_inputs_num() const { + return 1; +} size_t jit_load_emitter::aux_gprs_count() const { // 0 for temp reg for mask load in avx512 if needed - const auto is_pure_load = (src_prc_ == dst_prc_) || - (one_of(src_prc_, ov::element::f32, ov::element::i32) && - one_of(dst_prc_, ov::element::f32, ov::element::i32)); - int count = get_aux_regs_as_temp(load_num_, static_cast(src_prc_.size()), is_pure_load, false, threshold_for_mask_emu_load, is_fill_); + const auto is_pure_load = (src_prc_ == dst_prc_) || (one_of(src_prc_, ov::element::f32, ov::element::i32) && + one_of(dst_prc_, ov::element::f32, ov::element::i32)); + int count 
= get_aux_regs_as_temp(load_num_, + static_cast(src_prc_.size()), + is_pure_load, + false, + threshold_for_mask_emu_load, + is_fill_); // 1 for table address if (is_fill_) @@ -109,7 +134,7 @@ size_t jit_load_emitter::aux_gprs_count() const { return count; } -void jit_load_emitter::emit_impl(const std::vector &in_idxs, const std::vector &out_idxs) const { +void jit_load_emitter::emit_impl(const std::vector& in_idxs, const std::vector& out_idxs) const { // offset in load emitter is the offset of src gpr register, should be parsed from in_idxs. const int offset = in_idxs.size() == 2 ? in_idxs[1] : 0; if (host_isa_ == cpu::x64::sse41) { @@ -124,7 +149,7 @@ void jit_load_emitter::emit_impl(const std::vector &in_idxs, const std:: } template -void jit_load_emitter::emit_isa(const Xbyak::Reg64 ®_src, const int out_vec_idx, const int offset) const { +void jit_load_emitter::emit_isa(const Xbyak::Reg64& reg_src, const int out_vec_idx, const int offset) const { bool matched_prc = (dst_prc_ == src_prc_) || (dst_prc_ == ov::element::f32) || (dst_prc_ == ov::element::i32); if (!matched_prc) { OV_CPU_JIT_EMITTER_THROW("only support output precision of FP32 or I32 or the same precision as input."); @@ -139,43 +164,43 @@ void jit_load_emitter::emit_isa(const Xbyak::Reg64 ®_src, const int out_vec_i if (src_prc_ == dst_prc_) { load_bytes(Vmm(out_vec_idx), reg_src, offset, load_size_); } else { - // "pure load" + convert. dst_prc must be FP32 or I32. + // "pure load" + convert. dst_prc must be FP32 or I32. switch (src_prc_) { - case ov::element::f32: - case ov::element::i32: - load_bytes(Vmm(out_vec_idx), reg_src, offset, load_size_); - break; - case ov::element::i8: - load_bytes_to_dword_extension(Vmm(out_vec_idx), reg_src, offset, true, load_size_); - break; - case ov::element::u8: - load_bytes_to_dword_extension(Vmm(out_vec_idx), reg_src, offset, false, load_size_); - break; - case ov::element::i16: - case ov::element::u16: - case ov::element::bf16: - case ov::element::f16: - load_words_to_dword_extension(Vmm(out_vec_idx), reg_src, offset, src_prc_, load_size_); - break; - default: - OV_CPU_JIT_EMITTER_THROW("has unsupported src precision to load."); + case ov::element::f32: + case ov::element::i32: + load_bytes(Vmm(out_vec_idx), reg_src, offset, load_size_); + break; + case ov::element::i8: + load_bytes_to_dword_extension(Vmm(out_vec_idx), reg_src, offset, true, load_size_); + break; + case ov::element::u8: + load_bytes_to_dword_extension(Vmm(out_vec_idx), reg_src, offset, false, load_size_); + break; + case ov::element::i16: + case ov::element::u16: + case ov::element::bf16: + case ov::element::f16: + load_words_to_dword_extension(Vmm(out_vec_idx), reg_src, offset, src_prc_, load_size_); + break; + default: + OV_CPU_JIT_EMITTER_THROW("has unsupported src precision to load."); } } // post convert between I32 and FP32 if (src_prc_ != dst_prc_) { switch (dst_prc_) { - case ov::element::f32: - if (!src_prc_.is_real()) - h->uni_vcvtdq2ps(Vmm(out_vec_idx), Vmm(out_vec_idx)); - break; - case ov::element::i32: - if (src_prc_.is_real()) { - h->uni_vcvtps2dq(Vmm(out_vec_idx), Vmm(out_vec_idx)); - } - break; - default: - break; + case ov::element::f32: + if (!src_prc_.is_real()) + h->uni_vcvtdq2ps(Vmm(out_vec_idx), Vmm(out_vec_idx)); + break; + case ov::element::i32: + if (src_prc_.is_real()) { + h->uni_vcvtps2dq(Vmm(out_vec_idx), Vmm(out_vec_idx)); + } + break; + default: + break; } } @@ -186,19 +211,19 @@ void jit_load_emitter::emit_isa(const Xbyak::Reg64 ®_src, const int out_vec_i } /** -* load_bytes is the 
utility function to facilitate loading of -* load_size (0 <= load_size <= 64) many contiguous bytes into the Xmm/Ymm/Zmm -* register from the memory referenced by ptr[reg + offset] address. -* -* Functionally, invocation of load_bytes is equivalent to -* the following loop: -* -* for (int idx = 0; idx < load_size; ++idx) -* vpinsrb(vmm, vmm, ptr[reg + offset + idx], idx); -* -*/ + * load_bytes is the utility function to facilitate loading of + * load_size (0 <= load_size <= 64) many contiguous bytes into the Xmm/Ymm/Zmm + * register from the memory referenced by ptr[reg + offset] address. + * + * Functionally, invocation of load_bytes is equivalent to + * the following loop: + * + * for (int idx = 0; idx < load_size; ++idx) + * vpinsrb(vmm, vmm, ptr[reg + offset + idx], idx); + * + */ template -void jit_load_emitter::load_bytes(const Vmm &vmm, const Xbyak::Reg64 ®, int offset, int load_size) const { +void jit_load_emitter::load_bytes(const Vmm& vmm, const Xbyak::Reg64& reg, int offset, int load_size) const { constexpr bool is_xmm = std::is_same::value; constexpr bool is_ymm = std::is_same::value; constexpr bool is_zmm = std::is_same::value; @@ -249,14 +274,17 @@ void jit_load_emitter::load_bytes(const Vmm &vmm, const Xbyak::Reg64 ®, int o } // Cornerstone of partial load is combinaion of vpinsrb/w/d. - // As vpinsrb/w/d will not only write(insert) values into vmm, but also read values in vmm to copy from to positions that not in imm mask, - // this could introduce RAW false dependency(we actually do not care about values not in imm mask). - // To eliminate this false dependency, + // As vpinsrb/w/d will not only write(insert) values into vmm, but also read values in vmm to copy from to + // positions that not in imm mask, this could introduce RAW false dependency(we actually do not care about + // values not in imm mask). To eliminate this false dependency, // 1. For 1/2/3/4 bytes tails, replace vpinsrb/w/d with mov,shl etc instructions that don't read vmm. - // Besides eliminate RAW, these instructions have smaller latency, which also bring better perf, especially for small loop iteration case. + // Besides eliminate RAW, these instructions have smaller latency, which also bring better perf, especially + // for small loop iteration case. // 2. For 8/16 bytes, use vmovq/vmovdqu instructions to load, which also don't read src vmm. - // 3. For other size, insert vpxor before vpinsrb/w/d. vpxor and read vmm instructions in previous loop have WAR(write after read) relationship. - // CPU can identify this scenario and assign another physical vector register(register renameing) in next loop to eliminate RAW. + // 3. For other size, insert vpxor before vpinsrb/w/d. vpxor and read vmm instructions in previous loop have + // WAR(write after read) relationship. + // CPU can identify this scenario and assign another physical vector register(register renameing) in next + // loop to eliminate RAW. 
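A minimal scalar sketch of the contract the comment above describes (the helper name and buffer layout are assumptions, not taken from these sources):

#include <cstddef>
#include <cstdint>
#include <cstring>

// Reference model of the documented behaviour: the low `load_size` bytes of the
// destination register image receive the contiguous bytes at `src`, exactly as the
// per-byte vpinsrb loop quoted in the comment would produce. Lanes past `load_size`
// are outside the stated contract, so they are left untouched here.
inline void load_bytes_reference(uint8_t* vmm_image, const uint8_t* src, size_t load_size) {
    std::memcpy(vmm_image, src, load_size);
}

// For instance, a 7-byte tail (case 7 in the switch below: uni_vmovss, then uni_vpinsrw,
// then uni_vpinsrb) leaves the same low 7 bytes as load_bytes_reference(image, src, 7);
// the strategies above only change how the JIT gets there, not the observable result.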
if (!one_of(bytes_to_load, 0, 1, 2, 3, 4, 8, 16)) { h->uni_vpxor(vmm, vmm, vmm); } @@ -266,121 +294,136 @@ void jit_load_emitter::load_bytes(const Vmm &vmm, const Xbyak::Reg64 ®, int o h->uni_vmovdqu(xmm, addr(start_bytes)); switch (bytes_to_load) { - case 0: break; - case 1: - h->movzx(Reg32(aux_gpr_idxs[0]), addr(start_bytes)); - h->uni_vmovq(xmm, Reg64(aux_gpr_idxs[0])); - break; - case 2: - h->movzx(Reg32(aux_gpr_idxs[0]), word_addr(start_bytes)); - h->uni_vmovq(xmm, Reg64(aux_gpr_idxs[0])); - break; - case 3: - h->movzx(Reg32(aux_gpr_idxs[0]), addr(start_bytes + 2)); - h->shl(Reg32(aux_gpr_idxs[0]), 16); - h->mov(Reg16(aux_gpr_idxs[0]), word_addr(start_bytes)); - h->uni_vmovq(xmm, Reg64(aux_gpr_idxs[0])); - break; - case 4: h->uni_vmovss(xmm, addr(start_bytes)); break; - case 5: - h->uni_vmovss(xmm, addr(start_bytes)); - h->uni_vpinsrb(xmm, xmm, addr(start_bytes + 4), 4); - break; - case 6: - h->uni_vmovss(xmm, addr(start_bytes)); - h->uni_vpinsrw(xmm, xmm, addr(start_bytes + 4), 2); - break; - case 7: - h->uni_vmovss(xmm, addr(start_bytes)); - h->uni_vpinsrw(xmm, xmm, addr(start_bytes + 4), 2); - h->uni_vpinsrb(xmm, xmm, addr(start_bytes + 6), 6); - break; - case 8: break; - case 9: h->uni_vpinsrb(xmm, xmm, addr(start_bytes + 8), 8); break; - case 10: h->uni_vpinsrw(xmm, xmm, addr(start_bytes + 8), 4); break; - case 11: - h->uni_vpinsrw(xmm, xmm, addr(start_bytes + 8), 4); - h->uni_vpinsrb(xmm, xmm, addr(start_bytes + 10), 10); - break; - case 12: h->uni_vpinsrd(xmm, xmm, addr(start_bytes + 8), 2); break; - case 13: - h->uni_vpinsrd(xmm, xmm, addr(start_bytes + 8), 2); - h->uni_vpinsrb(xmm, xmm, addr(start_bytes + 12), 12); - break; - case 14: - h->uni_vpinsrd(xmm, xmm, addr(start_bytes + 8), 2); - h->uni_vpinsrw(xmm, xmm, addr(start_bytes + 12), 6); - break; - case 15: - h->uni_vpinsrd(xmm, xmm, addr(start_bytes + 8), 2); - h->uni_vpinsrw(xmm, xmm, addr(start_bytes + 12), 6); - h->uni_vpinsrb(xmm, xmm, addr(start_bytes + 14), 14); - break; - case 16: break; - default: - OV_CPU_JIT_EMITTER_THROW("has unexpected number of values to load in load_byte."); + case 0: + break; + case 1: + h->movzx(Reg32(aux_gpr_idxs[0]), addr(start_bytes)); + h->uni_vmovq(xmm, Reg64(aux_gpr_idxs[0])); + break; + case 2: + h->movzx(Reg32(aux_gpr_idxs[0]), word_addr(start_bytes)); + h->uni_vmovq(xmm, Reg64(aux_gpr_idxs[0])); + break; + case 3: + h->movzx(Reg32(aux_gpr_idxs[0]), addr(start_bytes + 2)); + h->shl(Reg32(aux_gpr_idxs[0]), 16); + h->mov(Reg16(aux_gpr_idxs[0]), word_addr(start_bytes)); + h->uni_vmovq(xmm, Reg64(aux_gpr_idxs[0])); + break; + case 4: + h->uni_vmovss(xmm, addr(start_bytes)); + break; + case 5: + h->uni_vmovss(xmm, addr(start_bytes)); + h->uni_vpinsrb(xmm, xmm, addr(start_bytes + 4), 4); + break; + case 6: + h->uni_vmovss(xmm, addr(start_bytes)); + h->uni_vpinsrw(xmm, xmm, addr(start_bytes + 4), 2); + break; + case 7: + h->uni_vmovss(xmm, addr(start_bytes)); + h->uni_vpinsrw(xmm, xmm, addr(start_bytes + 4), 2); + h->uni_vpinsrb(xmm, xmm, addr(start_bytes + 6), 6); + break; + case 8: + break; + case 9: + h->uni_vpinsrb(xmm, xmm, addr(start_bytes + 8), 8); + break; + case 10: + h->uni_vpinsrw(xmm, xmm, addr(start_bytes + 8), 4); + break; + case 11: + h->uni_vpinsrw(xmm, xmm, addr(start_bytes + 8), 4); + h->uni_vpinsrb(xmm, xmm, addr(start_bytes + 10), 10); + break; + case 12: + h->uni_vpinsrd(xmm, xmm, addr(start_bytes + 8), 2); + break; + case 13: + h->uni_vpinsrd(xmm, xmm, addr(start_bytes + 8), 2); + h->uni_vpinsrb(xmm, xmm, addr(start_bytes + 12), 12); + break; + case 14: + 
h->uni_vpinsrd(xmm, xmm, addr(start_bytes + 8), 2); + h->uni_vpinsrw(xmm, xmm, addr(start_bytes + 12), 6); + break; + case 15: + h->uni_vpinsrd(xmm, xmm, addr(start_bytes + 8), 2); + h->uni_vpinsrw(xmm, xmm, addr(start_bytes + 12), 6); + h->uni_vpinsrb(xmm, xmm, addr(start_bytes + 14), 14); + break; + case 16: + break; + default: + OV_CPU_JIT_EMITTER_THROW("has unexpected number of values to load in load_byte."); } if (has_xmm_block) { - h->vinsertf128(ymm, ymm, xmm, 1); // insert to upper bits of ymm + h->vinsertf128(ymm, ymm, xmm, 1); // insert to upper bits of ymm if (has_ymm_block) - h->vinsertf128(ymm, ymm, addr(32), 0); // insert to lower bits of ymm + h->vinsertf128(ymm, ymm, addr(32), 0); // insert to lower bits of ymm else - h->vinsertf128(ymm, ymm, addr(0), 0); // insert to lower bits of ymm + h->vinsertf128(ymm, ymm, addr(0), 0); // insert to lower bits of ymm } if (has_ymm_block) { - h->vinsertf64x4(zmm, zmm, ymm, 1); // insert to upper bits of zmm - h->vinsertf64x4(zmm, zmm, addr(0), 0); // insert to lower bits of zmm + h->vinsertf64x4(zmm, zmm, ymm, 1); // insert to upper bits of zmm + h->vinsertf64x4(zmm, zmm, addr(0), 0); // insert to lower bits of zmm } }; switch (load_size) { - case 64: - h->uni_vmovdqu(zmm, addr(0)); - break; - case 32: - h->uni_vmovdqu(ymm, addr(0)); - break; - case 16: - h->uni_vmovdqu(xmm, addr(0)); - break; - default: { - if (mayiuse(cpu::x64::avx512_core) && load_size > threshold_for_mask_emu_load) { - uint64_t mask = 1; - mask = (mask << load_size) - mask; - h->mov(Reg64(aux_gpr_idxs[0]), mask); - h->kmovq(k_mask, Reg64(aux_gpr_idxs[0])); - h->vmovdqu8(zmm | k_mask | T_z, addr(0)); - } else { - load_byte_base(); - } - break; + case 64: + h->uni_vmovdqu(zmm, addr(0)); + break; + case 32: + h->uni_vmovdqu(ymm, addr(0)); + break; + case 16: + h->uni_vmovdqu(xmm, addr(0)); + break; + default: { + if (mayiuse(cpu::x64::avx512_core) && load_size > threshold_for_mask_emu_load) { + uint64_t mask = 1; + mask = (mask << load_size) - mask; + h->mov(Reg64(aux_gpr_idxs[0]), mask); + h->kmovq(k_mask, Reg64(aux_gpr_idxs[0])); + h->vmovdqu8(zmm | k_mask | T_z, addr(0)); + } else { + load_byte_base(); } + break; + } } } /** -* load_bytes_to_dword_extension is the utility function to facilitate -* loading of load_size (0 <= load_size <= 16) many contiguous bytes in -* the xmm register from the memory referenced by ptr[reg + offset] -* address and then do signed/zero extension of those to double words. -* -* Functionally, invocation of load_bytes_to_dword_extension is equivalent -* to the following: -* -* for (int idx = 0; idx < load_size; ++idx) -* vpinsrb(vmm, vmm, ptr[reg + offset + idx], idx); -* if (is_signed) vpmovsxbd(vmm, vmm); else vpmovzxbd(vmm, vmm); -* -* Valid values for the load_size variable are: -* [0..4] for XMM version of the function, i.e. 4 bytes -> 4 * 32 bit == 128 bit -* [0..8] for YMM version of the function. i.e. 8 bytes -> 8 * 32 bit == 256 bit -* [0..16] for ZMM version of the function. i.e. 16 bytes -> 16 * 32 bit == 512 bit -*/ + * load_bytes_to_dword_extension is the utility function to facilitate + * loading of load_size (0 <= load_size <= 16) many contiguous bytes in + * the xmm register from the memory referenced by ptr[reg + offset] + * address and then do signed/zero extension of those to double words. 
+ * + * Functionally, invocation of load_bytes_to_dword_extension is equivalent + * to the following: + * + * for (int idx = 0; idx < load_size; ++idx) + * vpinsrb(vmm, vmm, ptr[reg + offset + idx], idx); + * if (is_signed) vpmovsxbd(vmm, vmm); else vpmovzxbd(vmm, vmm); + * + * Valid values for the load_size variable are: + * [0..4] for XMM version of the function, i.e. 4 bytes -> 4 * 32 bit == 128 bit + * [0..8] for YMM version of the function. i.e. 8 bytes -> 8 * 32 bit == 256 bit + * [0..16] for ZMM version of the function. i.e. 16 bytes -> 16 * 32 bit == 512 bit + */ template -void jit_load_emitter::load_bytes_to_dword_extension(const Vmm &vmm, const Xbyak::Reg64 ®, int offset, bool is_signed, int load_size) const { +void jit_load_emitter::load_bytes_to_dword_extension(const Vmm& vmm, + const Xbyak::Reg64& reg, + int offset, + bool is_signed, + int load_size) const { constexpr bool is_xmm = std::is_same::value; constexpr bool is_ymm = std::is_same::value; constexpr bool is_zmm = std::is_same::value; @@ -401,76 +444,80 @@ void jit_load_emitter::load_bytes_to_dword_extension(const Vmm &vmm, const Xbyak // For load_size == 4/8/16, do load/extension in one go switch (load_size) { - case 16: { - // full size of zmm - const auto zmm = Xbyak::Zmm(vmm.getIdx()); - if (is_signed) - h->uni_vpmovsxbd(zmm, ptr[reg + offset]); - else - h->uni_vpmovzxbd(zmm, ptr[reg + offset]); - break; - } - case 8: { - // full size of ymm or ymm_block of zmm - const auto ymm = Xbyak::Ymm(vmm.getIdx()); + case 16: { + // full size of zmm + const auto zmm = Xbyak::Zmm(vmm.getIdx()); + if (is_signed) + h->uni_vpmovsxbd(zmm, ptr[reg + offset]); + else + h->uni_vpmovzxbd(zmm, ptr[reg + offset]); + break; + } + case 8: { + // full size of ymm or ymm_block of zmm + const auto ymm = Xbyak::Ymm(vmm.getIdx()); + if (is_signed) + h->uni_vpmovsxbd(ymm, ptr[reg + offset]); + else + h->uni_vpmovzxbd(ymm, ptr[reg + offset]); + break; + } + case 4: { + // full size of xmm or xmm_block of ymm/zmm + const auto xmm = Xbyak::Xmm(vmm.getIdx()); + if (is_signed) + h->uni_vpmovsxbd(xmm, ptr[reg + offset]); + else + h->uni_vpmovzxbd(xmm, ptr[reg + offset]); + break; + } + default: { + if (is_zmm && load_size > threshold_for_mask_emu_load) { + unsigned int mask = 1; + mask = (mask << load_size) - mask; + h->mov(Reg32(aux_gpr_idxs[0]), mask); + h->kmovw(k_mask, Reg32(aux_gpr_idxs[0])); if (is_signed) - h->uni_vpmovsxbd(ymm, ptr[reg + offset]); + h->uni_vpmovsxbd(vmm | k_mask | T_z, ptr[reg + offset]); else - h->uni_vpmovzxbd(ymm, ptr[reg + offset]); - break; - } - case 4: { - // full size of xmm or xmm_block of ymm/zmm + h->uni_vpmovzxbd(vmm | k_mask | T_z, ptr[reg + offset]); + } else { const auto xmm = Xbyak::Xmm(vmm.getIdx()); + load_bytes(xmm, reg, offset, load_size); if (is_signed) - h->uni_vpmovsxbd(xmm, ptr[reg + offset]); + h->uni_vpmovsxbd(vmm, xmm); else - h->uni_vpmovzxbd(xmm, ptr[reg + offset]); - break; - } - default: { - if (is_zmm && load_size > threshold_for_mask_emu_load) { - unsigned int mask = 1; - mask = (mask << load_size) - mask; - h->mov(Reg32(aux_gpr_idxs[0]), mask); - h->kmovw(k_mask, Reg32(aux_gpr_idxs[0])); - if (is_signed) - h->uni_vpmovsxbd(vmm | k_mask | T_z, ptr[reg + offset]); - else - h->uni_vpmovzxbd(vmm | k_mask | T_z, ptr[reg + offset]); - } else { - const auto xmm = Xbyak::Xmm(vmm.getIdx()); - load_bytes(xmm, reg, offset, load_size); - if (is_signed) - h->uni_vpmovsxbd(vmm, xmm); - else - h->uni_vpmovzxbd(vmm, xmm); - } - break; + h->uni_vpmovzxbd(vmm, xmm); } + break; + } } } /** -* 
load_words_to_dword_extension is the utility function to facilitate -* loading of load_size (0 <= load_size <= 32) byte many contiguous words(num == load_size / 2) -* in the Vmm register from the memory referenced by ptr[reg + offset] -* address and then do signed/zero extension of those to double words. -* -* Functionally, invocation of load_words_to_dword_extension is equivalent -* to the following extended pseudo code: -* -* for (int idx = 0; idx < load_size / 2; ++idx) -* vpinsrw(vmm, vmm, ptr[reg + offset + 2 * idx], idx); -* if (is_signed) vpmovsxwd(vmm, vmm); else vpmovzxwd(vmm, vmm); -* -* Valid values for the load_size variable are: -* [0..8] for XMM version of the function. i.e. 4 words -> 4 * 32 bit == 128 bit -* [0..16] for YMM version of the function. i.e. 8 words -> 8 * 32 bit == 256 bit -* [0.. 32] for ZMM version of the function. i.e. 16 words -> 16 * 32 bit == 512 bit -*/ + * load_words_to_dword_extension is the utility function to facilitate + * loading of load_size (0 <= load_size <= 32) byte many contiguous words(num == load_size / 2) + * in the Vmm register from the memory referenced by ptr[reg + offset] + * address and then do signed/zero extension of those to double words. + * + * Functionally, invocation of load_words_to_dword_extension is equivalent + * to the following extended pseudo code: + * + * for (int idx = 0; idx < load_size / 2; ++idx) + * vpinsrw(vmm, vmm, ptr[reg + offset + 2 * idx], idx); + * if (is_signed) vpmovsxwd(vmm, vmm); else vpmovzxwd(vmm, vmm); + * + * Valid values for the load_size variable are: + * [0..8] for XMM version of the function. i.e. 4 words -> 4 * 32 bit == 128 bit + * [0..16] for YMM version of the function. i.e. 8 words -> 8 * 32 bit == 256 bit + * [0.. 32] for ZMM version of the function. i.e. 
16 words -> 16 * 32 bit == 512 bit + */ template -void jit_load_emitter::load_words_to_dword_extension(const Vmm &vmm, const Xbyak::Reg64 ®, int offset, ov::element::Type prc, int load_size) const { +void jit_load_emitter::load_words_to_dword_extension(const Vmm& vmm, + const Xbyak::Reg64& reg, + int offset, + ov::element::Type prc, + int load_size) const { constexpr bool is_xmm = std::is_same::value; constexpr bool is_ymm = std::is_same::value; constexpr bool is_zmm = std::is_same::value; @@ -503,87 +550,87 @@ void jit_load_emitter::load_words_to_dword_extension(const Vmm &vmm, const Xbyak // For load_size == 32/16/8, do load/extension in one go // including xmm/ymm tail block for ymm/zmm, so explicite xmm/ymm/zmm switch (load_size) { - case 32: { - if (is_bf16) { + case 32: { + if (is_bf16) { + h->uni_vpmovzxwd(zmm, ptr[reg + offset]); + h->uni_vpslld(zmm, zmm, 16); + } else if (is_f16) { + h->vcvtph2ps(zmm, ptr[reg + offset]); + } else { + if (is_signed) + h->uni_vpmovsxwd(zmm, ptr[reg + offset]); + else h->uni_vpmovzxwd(zmm, ptr[reg + offset]); - h->uni_vpslld(zmm, zmm, 16); - } else if (is_f16) { - h->vcvtph2ps(zmm, ptr[reg + offset]); - } else { - if (is_signed) - h->uni_vpmovsxwd(zmm, ptr[reg + offset]); - else - h->uni_vpmovzxwd(zmm, ptr[reg + offset]); - } - break; } - case 16: { - if (is_bf16) { + break; + } + case 16: { + if (is_bf16) { + h->uni_vpmovzxwd(ymm, ptr[reg + offset]); + h->uni_vpslld(ymm, ymm, 16); + } else if (is_f16) { + h->vcvtph2ps(ymm, ptr[reg + offset]); + } else { + if (is_signed) + h->uni_vpmovsxwd(ymm, ptr[reg + offset]); + else h->uni_vpmovzxwd(ymm, ptr[reg + offset]); - h->uni_vpslld(ymm, ymm, 16); + } + break; + } + case 8: { + if (is_bf16) { + h->uni_vpmovzxwd(xmm, ptr[reg + offset]); + h->uni_vpslld(xmm, xmm, 16); + } else if (is_f16) { + h->vcvtph2ps(xmm, ptr[reg + offset]); + } else { + if (is_signed) + h->uni_vpmovsxwd(xmm, ptr[reg + offset]); + else + h->uni_vpmovzxwd(xmm, ptr[reg + offset]); + } + break; + } + default: { + if (is_zmm && load_size > threshold_for_mask_emu_load) { + unsigned int mask = 1; + mask = (mask << (load_size / 2)) - mask; + h->mov(Reg32(aux_gpr_idxs[0]), mask); + h->kmovw(k_mask, Reg32(aux_gpr_idxs[0])); + if (is_bf16) { + h->uni_vpmovzxwd(vmm | k_mask | T_z, ptr[reg + offset]); + h->uni_vpslld(vmm, vmm, 16); } else if (is_f16) { - h->vcvtph2ps(ymm, ptr[reg + offset]); + h->vcvtph2ps(vmm | k_mask | T_z, ptr[reg + offset]); } else { if (is_signed) - h->uni_vpmovsxwd(ymm, ptr[reg + offset]); + h->uni_vpmovsxwd(vmm | k_mask | T_z, ptr[reg + offset]); else - h->uni_vpmovzxwd(ymm, ptr[reg + offset]); + h->uni_vpmovzxwd(vmm | k_mask | T_z, ptr[reg + offset]); } - break; - } - case 8: { + } else { + // xmm or ymm version + load_bytes(xmm, reg, offset, load_size); if (is_bf16) { - h->uni_vpmovzxwd(xmm, ptr[reg + offset]); - h->uni_vpslld(xmm, xmm, 16); + h->uni_vpmovzxwd(vmm, xmm); + h->uni_vpslld(vmm, vmm, 16); } else if (is_f16) { - h->vcvtph2ps(xmm, ptr[reg + offset]); + h->vcvtph2ps(ymm, xmm); } else { if (is_signed) - h->uni_vpmovsxwd(xmm, ptr[reg + offset]); + h->uni_vpmovsxwd(vmm, xmm); else - h->uni_vpmovzxwd(xmm, ptr[reg + offset]); - } - break; - } - default: { - if (is_zmm && load_size > threshold_for_mask_emu_load) { - unsigned int mask = 1; - mask = (mask << (load_size / 2)) - mask; - h->mov(Reg32(aux_gpr_idxs[0]), mask); - h->kmovw(k_mask, Reg32(aux_gpr_idxs[0])); - if (is_bf16) { - h->uni_vpmovzxwd(vmm | k_mask | T_z, ptr[reg + offset]); - h->uni_vpslld(vmm, vmm, 16); - } else if (is_f16) { - h->vcvtph2ps(vmm | k_mask 
| T_z, ptr[reg + offset]); - } else { - if (is_signed) - h->uni_vpmovsxwd(vmm | k_mask | T_z, ptr[reg + offset]); - else - h->uni_vpmovzxwd(vmm | k_mask | T_z, ptr[reg + offset]); - } - } else { - // xmm or ymm version - load_bytes(xmm, reg, offset, load_size); - if (is_bf16) { h->uni_vpmovzxwd(vmm, xmm); - h->uni_vpslld(vmm, vmm, 16); - } else if (is_f16) { - h->vcvtph2ps(ymm, xmm); - } else { - if (is_signed) - h->uni_vpmovsxwd(vmm, xmm); - else - h->uni_vpmovzxwd(vmm, xmm); - } } - break; } + break; + } } } template -void jit_load_emitter::fill_with_default(const Vmm &vmm, std::string fill_value, const int &load_num) const { +void jit_load_emitter::fill_with_default(const Vmm& vmm, std::string fill_value, const int& load_num) const { constexpr bool is_xmm = std::is_same::value; constexpr bool is_ymm = std::is_same::value; constexpr bool is_zmm = std::is_same::value; @@ -614,10 +661,20 @@ void jit_load_emitter::register_table_entries() { } /// STORE /// -jit_store_emitter::jit_store_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, - ov::element::Type src_prc, ov::element::Type dst_prc, int store_num, arithmetic_mode mode, ov::element::Type exec_prc, +jit_store_emitter::jit_store_emitter(dnnl::impl::cpu::x64::jit_generator* host, + dnnl::impl::cpu::x64::cpu_isa_t host_isa, + ov::element::Type src_prc, + ov::element::Type dst_prc, + int store_num, + arithmetic_mode mode, + ov::element::Type exec_prc, emitter_in_out_map in_out_type) - : jit_emitter(host, host_isa, exec_prc, in_out_type), name_("unknown"), store_num_(store_num), src_prc_(src_prc), dst_prc_(dst_prc), mode_(mode) { + : jit_emitter(host, host_isa, exec_prc, in_out_type), + name_("unknown"), + store_num_(store_num), + src_prc_(src_prc), + dst_prc_(dst_prc), + mode_(mode) { prepare_table(); v_len_elt_ = get_vec_length() / exec_prc.size(); store_size_ = store_num * dst_prc.size(); @@ -630,17 +687,20 @@ inline bool jit_store_emitter::is_saturation() const { // case for SSE and AVX2 when we should use AND to truncate values inline bool jit_store_emitter::is_truncation_emulation() const { - return !mayiuse(cpu::x64::avx512_core) && !is_saturation() && - src_prc_ != dst_prc_ && one_of(dst_prc_, ov::element::u16, ov::element::i16, ov::element::u8, ov::element::i8); + return !mayiuse(cpu::x64::avx512_core) && !is_saturation() && src_prc_ != dst_prc_ && + one_of(dst_prc_, ov::element::u16, ov::element::i16, ov::element::u8, ov::element::i8); } size_t jit_store_emitter::aux_gprs_count() const { // for temp reg for store(mask version or special number cases) - const auto is_pure_store = (src_prc_ == dst_prc_) || - (one_of(src_prc_, ov::element::f32, ov::element::i32) && - one_of(dst_prc_, ov::element::f32, ov::element::i32)); + const auto is_pure_store = (src_prc_ == dst_prc_) || (one_of(src_prc_, ov::element::f32, ov::element::i32) && + one_of(dst_prc_, ov::element::f32, ov::element::i32)); const auto is_store_as_real16 = one_of(dst_prc_, ov::element::bf16, ov::element::f16); - int count = get_aux_regs_as_temp(store_num_, static_cast(dst_prc_.size()), is_pure_store, is_store_as_real16, threshold_for_mask_emu_store); + int count = get_aux_regs_as_temp(store_num_, + static_cast(dst_prc_.size()), + is_pure_store, + is_store_as_real16, + threshold_for_mask_emu_store); // for table value in truncation arithmetic mode if (is_truncation_emulation()) @@ -661,14 +721,17 @@ size_t jit_store_emitter::aux_vecs_count() const { if ((host_isa_ == cpu::x64::sse41) && (src_prc_ == ov::element::f32 && dst_prc_ == 
ov::element::bf16)) count++; - // zero value, zeroed and passed from caller from performance standpoint(zeroed one time and not need preserve and restore status) + // zero value, zeroed and passed from caller from performance standpoint(zeroed one time and not need preserve and + // restore status) if (mayiuse(cpu::x64::avx512_core) && one_of(dst_prc_, ov::element::u8, ov::element::u16)) count++; return count; } -size_t jit_store_emitter::get_inputs_num() const { return 1; } +size_t jit_store_emitter::get_inputs_num() const { + return 1; +} void jit_store_emitter::emit_data() const { jit_emitter::emit_data(); @@ -676,7 +739,7 @@ void jit_store_emitter::emit_data() const { uni_vcvtneps2bf16_->emit_data(); } -void jit_store_emitter::emit_impl(const std::vector &in_idxs, const std::vector &out_idxs) const { +void jit_store_emitter::emit_impl(const std::vector& in_idxs, const std::vector& out_idxs) const { // offset in store emitter is the offset of dst gpr register, should be parsed from out_idxs. const int offset = out_idxs.size() == 2 ? out_idxs[1] : 0; if (host_isa_ == cpu::x64::sse41) { @@ -691,7 +754,7 @@ void jit_store_emitter::emit_impl(const std::vector &in_idxs, const std: } template -void jit_store_emitter::emit_isa(const int in_vec_idx, const Xbyak::Reg64 ®_dst, const int offset) const { +void jit_store_emitter::emit_isa(const int in_vec_idx, const Xbyak::Reg64& reg_dst, const int offset) const { bool matched_prc = (src_prc_ == dst_prc_) || (src_prc_ == ov::element::f32) || (src_prc_ == ov::element::i32); if (!matched_prc) { OV_CPU_JIT_EMITTER_THROW("only support input precision of FP32 or I32 or the same precision as output."); @@ -707,29 +770,29 @@ void jit_store_emitter::emit_isa(const int in_vec_idx, const Xbyak::Reg64 ®_d data_idx = in_vec_idx; data_reg_updated = false; if (!aux_vec_idxs.empty()) - aux_src_idx = aux_vec_idxs.back(); // to avoid src pollution + aux_src_idx = aux_vec_idxs.back(); // to avoid src pollution if (src_prc_ != dst_prc_) { switch (src_prc_) { - case ov::element::f32: - if (!dst_prc_.is_real()) { - if (is_saturation()) { - h->uni_vcvtps2dq(Vmm(aux_src_idx), Vmm(data_idx)); - } else { - h->uni_vcvttps2dq(Vmm(aux_src_idx), Vmm(data_idx)); - } - data_idx = aux_src_idx; - data_reg_updated = true; - } - break; - case ov::element::i32: - if (dst_prc_.is_real()) { - h->uni_vcvtdq2ps(Vmm(aux_src_idx), Vmm(data_idx)); - data_idx = aux_src_idx; - data_reg_updated = true; + case ov::element::f32: + if (!dst_prc_.is_real()) { + if (is_saturation()) { + h->uni_vcvtps2dq(Vmm(aux_src_idx), Vmm(data_idx)); + } else { + h->uni_vcvttps2dq(Vmm(aux_src_idx), Vmm(data_idx)); } - break; - default: - break; + data_idx = aux_src_idx; + data_reg_updated = true; + } + break; + case ov::element::i32: + if (dst_prc_.is_real()) { + h->uni_vcvtdq2ps(Vmm(aux_src_idx), Vmm(data_idx)); + data_idx = aux_src_idx; + data_reg_updated = true; + } + break; + default: + break; } } @@ -737,44 +800,44 @@ void jit_store_emitter::emit_isa(const int in_vec_idx, const Xbyak::Reg64 ®_d store_bytes(reg_dst, offset, store_size_); } else { switch (dst_prc_) { - case ov::element::f32: - case ov::element::i32: - store_bytes(reg_dst, offset, store_size_); - break; - case ov::element::i8: - store_dword_to_byte_extension(reg_dst, offset, true, store_num_); - break; - case ov::element::u8: - store_dword_to_byte_extension(reg_dst, offset, false, store_num_); - break; - case ov::element::i16: - case ov::element::u16: - case ov::element::bf16: - case ov::element::f16: - store_dword_to_word_extension(reg_dst, 
offset, dst_prc_, store_num_); - break; - default: - OV_CPU_JIT_EMITTER_THROW("has unsupported dst precision to store."); + case ov::element::f32: + case ov::element::i32: + store_bytes(reg_dst, offset, store_size_); + break; + case ov::element::i8: + store_dword_to_byte_extension(reg_dst, offset, true, store_num_); + break; + case ov::element::u8: + store_dword_to_byte_extension(reg_dst, offset, false, store_num_); + break; + case ov::element::i16: + case ov::element::u16: + case ov::element::bf16: + case ov::element::f16: + store_dword_to_word_extension(reg_dst, offset, dst_prc_, store_num_); + break; + default: + OV_CPU_JIT_EMITTER_THROW("has unsupported dst precision to store."); } } } /** -* store_bytes is the utility function to facilitate storing of -* store_size (0 <= store_size <= 64) many contiguous bytes from the Xmm/Ymm/Zmm -* register into the memory referenced by ptr[reg + offset] address. -* -* Additionally, when store_size > 16, the input Ymm register will not be -* preserved due to the usage of vextracti128 instruction. -* -* Functionally, invocation of store_bytes is equivalent -* to the following loop: -* -* for (int idx = 0; idx < store_size; ++idx) -* vpextrb(ptr[reg + offset + idx], vmm, idx); -* -*/ + * store_bytes is the utility function to facilitate storing of + * store_size (0 <= store_size <= 64) many contiguous bytes from the Xmm/Ymm/Zmm + * register into the memory referenced by ptr[reg + offset] address. + * + * Additionally, when store_size > 16, the input Ymm register will not be + * preserved due to the usage of vextracti128 instruction. + * + * Functionally, invocation of store_bytes is equivalent + * to the following loop: + * + * for (int idx = 0; idx < store_size; ++idx) + * vpextrb(ptr[reg + offset + idx], vmm, idx); + * + */ template -void jit_store_emitter::store_bytes(const Xbyak::Reg64 ®, int offset, int store_size) const { +void jit_store_emitter::store_bytes(const Xbyak::Reg64& reg, int offset, int store_size) const { constexpr bool is_xmm = std::is_same::value; constexpr bool is_ymm = std::is_same::value; constexpr bool is_zmm = std::is_same::value; @@ -805,7 +868,7 @@ void jit_store_emitter::store_bytes(const Xbyak::Reg64 ®, int offset, int sto int bytes_to_store = store_size; if (store_size > 32) { - h->uni_vmovdqu(addr(0), ymm); // store lower bits from zmm + h->uni_vmovdqu(addr(0), ymm); // store lower bits from zmm start_bytes += 32; bytes_to_store -= 32; // load upper bits from zmm into ymm @@ -813,7 +876,7 @@ void jit_store_emitter::store_bytes(const Xbyak::Reg64 ®, int offset, int sto } if (bytes_to_store > 16) { - h->uni_vmovdqu(addr(start_bytes), xmm); // store lower bits from ymm + h->uni_vmovdqu(addr(start_bytes), xmm); // store lower bits from ymm start_bytes += 16; bytes_to_store -= 16; // load upper bits from ymm into xmm @@ -834,93 +897,108 @@ void jit_store_emitter::store_bytes(const Xbyak::Reg64 ®, int offset, int sto h->mov(addr(start_bytes + bytes_offset), Reg8(gpr_idx, ext8bit)); }; switch (bytes_to_store) { - case 0: break; - case 1: - h->uni_vmovq(Reg64(aux_gpr_idxs[0]), xmm); - store_one_byte(0, aux_gpr_idxs[0]); - break; - case 2: - h->uni_vmovq(Reg64(aux_gpr_idxs[0]), xmm); - h->mov(addr(start_bytes), Reg16(aux_gpr_idxs[0])); - break; - case 3: - h->uni_vmovq(Reg64(aux_gpr_idxs[0]), xmm); - h->mov(addr(start_bytes), Reg16(aux_gpr_idxs[0])); - h->shr(Reg64(aux_gpr_idxs[0]), 16); - store_one_byte(2, aux_gpr_idxs[0]); - break; - case 4: h->uni_vmovss(addr(start_bytes), xmm); break; - case 5: - 
h->uni_vmovss(addr(start_bytes), xmm); - h->uni_vpextrb(addr(start_bytes + 4), xmm, 4); - break; - case 6: - h->uni_vmovss(addr(start_bytes), xmm); - h->uni_vpextrw(addr(start_bytes + 4), xmm, 2); - break; - case 7: - h->uni_vmovss(addr(start_bytes), xmm); - h->uni_vpextrw(addr(start_bytes + 4), xmm, 2); - h->uni_vpextrb(addr(start_bytes + 6), xmm, 6); - break; - case 8: break; - case 9: h->uni_vpextrb(addr(start_bytes + 8), xmm, 8); break; - case 10: h->uni_vpextrw(addr(start_bytes + 8), xmm, 4); break; - case 11: - h->uni_vpextrw(addr(start_bytes + 8), xmm, 4); - h->uni_vpextrb(addr(start_bytes + 10), xmm, 10); - break; - case 12: h->uni_vpextrd(addr(start_bytes + 8), xmm, 2); break; - case 13: - h->uni_vpextrd(addr(start_bytes + 8), xmm, 2); - h->uni_vpextrb(addr(start_bytes + 12), xmm, 12); - break; - case 14: - h->uni_vpextrd(addr(start_bytes + 8), xmm, 2); - h->uni_vpextrw(addr(start_bytes + 12), xmm, 6); - break; - case 15: - h->uni_vpextrd(addr(start_bytes + 8), xmm, 2); - h->uni_vpextrw(addr(start_bytes + 12), xmm, 6); - h->uni_vpextrb(addr(start_bytes + 14), xmm, 14); - break; - case 16: break; - default: - OV_CPU_JIT_EMITTER_THROW("has unexpected number of values to store in store_bytes."); - } - }; - - switch (store_size) { - case 64: - h->uni_vmovdqu(addr(0), zmm); + case 0: + break; + case 1: + h->uni_vmovq(Reg64(aux_gpr_idxs[0]), xmm); + store_one_byte(0, aux_gpr_idxs[0]); + break; + case 2: + h->uni_vmovq(Reg64(aux_gpr_idxs[0]), xmm); + h->mov(addr(start_bytes), Reg16(aux_gpr_idxs[0])); + break; + case 3: + h->uni_vmovq(Reg64(aux_gpr_idxs[0]), xmm); + h->mov(addr(start_bytes), Reg16(aux_gpr_idxs[0])); + h->shr(Reg64(aux_gpr_idxs[0]), 16); + store_one_byte(2, aux_gpr_idxs[0]); + break; + case 4: + h->uni_vmovss(addr(start_bytes), xmm); + break; + case 5: + h->uni_vmovss(addr(start_bytes), xmm); + h->uni_vpextrb(addr(start_bytes + 4), xmm, 4); + break; + case 6: + h->uni_vmovss(addr(start_bytes), xmm); + h->uni_vpextrw(addr(start_bytes + 4), xmm, 2); + break; + case 7: + h->uni_vmovss(addr(start_bytes), xmm); + h->uni_vpextrw(addr(start_bytes + 4), xmm, 2); + h->uni_vpextrb(addr(start_bytes + 6), xmm, 6); + break; + case 8: break; - case 32: - h->uni_vmovdqu(addr(0), ymm); + case 9: + h->uni_vpextrb(addr(start_bytes + 8), xmm, 8); + break; + case 10: + h->uni_vpextrw(addr(start_bytes + 8), xmm, 4); + break; + case 11: + h->uni_vpextrw(addr(start_bytes + 8), xmm, 4); + h->uni_vpextrb(addr(start_bytes + 10), xmm, 10); + break; + case 12: + h->uni_vpextrd(addr(start_bytes + 8), xmm, 2); + break; + case 13: + h->uni_vpextrd(addr(start_bytes + 8), xmm, 2); + h->uni_vpextrb(addr(start_bytes + 12), xmm, 12); + break; + case 14: + h->uni_vpextrd(addr(start_bytes + 8), xmm, 2); + h->uni_vpextrw(addr(start_bytes + 12), xmm, 6); + break; + case 15: + h->uni_vpextrd(addr(start_bytes + 8), xmm, 2); + h->uni_vpextrw(addr(start_bytes + 12), xmm, 6); + h->uni_vpextrb(addr(start_bytes + 14), xmm, 14); break; case 16: - h->uni_vmovdqu(addr(0), xmm); break; default: - if (mayiuse(cpu::x64::avx512_core) && store_size > threshold_for_mask_emu_store) { - uint64_t mask = 1; - mask = (mask << store_size) - mask; - h->mov(Reg64(aux_gpr_idxs[0]), mask); - h->kmovq(k_mask, Reg64(aux_gpr_idxs[0])); - h->vmovdqu8(addr(0), zmm | k_mask); - } else { - store_byte_base(); - } - break; + OV_CPU_JIT_EMITTER_THROW("has unexpected number of values to store in store_bytes."); + } + }; + + switch (store_size) { + case 64: + h->uni_vmovdqu(addr(0), zmm); + break; + case 32: + h->uni_vmovdqu(addr(0), ymm); + break; 
+ case 16: + h->uni_vmovdqu(addr(0), xmm); + break; + default: + if (mayiuse(cpu::x64::avx512_core) && store_size > threshold_for_mask_emu_store) { + uint64_t mask = 1; + mask = (mask << store_size) - mask; + h->mov(Reg64(aux_gpr_idxs[0]), mask); + h->kmovq(k_mask, Reg64(aux_gpr_idxs[0])); + h->vmovdqu8(addr(0), zmm | k_mask); + } else { + store_byte_base(); + } + break; } } /** -* store_dword_to_byte_extension is the utility function to -* 1. convert store_num (0 <= store_num <= 16) dwords in the Xmm/Ymm/Zmm to store_num bytes, singed or unsinged, truncated or saturated. -* 2. store the packed byte into the memory referenced by ptr[reg + offset] address. -*/ + * store_dword_to_byte_extension is the utility function to + * 1. convert store_num (0 <= store_num <= 16) dwords in the Xmm/Ymm/Zmm to store_num bytes, singed or unsinged, + * truncated or saturated. + * 2. store the packed byte into the memory referenced by ptr[reg + offset] address. + */ template -void jit_store_emitter::store_dword_to_byte_extension(const Xbyak::Reg64 ®, int offset, bool is_signed, int store_num) const { +void jit_store_emitter::store_dword_to_byte_extension(const Xbyak::Reg64& reg, + int offset, + bool is_signed, + int store_num) const { constexpr bool is_xmm = std::is_same::value; constexpr bool is_ymm = std::is_same::value; constexpr bool is_zmm = std::is_same::value; @@ -1032,7 +1110,7 @@ void jit_store_emitter::store_dword_to_byte_extension(const Xbyak::Reg64 ®, i break; case 4: if (mayiuse(cpu::x64::avx512_core)) { - if (is_saturation()) { // xmm block on avx512F + VL + if (is_saturation()) { // xmm block on avx512F + VL if (is_signed) { h->vpmovsdb(addr(0), xmm); } else { @@ -1074,13 +1152,16 @@ void jit_store_emitter::store_dword_to_byte_extension(const Xbyak::Reg64 ®, i } /** -* store_dword_to_word_extension is the utility function to -* 1. convert store_num (0 <= store_num <= 16) dwords in the Xmm/Ymm/Zmm to store_num words with singed or unsinged saturation. -* 2. store the packed words into the memory referenced by ptr[reg + offset] address. -*/ + * store_dword_to_word_extension is the utility function to + * 1. convert store_num (0 <= store_num <= 16) dwords in the Xmm/Ymm/Zmm to store_num words with singed or unsinged + * saturation. + * 2. store the packed words into the memory referenced by ptr[reg + offset] address. + */ template -void jit_store_emitter::store_dword_to_word_extension(const Xbyak::Reg64 ®, - int offset, ov::element::Type precision, int store_num) const { +void jit_store_emitter::store_dword_to_word_extension(const Xbyak::Reg64& reg, + int offset, + ov::element::Type precision, + int store_num) const { const bool is_bf16 = (precision == ov::element::bf16); const bool is_f16 = (precision == ov::element::f16); const bool is_signed = precision.is_signed(); @@ -1151,7 +1232,8 @@ void jit_store_emitter::store_dword_to_word_extension(const Xbyak::Reg64 ®, if (is_bf16) { if (mayiuse(cpu::x64::avx512_core)) { - // to avoid src vmm pollution, this check means no precision convert happens, so data_idx is still original_data_idx. + // to avoid src vmm pollution, this check means no precision convert happens, so data_idx is still + // original_data_idx. 
if (src_prc_ == ov::element::f32) { ymm = Ymm(aux_vec_idxs[0]); } @@ -1171,7 +1253,8 @@ void jit_store_emitter::store_dword_to_word_extension(const Xbyak::Reg64 ®, if (host_isa_ == cpu::x64::sse41 && src_prc_ == ov::element::f32) { auto xmm_aux1 = Xmm(aux_vec_idxs[1]); h->uni_vmovups(xmm_aux1, vmm); - uni_vcvtneps2bf16_->emit_code({static_cast(vmm.getIdx())}, {static_cast(vmm.getIdx())}, + uni_vcvtneps2bf16_->emit_code({static_cast(vmm.getIdx())}, + {static_cast(vmm.getIdx())}, {static_cast(xmm.getIdx())}); h->uni_vmovups(xmm, vmm); h->uni_vmovups(vmm, xmm_aux1); // return original data to src vmm @@ -1222,7 +1305,7 @@ void jit_store_emitter::store_dword_to_word_extension(const Xbyak::Reg64 ®, Vmm zero(aux_vec_idxs[0]); h->uni_vpxor(zero, zero, zero); STORE_KEEP_SOURCE(uni_vpmaxsd, vmm, Vmm(aux_src_idx), vmm, zero); - h->vpmovusdw(ptr[reg + offset], vmm); // unsinged int32 saturate to unsigned int16. + h->vpmovusdw(ptr[reg + offset], vmm); // unsinged int32 saturate to unsigned int16. } } else { h->vpmovdw(ptr[reg + offset], vmm); @@ -1261,7 +1344,7 @@ void jit_store_emitter::store_dword_to_word_extension(const Xbyak::Reg64 ®, h->vpmovdw(ptr[reg + offset], xmm); } } else { - store_dword_to_word_base(); + store_dword_to_word_base(); } break; default: @@ -1297,5 +1380,5 @@ void jit_store_emitter::register_table_entries() { } } -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/emitters/plugin/x64/jit_load_store_emitters.hpp b/src/plugins/intel_cpu/src/emitters/plugin/x64/jit_load_store_emitters.hpp index 9570a836aa64ee..2c4e15ccaeb28b 100644 --- a/src/plugins/intel_cpu/src/emitters/plugin/x64/jit_load_store_emitters.hpp +++ b/src/plugins/intel_cpu/src/emitters/plugin/x64/jit_load_store_emitters.hpp @@ -4,16 +4,23 @@ #pragma once -#include "jit_emitter.hpp" #include "jit_bf16_emitters.hpp" +#include "jit_emitter.hpp" namespace ov { namespace intel_cpu { struct load_emitter_params : public emitter_params { - load_emitter_params(ov::element::Type src_prc, ov::element::Type dst_prc, - int load_num, bool is_fill = false, std::string fill_value = "zero"): - src_prc_(src_prc), dst_prc_(dst_prc), load_num_(load_num), is_fill_(is_fill), fill_value_(fill_value) {} + load_emitter_params(ov::element::Type src_prc, + ov::element::Type dst_prc, + int load_num, + bool is_fill = false, + std::string fill_value = "zero") + : src_prc_(src_prc), + dst_prc_(dst_prc), + load_num_(load_num), + is_fill_(is_fill), + fill_value_(fill_value) {} size_t hash() const override; @@ -25,8 +32,10 @@ struct load_emitter_params : public emitter_params { }; struct store_emitter_params : public emitter_params { - store_emitter_params(ov::element::Type src_prc, ov::element::Type dst_prc, int store_num): - src_prc_(src_prc), dst_prc_(dst_prc), store_num_(store_num) {} + store_emitter_params(ov::element::Type src_prc, ov::element::Type dst_prc, int store_num) + : src_prc_(src_prc), + dst_prc_(dst_prc), + store_num_(store_num) {} size_t hash() const override; @@ -36,57 +45,61 @@ struct store_emitter_params : public emitter_params { }; // Arithmetic modes for data type conversion in store_emitter -enum arithmetic_mode { - saturation, - truncation -}; +enum arithmetic_mode { saturation, truncation }; class jit_load_emitter : public jit_emitter { public: - jit_load_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, - ov::element::Type src_prc, ov::element::Type dst_prc, int load_num, + 
jit_load_emitter(dnnl::impl::cpu::x64::jit_generator* host, + dnnl::impl::cpu::x64::cpu_isa_t host_isa, + ov::element::Type src_prc, + ov::element::Type dst_prc, + int load_num, ov::element::Type exec_prc = ov::element::f32, - bool is_fill = false, std::string fill_value = "zero", + bool is_fill = false, + std::string fill_value = "zero", emitter_in_out_map in_out_type = emitter_in_out_map::gpr_to_vec); /** - * load_num values with src_prc precision are loaded from ptr[Reg64(in_idxs[0]) + offset_byte] address to Vmm[out_idxs[0]] as dst_prc, where offset_byte is in_idxs[1] - * is_fill: when load_num can not fully fit in vector register, whether fill_value should be filled as default values. - * fill_value: when load_num can not fully fit in vector register, what values should be filled as default values. - * currently support "zero", "int_one", "float_one", "int32_min", "float_min", "int32_max" and "float_max". - * supported src_prc and dst_prc pairs are as below(x indicate for support): - * FP32 I32 I16 U16 I8 U8 BF16 --> src_prc - * FP32 x x x x x x x - * I32 x x x x x x x - * I16 x - * U16 x - * I8 x - * U8 x - * BF16 x - * | - * \|/ - * dst_prc - */ + * load_num values with src_prc precision are loaded from ptr[Reg64(in_idxs[0]) + offset_byte] address to + * Vmm[out_idxs[0]] as dst_prc, where offset_byte is in_idxs[1] is_fill: when load_num can not fully fit in vector + * register, whether fill_value should be filled as default values. fill_value: when load_num can not fully fit in + * vector register, what values should be filled as default values. currently support "zero", "int_one", + * "float_one", "int32_min", "float_min", "int32_max" and "float_max". supported src_prc and dst_prc pairs are as + * below(x indicate for support): FP32 I32 I16 U16 I8 U8 BF16 --> src_prc FP32 x x x x + * x x x I32 x x x x x x x I16 x U16 x I8 x U8 + * x BF16 x + * | + * \|/ + * dst_prc + */ // offset in load emitter is the offset of src gpr register, should be parsed from in_idxs. 
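For readers unfamiliar with the load contract documented above, the conversion/fill semantics can be modelled by a small standalone reference. This is an illustrative scalar sketch only — the u8→f32 pair, the 8-lane register width, and the loop are assumptions for demonstration; the real jit_load_emitter generates vectorized JIT code and supports the full precision table listed in the comment.

#include <array>
#include <cstdint>
#include <cstdio>

// Scalar reference of the documented u8 -> f32 load: 'load_num' source values are
// widened to the destination precision and the remaining lanes are filled with a
// default value (here "zero"), mirroring the is_fill/fill_value behaviour.
std::array<float, 8> load_u8_to_f32(const uint8_t* src, int load_num, float fill_value = 0.0f) {
    std::array<float, 8> vmm{};  // stands in for one 256-bit register (8 f32 lanes)
    for (int i = 0; i < 8; ++i)
        vmm[i] = (i < load_num) ? static_cast<float>(src[i]) : fill_value;
    return vmm;
}

int main() {
    const uint8_t data[5] = {1, 2, 3, 4, 5};
    for (float v : load_u8_to_f32(data, 5))
        std::printf("%.1f ", v);  // prints: 1.0 2.0 3.0 4.0 5.0 0.0 0.0 0.0
    std::printf("\n");
    return 0;
}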
- void emit_impl(const std::vector &in_idxs, const std::vector &out_idxs) const override; + void emit_impl(const std::vector& in_idxs, const std::vector& out_idxs) const override; size_t get_inputs_num() const override; private: template - void emit_isa(const Xbyak::Reg64 ®_src, const int out_vec_idx, const int offset) const; + void emit_isa(const Xbyak::Reg64& reg_src, const int out_vec_idx, const int offset) const; template - void load_bytes(const Vmm &vmm, const Xbyak::Reg64 ®, int offset, int load_size) const; + void load_bytes(const Vmm& vmm, const Xbyak::Reg64& reg, int offset, int load_size) const; template - void load_bytes_to_dword_extension(const Vmm &vmm, const Xbyak::Reg64 ®, int offset, bool is_signed, int load_size) const; + void load_bytes_to_dword_extension(const Vmm& vmm, + const Xbyak::Reg64& reg, + int offset, + bool is_signed, + int load_size) const; template - void load_words_to_dword_extension(const Vmm &vmm, const Xbyak::Reg64 ®, int offset, ov::element::Type prc, int load_size) const; + void load_words_to_dword_extension(const Vmm& vmm, + const Xbyak::Reg64& reg, + int offset, + ov::element::Type prc, + int load_size) const; template - void fill_with_default(const Vmm &vmm, std::string fill_value, const int &load_num) const; + void fill_with_default(const Vmm& vmm, std::string fill_value, const int& load_num) const; void register_table_entries() override; @@ -104,30 +117,27 @@ class jit_load_emitter : public jit_emitter { class jit_store_emitter : public jit_emitter { public: - jit_store_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, - ov::element::Type src_prc, ov::element::Type dst_prc, int store_num, + jit_store_emitter(dnnl::impl::cpu::x64::jit_generator* host, + dnnl::impl::cpu::x64::cpu_isa_t host_isa, + ov::element::Type src_prc, + ov::element::Type dst_prc, + int store_num, arithmetic_mode mode = arithmetic_mode::saturation, ov::element::Type exec_prc = ov::element::f32, emitter_in_out_map in_out_type = emitter_in_out_map::vec_to_gpr); /** - * store_num values with src_prc in Vmm[in_vec_idx] is stored to ptr[reg_dst + offset_byte] address as dst_prc data, where offset_byte is in_idxs[1] - * supported src_prc and dst_prc pairs are as below(x indicate for support): - * FP32 I32 I16 U16 I8 U8 BF16 --> src_prc - * FP32 x x - * I32 x x - * I16 x x x - * U16 x x x - * I8 x x x - * U8 x x x - * BF16 x* x* x - * \|/ - * dst_prc - * note: FP32/I32-->BF16(x*) is supported only on at least avx512-core plateform - */ + * store_num values with src_prc in Vmm[in_vec_idx] is stored to ptr[reg_dst + offset_byte] address as dst_prc data, + * where offset_byte is in_idxs[1] supported src_prc and dst_prc pairs are as below(x indicate for support): FP32 + * I32 I16 U16 I8 U8 BF16 --> src_prc FP32 x x I32 x x I16 x x x U16 x x + * x I8 x x x U8 x x x BF16 x* x* x + * \|/ + * dst_prc + * note: FP32/I32-->BF16(x*) is supported only on at least avx512-core plateform + */ // offset in store emitter is the offset of dst gpr register, should be parsed from out_idxs. 
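The arithmetic_mode distinction mentioned in the store contract above (saturation vs. truncation when narrowing dwords) can likewise be modelled by a scalar reference. The i32→i8 pair and the element-wise loop are illustrative assumptions; the emitter itself uses packed SIMD conversions (and the AND-based truncation emulation noted earlier for pre-AVX-512 targets).

#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <vector>

// Scalar reference of the documented i32 -> i8 store: each dword is narrowed either by
// saturation (clamp to the i8 range) or by truncation (keep the low byte).
enum class mode { saturation, truncation };

std::vector<int8_t> store_i32_to_i8(const std::vector<int32_t>& src, mode m) {
    std::vector<int8_t> dst(src.size());
    for (size_t i = 0; i < src.size(); ++i) {
        dst[i] = (m == mode::saturation)
                     ? static_cast<int8_t>(std::clamp(src[i], -128, 127))
                     : static_cast<int8_t>(src[i] & 0xFF);  // wraparound, like the AND-based emulation
    }
    return dst;
}

int main() {
    const std::vector<int32_t> src = {300, -500, 7};
    for (int8_t v : store_i32_to_i8(src, mode::saturation)) std::printf("%d ", v);  // 127 -128 7
    std::printf("\n");
    for (int8_t v : store_i32_to_i8(src, mode::truncation)) std::printf("%d ", v);  // 44 12 7
    std::printf("\n");
    return 0;
}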
- void emit_impl(const std::vector &in_idxs, const std::vector &out_idxs) const override; + void emit_impl(const std::vector& in_idxs, const std::vector& out_idxs) const override; size_t get_inputs_num() const override; @@ -139,16 +149,19 @@ class jit_store_emitter : public jit_emitter { private: template - void emit_isa(const int in_vec_idx, const Xbyak::Reg64 ®_dst, const int offset) const; + void emit_isa(const int in_vec_idx, const Xbyak::Reg64& reg_dst, const int offset) const; template - void store_bytes(const Xbyak::Reg64 ®, int offset, int store_size) const; + void store_bytes(const Xbyak::Reg64& reg, int offset, int store_size) const; template - void store_dword_to_byte_extension(const Xbyak::Reg64 ®, int offset, bool is_signed, int store_size) const; + void store_dword_to_byte_extension(const Xbyak::Reg64& reg, int offset, bool is_signed, int store_size) const; template - void store_dword_to_word_extension(const Xbyak::Reg64 ®, int offset, ov::element::Type precision, int store_size) const; + void store_dword_to_word_extension(const Xbyak::Reg64& reg, + int offset, + ov::element::Type precision, + int store_size) const; void register_table_entries() override; @@ -176,5 +189,5 @@ class jit_store_emitter : public jit_emitter { mutable int aux_src_idx = 0; }; -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/emitters/plugin/x64/utils.cpp b/src/plugins/intel_cpu/src/emitters/plugin/x64/utils.cpp index ea16122f2f9793..420e9691ebc73c 100644 --- a/src/plugins/intel_cpu/src/emitters/plugin/x64/utils.cpp +++ b/src/plugins/intel_cpu/src/emitters/plugin/x64/utils.cpp @@ -21,8 +21,21 @@ EmitABIRegSpills::~EmitABIRegSpills() { void EmitABIRegSpills::preamble() { // gprs - Xbyak::Operand gprs_to_save[] = {h->r8, h->r9, h->r10, h->r11, h->r12, h->r13, h->r14, h->r15, - h->rax, h->rbx, h->rcx, h->rdx, h->rdi, h->rsi, h->rbp}; + Xbyak::Operand gprs_to_save[] = {h->r8, + h->r9, + h->r10, + h->r11, + h->r12, + h->r13, + h->r14, + h->r15, + h->rax, + h->rbx, + h->rcx, + h->rdx, + h->rdi, + h->rsi, + h->rbp}; size_t n_gprs_to_save = sizeof(gprs_to_save) / sizeof(gprs_to_save[0]); h->sub(h->rsp, n_gprs_to_save * gpr_size); @@ -75,8 +88,21 @@ void EmitABIRegSpills::postamble() { } // restore gpr registers - Xbyak::Operand gprs_to_save[] = {h->r8, h->r9, h->r10, h->r11, h->r12, h->r13, h->r14, h->r15, - h->rax, h->rbx, h->rcx, h->rdx, h->rdi, h->rsi, h->rbp}; + Xbyak::Operand gprs_to_save[] = {h->r8, + h->r9, + h->r10, + h->r11, + h->r12, + h->r13, + h->r14, + h->r15, + h->rax, + h->rbx, + h->rcx, + h->rdx, + h->rdi, + h->rsi, + h->rbp}; size_t n_gprs_to_save = sizeof(gprs_to_save) / sizeof(gprs_to_save[0]); for (int i = n_gprs_to_save - 1; i >= 0; --i) h->mov(gprs_to_save[i], h->ptr[h->rsp + i * gpr_size]); @@ -113,13 +139,17 @@ void EmitABIRegSpills::rsp_restore() { cpu_isa_t EmitABIRegSpills::get_isa() { // need preserve based on cpu capability, instead of host isa. // in case there are possibilty that different isa emitters exist in one kernel from perf standpoint in the future. - // e.g. other emitters isa is avx512, while this emitter isa is avx2, and internal call is used. Internal call may use avx512 and spoil k-reg, ZMM. - // do not care about platform w/ avx512_common but w/o avx512_core(knight landing), which is obsoleted. - if (mayiuse(avx512_core)) return avx512_core; - if (mayiuse(avx2)) return avx2; - if (mayiuse(sse41)) return sse41; + // e.g. 
other emitters isa is avx512, while this emitter isa is avx2, and internal call is used. Internal call may + // use avx512 and spoil k-reg, ZMM. do not care about platform w/ avx512_common but w/o avx512_core(knight landing), + // which is obsoleted. + if (mayiuse(avx512_core)) + return avx512_core; + if (mayiuse(avx2)) + return avx2; + if (mayiuse(sse41)) + return sse41; OV_CPU_JIT_EMITTER_THROW("unsupported isa"); } -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/emitters/plugin/x64/utils.hpp b/src/plugins/intel_cpu/src/emitters/plugin/x64/utils.hpp index 16a66beba7a536..ba956f3375f054 100644 --- a/src/plugins/intel_cpu/src/emitters/plugin/x64/utils.hpp +++ b/src/plugins/intel_cpu/src/emitters/plugin/x64/utils.hpp @@ -30,11 +30,15 @@ class EmitABIRegSpills { static dnnl::impl::cpu::x64::cpu_isa_t get_isa(); - inline size_t get_max_vecs_count() const { return dnnl::impl::cpu::x64::isa_num_vregs(isa); } - inline size_t get_vec_length() const { return dnnl::impl::cpu::x64::isa_max_vlen(isa); } + inline size_t get_max_vecs_count() const { + return dnnl::impl::cpu::x64::isa_num_vregs(isa); + } + inline size_t get_vec_length() const { + return dnnl::impl::cpu::x64::isa_max_vlen(isa); + } - dnnl::impl::cpu::x64::jit_generator* h {nullptr}; - const dnnl::impl::cpu::x64::cpu_isa_t isa {dnnl::impl::cpu::x64::cpu_isa_t::isa_undef}; + dnnl::impl::cpu::x64::jit_generator* h{nullptr}; + const dnnl::impl::cpu::x64::cpu_isa_t isa{dnnl::impl::cpu::x64::cpu_isa_t::isa_undef}; static constexpr int k_mask_size = 8; static constexpr int k_mask_num = 8; @@ -44,5 +48,5 @@ class EmitABIRegSpills { bool rsp_status = true; }; -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/emitters/snippets/aarch64/cpu_generator.cpp b/src/plugins/intel_cpu/src/emitters/snippets/aarch64/cpu_generator.cpp index a56c2316183643..95698f8ac78bb0 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/aarch64/cpu_generator.cpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/aarch64/cpu_generator.cpp @@ -2,74 +2,76 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "snippets/snippets_isa.hpp" #include "cpu_generator.hpp" -#include "jit_snippets_emitters.hpp" -#include "emitters/utils.hpp" -#include "emitters/snippets/cpu_runtime_configurator.hpp" -#include "emitters/plugin/aarch64/jit_eltwise_emitters.hpp" + #include "emitters/plugin/aarch64/jit_conversion_emitters.hpp" +#include "emitters/plugin/aarch64/jit_eltwise_emitters.hpp" +#include "emitters/snippets/aarch64/jit_fill_emitter.hpp" #include "emitters/snippets/aarch64/jit_kernel_emitter.hpp" #include "emitters/snippets/aarch64/jit_loop_emitters.hpp" #include "emitters/snippets/aarch64/jit_memory_emitters.hpp" -#include "emitters/snippets/aarch64/jit_fill_emitter.hpp" - -#include "transformations/snippets/common/op/fused_mul_add.hpp" -#include "transformations/cpu_opset/common/op/swish_cpu.hpp" - +#include "emitters/snippets/cpu_runtime_configurator.hpp" +#include "emitters/utils.hpp" +#include "jit_snippets_emitters.hpp" #include "openvino/opsets/opset13.hpp" +#include "snippets/snippets_isa.hpp" +#include "transformations/cpu_opset/common/op/swish_cpu.hpp" +#include "transformations/snippets/common/op/fused_mul_add.hpp" namespace ov { -#define CREATE_SNIPPETS_EMITTER(e_type) { \ - [this](const snippets::lowered::ExpressionPtr& expr) -> std::shared_ptr { \ - return std::make_shared(h.get(), isa, expr); \ - }, \ - 
[](const std::shared_ptr& n) -> std::set> { \ - return e_type::get_supported_precisions(n); \ - } \ -} +#define CREATE_SNIPPETS_EMITTER(e_type) \ + { \ + [this](const snippets::lowered::ExpressionPtr& expr) -> std::shared_ptr { \ + return std::make_shared(h.get(), isa, expr); \ + }, \ + [](const std::shared_ptr& n) -> std::set> { \ + return e_type::get_supported_precisions(n); \ + } \ + } -#define CREATE_CPU_EMITTER(e_type) { \ - [this](const snippets::lowered::ExpressionPtr& expr) -> std::shared_ptr { \ - return std::make_shared(h.get(), isa, expr->get_node()); \ - }, \ - [](const std::shared_ptr& n) -> std::set> { \ - return e_type::get_supported_precisions(n); \ - } \ -} +#define CREATE_CPU_EMITTER(e_type) \ + { \ + [this](const snippets::lowered::ExpressionPtr& expr) -> std::shared_ptr { \ + return std::make_shared(h.get(), isa, expr->get_node()); \ + }, \ + [](const std::shared_ptr& n) -> std::set> { \ + return e_type::get_supported_precisions(n); \ + } \ + } -#define CREATE_GELU_V7_EMITTER(e_type_erf, e_type_tanh) { \ - [this](const snippets::lowered::ExpressionPtr& expr) -> std::shared_ptr { \ - const auto& n = expr->get_node(); \ - const auto& gelu = std::dynamic_pointer_cast(n); \ - if (gelu == nullptr) { \ - OPENVINO_THROW("Can't cast to ov::op::v7::Gelu"); \ - } \ - const auto approximationMode = gelu->get_approximation_mode(); \ - if (approximationMode == ov::op::GeluApproximationMode::ERF) { \ - return std::make_shared(h.get(), isa, n); \ - } else if (approximationMode == ov::op::GeluApproximationMode::TANH) { \ - return std::make_shared(h.get(), isa, n); \ - } else { \ - OPENVINO_THROW("Unsupported Gelu approximation mode"); \ - } \ - }, \ - [](const std::shared_ptr& n) -> std::set> { \ - const auto& gelu = std::dynamic_pointer_cast(n); \ - if (gelu == nullptr) { \ - OPENVINO_THROW("Can't cast to ov::op::v7::Gelu"); \ - } \ - const auto approximationMode = gelu->get_approximation_mode(); \ - if (approximationMode == ov::op::GeluApproximationMode::ERF) { \ - return e_type_erf::get_supported_precisions(n); \ - } else if (approximationMode == ov::op::GeluApproximationMode::TANH) { \ - return e_type_tanh::get_supported_precisions(n); \ - } else { \ - OPENVINO_THROW("Unsupported Gelu approximation mode"); \ - } \ - } \ -} +#define CREATE_GELU_V7_EMITTER(e_type_erf, e_type_tanh) \ + { \ + [this](const snippets::lowered::ExpressionPtr& expr) -> std::shared_ptr { \ + const auto& n = expr->get_node(); \ + const auto& gelu = std::dynamic_pointer_cast(n); \ + if (gelu == nullptr) { \ + OPENVINO_THROW("Can't cast to ov::op::v7::Gelu"); \ + } \ + const auto approximationMode = gelu->get_approximation_mode(); \ + if (approximationMode == ov::op::GeluApproximationMode::ERF) { \ + return std::make_shared(h.get(), isa, n); \ + } else if (approximationMode == ov::op::GeluApproximationMode::TANH) { \ + return std::make_shared(h.get(), isa, n); \ + } else { \ + OPENVINO_THROW("Unsupported Gelu approximation mode"); \ + } \ + }, \ + [](const std::shared_ptr& n) -> std::set> { \ + const auto& gelu = std::dynamic_pointer_cast(n); \ + if (gelu == nullptr) { \ + OPENVINO_THROW("Can't cast to ov::op::v7::Gelu"); \ + } \ + const auto approximationMode = gelu->get_approximation_mode(); \ + if (approximationMode == ov::op::GeluApproximationMode::ERF) { \ + return e_type_erf::get_supported_precisions(n); \ + } else if (approximationMode == ov::op::GeluApproximationMode::TANH) { \ + return e_type_tanh::get_supported_precisions(n); \ + } else { \ + OPENVINO_THROW("Unsupported Gelu approximation mode"); \ + 
} \ + } \ + } class jit_snippet : public dnnl::impl::cpu::aarch64::jit_generator { public: @@ -85,7 +87,8 @@ class jit_snippet : public dnnl::impl::cpu::aarch64::jit_generator { namespace intel_cpu { namespace aarch64 { -CompiledSnippetCPU::CompiledSnippetCPU(std::unique_ptr h) : h_compiled(std::move(h)) { +CompiledSnippetCPU::CompiledSnippetCPU(std::unique_ptr h) + : h_compiled(std::move(h)) { OPENVINO_ASSERT(h_compiled && h_compiled->jit_ker(), "Got invalid jit generator or kernel was nopt compiled"); } @@ -102,15 +105,19 @@ bool CompiledSnippetCPU::empty() const { } CPUTargetMachine::CPUTargetMachine(dnnl::impl::cpu::aarch64::cpu_isa_t host_isa) - : TargetMachine(std::make_shared()), h(new jit_snippet()), isa(host_isa) { + : TargetMachine(std::make_shared()), + h(new jit_snippet()), + isa(host_isa) { // data movement jitters[op::v0::Parameter::get_type_info_static()] = CREATE_SNIPPETS_EMITTER(jit_nop_emitter); jitters[op::v0::Result::get_type_info_static()] = CREATE_SNIPPETS_EMITTER(jit_nop_emitter); jitters[snippets::op::VectorBuffer::get_type_info_static()] = CREATE_SNIPPETS_EMITTER(jit_nop_emitter); jitters[snippets::op::RankNormalization::get_type_info_static()] = CREATE_SNIPPETS_EMITTER(jit_nop_emitter); jitters[snippets::op::BroadcastMove::get_type_info_static()] = CREATE_SNIPPETS_EMITTER(jit_broadcast_move_emitter); - jitters[snippets::op::ConvertTruncation::get_type_info_static()] = CREATE_CPU_EMITTER(jit_convert_truncation_emitter); - jitters[snippets::op::ConvertSaturation::get_type_info_static()] = CREATE_CPU_EMITTER(jit_convert_saturation_emitter); + jitters[snippets::op::ConvertTruncation::get_type_info_static()] = + CREATE_CPU_EMITTER(jit_convert_truncation_emitter); + jitters[snippets::op::ConvertSaturation::get_type_info_static()] = + CREATE_CPU_EMITTER(jit_convert_saturation_emitter); // memory access jitters[snippets::op::Load::get_type_info_static()] = CREATE_SNIPPETS_EMITTER(jit_load_memory_emitter); @@ -136,7 +143,8 @@ CPUTargetMachine::CPUTargetMachine(dnnl::impl::cpu::aarch64::cpu_isa_t host_isa) jitters[ov::op::v0::Exp::get_type_info_static()] = CREATE_CPU_EMITTER(jit_exp_emitter); jitters[ov::op::v0::Floor::get_type_info_static()] = CREATE_CPU_EMITTER(jit_floor_emitter); jitters[ov::op::v0::Gelu::get_type_info_static()] = CREATE_CPU_EMITTER(jit_gelu_erf_emitter); - jitters[ov::op::v7::Gelu::get_type_info_static()] = CREATE_GELU_V7_EMITTER(jit_gelu_erf_emitter, jit_gelu_tanh_emitter); + jitters[ov::op::v7::Gelu::get_type_info_static()] = + CREATE_GELU_V7_EMITTER(jit_gelu_erf_emitter, jit_gelu_tanh_emitter); jitters[ov::op::v4::HSwish::get_type_info_static()] = CREATE_CPU_EMITTER(jit_hswish_emitter); jitters[ov::op::v4::Mish::get_type_info_static()] = CREATE_CPU_EMITTER(jit_mish_emitter); jitters[ov::op::v0::Relu::get_type_info_static()] = CREATE_CPU_EMITTER(jit_relu_emitter); @@ -168,7 +176,8 @@ bool CPUTargetMachine::is_supported() const { snippets::CompiledSnippetPtr CPUTargetMachine::get_snippet() { OPENVINO_ASSERT(h->create_kernel() == dnnl::impl::status::success, "Failed to create jit_kernel in get_snippet()"); - const auto& result = std::make_shared(std::unique_ptr(h.release())); + const auto& result = + std::make_shared(std::unique_ptr(h.release())); // Note that we reset all the generated code, since it was copied into CompiledSnippetCPU h.reset(new jit_snippet()); return result; @@ -176,8 +185,10 @@ snippets::CompiledSnippetPtr CPUTargetMachine::get_snippet() { size_t CPUTargetMachine::get_lanes() const { switch (isa) { - case 
dnnl::impl::cpu::aarch64::asimd : return dnnl::impl::cpu::aarch64::cpu_isa_traits::vlen / sizeof(float); - default : OPENVINO_THROW("unknown isa ", isa); + case dnnl::impl::cpu::aarch64::asimd: + return dnnl::impl::cpu::aarch64::cpu_isa_traits::vlen / sizeof(float); + default: + OPENVINO_THROW("unknown isa ", isa); } } @@ -190,18 +201,19 @@ dnnl::impl::cpu::aarch64::cpu_isa_t CPUTargetMachine::get_isa() const { return isa; } -CPUGenerator::CPUGenerator(dnnl::impl::cpu::aarch64::cpu_isa_t isa_) : Generator(std::make_shared(isa_)) {} +CPUGenerator::CPUGenerator(dnnl::impl::cpu::aarch64::cpu_isa_t isa_) + : Generator(std::make_shared(isa_)) {} std::shared_ptr CPUGenerator::clone() const { const auto& cpu_target_machine = std::dynamic_pointer_cast(target); - OPENVINO_ASSERT(cpu_target_machine, "Failed to clone CPUGenerator: the instance contains incompatible TargetMachine type"); + OPENVINO_ASSERT(cpu_target_machine, + "Failed to clone CPUGenerator: the instance contains incompatible TargetMachine type"); return std::make_shared(cpu_target_machine->get_isa()); } ov::snippets::RegType CPUGenerator::get_specific_op_out_reg_type(const ov::Output& out) const { const auto op = out.get_node_shared_ptr(); - if (std::dynamic_pointer_cast(op) || - std::dynamic_pointer_cast(op)) + if (std::dynamic_pointer_cast(op) || std::dynamic_pointer_cast(op)) return ov::snippets::RegType::vec; else return ov::snippets::RegType::undefined; @@ -211,6 +223,6 @@ bool CPUGenerator::uses_precompiled_kernel(const std::shared_ptr& out) const override; }; -} // namespace aarch64 -} // namespace intel_cpu -} // namespace ov +} // namespace aarch64 +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/emitters/snippets/aarch64/jit_fill_emitter.cpp b/src/plugins/intel_cpu/src/emitters/snippets/aarch64/jit_fill_emitter.cpp index 2b6056e92644a3..053cebe747e529 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/aarch64/jit_fill_emitter.cpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/aarch64/jit_fill_emitter.cpp @@ -3,6 +3,7 @@ // #include "jit_fill_emitter.hpp" + #include "cpu/aarch64/xbyak_aarch64/xbyak_aarch64/xbyak_aarch64_adr.h" #include "emitters/utils.hpp" @@ -21,7 +22,8 @@ jit_fill_emitter::jit_fill_emitter(jit_generator* h, cpu_isa_t isa, const Expres const auto fill = ov::as_type_ptr(expr->get_node()); OV_CPU_JIT_EMITTER_ASSERT(fill != nullptr, "Expects Fill expression"); OV_CPU_JIT_EMITTER_ASSERT(fill->get_element_type().size() == 4, - "Supports only 4 Byte element types but gets ", fill->get_element_type()); + "Supports only 4 Byte element types but gets ", + fill->get_element_type()); offset = fill->get_offset(); fill_value = fill->get_fill_value(); @@ -38,8 +40,7 @@ size_t jit_fill_emitter::get_aux_gprs_count() const { return 1; } -void jit_fill_emitter::emit_impl(const std::vector& in, - const std::vector& out) const { +void jit_fill_emitter::emit_impl(const std::vector& in, const std::vector& out) const { if (host_isa_ == dnnl::impl::cpu::aarch64::asimd) { emit_isa(in, out); } else { @@ -48,7 +49,7 @@ void jit_fill_emitter::emit_impl(const std::vector& in, } template -void jit_fill_emitter::emit_isa(const std::vector &in, const std::vector &out) const { +void jit_fill_emitter::emit_isa(const std::vector& in, const std::vector& out) const { if (is_full_reg()) fill_full(out); else @@ -56,7 +57,7 @@ void jit_fill_emitter::emit_isa(const std::vector &in, const std::vector } template -void jit_fill_emitter::fill_full(const std::vector &out) const { +void 
jit_fill_emitter::fill_full(const std::vector& out) const { using TReg = typename dnnl::impl::cpu::aarch64::cpu_isa_traits::TReg; TReg dst = TReg(out[0]); @@ -71,28 +72,28 @@ void jit_fill_emitter::fill_full(const std::vector &out) const { } template -void jit_fill_emitter::fill_tail(const std::vector &in, const std::vector &out) const { +void jit_fill_emitter::fill_tail(const std::vector& in, const std::vector& out) const { using TReg = typename dnnl::impl::cpu::aarch64::cpu_isa_traits::TReg; TReg dst = TReg(out[0]); switch (offset) { - case 1: - h->ld1(dst.s[1], table_val2("value", sizeof(float))); - h->ld1(dst.d[1], table_val2("value", 2 * sizeof(float))); - break; - case 2: - h->ld1(dst.d[1], table_val2("value", 2 * sizeof(float))); - break; - case 3: - h->ld1(dst.s[3], table_val2("value", 3 * sizeof(float))); - break; - case 4: - break; - default: - OV_CPU_JIT_EMITTER_THROW("Fill emitter has unexpected offset ", offset); + case 1: + h->ld1(dst.s[1], table_val2("value", sizeof(float))); + h->ld1(dst.d[1], table_val2("value", 2 * sizeof(float))); + break; + case 2: + h->ld1(dst.d[1], table_val2("value", 2 * sizeof(float))); + break; + case 3: + h->ld1(dst.s[3], table_val2("value", 3 * sizeof(float))); + break; + case 4: + break; + default: + OV_CPU_JIT_EMITTER_THROW("Fill emitter has unexpected offset ", offset); } } -} // namespace aarch64 -} // namespace intel_cpu -} // namespace ov +} // namespace aarch64 +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/emitters/snippets/aarch64/jit_fill_emitter.hpp b/src/plugins/intel_cpu/src/emitters/snippets/aarch64/jit_fill_emitter.hpp index 0ce0ac62d03979..7c827e4920d5eb 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/aarch64/jit_fill_emitter.hpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/aarch64/jit_fill_emitter.hpp @@ -16,29 +16,34 @@ class jit_fill_emitter : public jit_emitter { dnnl::impl::cpu::aarch64::cpu_isa_t isa, const ov::snippets::lowered::ExpressionPtr& expr); - size_t get_inputs_count() const override {return 1;} + size_t get_inputs_count() const override { + return 1; + } protected: size_t get_aux_gprs_count() const override; private: - void emit_impl(const std::vector& in, - const std::vector& out) const override; + void emit_impl(const std::vector& in, const std::vector& out) const override; template - void emit_isa(const std::vector &in, const std::vector &out) const; + void emit_isa(const std::vector& in, const std::vector& out) const; template - void fill_full(const std::vector &out) const; + void fill_full(const std::vector& out) const; template - void fill_tail(const std::vector &in, const std::vector &out) const; + void fill_tail(const std::vector& in, const std::vector& out) const; - bool is_full_reg() const { return offset == 0; } - bool is_optimized() const { return is_full_reg() && fill_value == uint32_t(0x0); } + bool is_full_reg() const { + return offset == 0; + } + bool is_optimized() const { + return is_full_reg() && fill_value == uint32_t(0x0); + } size_t offset = 0; uint32_t fill_value = 0x0; }; -} // namespace aarch64 -} // namespace intel_cpu -} // namespace ov +} // namespace aarch64 +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/emitters/snippets/aarch64/jit_kernel_emitter.cpp b/src/plugins/intel_cpu/src/emitters/snippets/aarch64/jit_kernel_emitter.cpp index 8f7a54dc9ebdb3..32ed1a844b6724 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/aarch64/jit_kernel_emitter.cpp +++ 
b/src/plugins/intel_cpu/src/emitters/snippets/aarch64/jit_kernel_emitter.cpp @@ -19,18 +19,25 @@ using ExpressionPtr = ov::snippets::lowered::ExpressionPtr; inline static std::vector transform_idxs_to_regs(const std::vector& idxs) { std::vector regs(idxs.size(), XReg(0)); - std::transform(idxs.begin(), idxs.end(), regs.begin(), [](size_t idx){return XReg(idx);}); + std::transform(idxs.begin(), idxs.end(), regs.begin(), [](size_t idx) { + return XReg(idx); + }); return regs; } inline static std::vector transform_snippets_regs_to_idxs(const std::vector& regs) { std::vector idxs(regs.size()); - std::transform(regs.cbegin(), regs.cend(), idxs.begin(), [](const snippets::Reg& reg) { return reg.idx; }); + std::transform(regs.cbegin(), regs.cend(), idxs.begin(), [](const snippets::Reg& reg) { + return reg.idx; + }); return idxs; } -jit_kernel_emitter::jit_kernel_emitter(jit_generator* h, cpu_isa_t isa, const ov::snippets::lowered::ExpressionPtr& expr) - : jit_emitter(h, isa), reg_runtime_params_idx(Operand::X0) { +jit_kernel_emitter::jit_kernel_emitter(jit_generator* h, + cpu_isa_t isa, + const ov::snippets::lowered::ExpressionPtr& expr) + : jit_emitter(h, isa), + reg_runtime_params_idx(Operand::X0) { const auto kernel = ov::as_type_ptr(expr->get_node()); OV_CPU_JIT_EMITTER_ASSERT(kernel != nullptr, "Invoked with invalid op argument"); OV_CPU_JIT_EMITTER_ASSERT(!kernel->region->empty(), "Invoked with empty body"); @@ -113,35 +120,50 @@ void jit_kernel_emitter::init_reg_pools(const std::set& gpr_blacklist, c gp_regs_pool[i] = vec_regs_pool[i] = 31 - i; auto remove_regs_from_pool = [](std::vector& pool, const std::set& to_remove) { // It's important to keep the order of other elements - pool.erase(std::remove_if(pool.begin(), pool.end(), - [&](size_t x) {return to_remove.count(x) != 0;}), pool.end()); + pool.erase(std::remove_if(pool.begin(), + pool.end(), + [&](size_t x) { + return to_remove.count(x) != 0; + }), + pool.end()); }; - std::set gprs_blacklist_extended{Operand::X18, Operand::X23, Operand::X24, Operand::X28, Operand::X29, Operand::SP}; + std::set gprs_blacklist_extended{Operand::X18, + Operand::X23, + Operand::X24, + Operand::X28, + Operand::X29, + Operand::SP}; gprs_blacklist_extended.insert(gpr_blacklist.begin(), gpr_blacklist.end()); // Reserve reg_indexes_idx and reg_runtime_params_idx, since they'll be used to pass runtime call args to kernel remove_regs_from_pool(gp_regs_pool, gprs_blacklist_extended); remove_regs_from_pool(vec_regs_pool, vec_blacklist); } -void jit_kernel_emitter::emit_code(const std::vector &in, const std::vector &out, - const std::vector &pool_vec_idxs, const std::vector &pool_gpr_idxs) const { +void jit_kernel_emitter::emit_code(const std::vector& in, + const std::vector& out, + const std::vector& pool_vec_idxs, + const std::vector& pool_gpr_idxs) const { validate_arguments(in, out); emit_impl(in, out); } -void jit_kernel_emitter::validate_arguments(const std::vector &in, const std::vector &out) const { +void jit_kernel_emitter::validate_arguments(const std::vector& in, const std::vector& out) const { OV_CPU_JIT_EMITTER_ASSERT(in.empty() && out.empty(), ": Expects 0 registers on input and output"); const auto num_params = num_inputs + num_outputs + num_unique_buffers; // The number of used gpr may be >= num_params since LoopBegin+LoopEnd could also use gpr to store work_amount OV_CPU_JIT_EMITTER_ASSERT(data_ptr_regs_idx.size() == num_params, - "Number of inputs and outputs is inconsistent with the number of allocated registers ", num_params, - " 
data_ptr_regs_idx.size() = ", data_ptr_regs_idx.size()); + "Number of inputs and outputs is inconsistent with the number of allocated registers ", + num_params, + " data_ptr_regs_idx.size() = ", + data_ptr_regs_idx.size()); } void jit_kernel_emitter::init_body_regs(const std::set& kernel_regs, - const std::vector &pool_vec_idxs, const std::vector &pool_gpr_idxs) { + const std::vector& pool_vec_idxs, + const std::vector& pool_gpr_idxs) { // Initialize pools of gp and vec registers - // Reserve kernel regs (reg_indexes_idx and, if there is, reg_runtime_params_idx), since they'll be used to pass runtime call args to kernel + // Reserve kernel regs (reg_indexes_idx and, if there is, reg_runtime_params_idx), since they'll be used to pass + // runtime call args to kernel init_reg_pools(kernel_regs, {}); mapping_info gpr_map_pool({}, gp_regs_pool); @@ -175,9 +197,11 @@ void jit_kernel_emitter::emit_impl(const std::vector& in, const std::vec h->postamble(); } -jit_kernel_static_emitter::jit_kernel_static_emitter(dnnl::impl::cpu::aarch64::jit_generator* h, dnnl::impl::cpu::aarch64::cpu_isa_t isa, +jit_kernel_static_emitter::jit_kernel_static_emitter(dnnl::impl::cpu::aarch64::jit_generator* h, + dnnl::impl::cpu::aarch64::cpu_isa_t isa, const ov::snippets::lowered::ExpressionPtr& expr) - : jit_kernel_emitter(h, isa, expr), reg_indexes_idx(Operand::X1) { + : jit_kernel_emitter(h, isa, expr), + reg_indexes_idx(Operand::X1) { const auto kernel = ov::as_type_ptr(expr->get_node()); OV_CPU_JIT_EMITTER_ASSERT(kernel != nullptr, "Expectes KernelStatic expression"); jcp = *reinterpret_cast(kernel->compile_params); @@ -219,24 +243,29 @@ void jit_kernel_static_emitter::init_data_pointers(const std::vector& data // NOTE: Snippets Buffer Scratchpad has the common data pointer for all Buffers (even with different ID). 
// The accessing memory is covered by correct offsets in each Buffer and the corresponding MemoryAccess ops for (size_t i = 0; i < num_unique_buffers; i++) { - h->ldr(data_ptr_regs[num_params + i], ptr(reg_runtime_params, static_cast(GET_OFF(buffer_scratchpad_ptr)))); + h->ldr(data_ptr_regs[num_params + i], + ptr(reg_runtime_params, static_cast(GET_OFF(buffer_scratchpad_ptr)))); } for (size_t i = 0; i < num_params; i++) { if (i < num_inputs) - h->ldr(data_ptr_regs[i], ptr(reg_runtime_params, static_cast(GET_OFF(src_ptrs) + i * sizeof(void*)))); + h->ldr(data_ptr_regs[i], + ptr(reg_runtime_params, static_cast(GET_OFF(src_ptrs) + i * sizeof(void*)))); else - h->ldr(data_ptr_regs[i], ptr(reg_runtime_params, static_cast(GET_OFF(dst_ptrs) + (i - num_inputs) * sizeof(void*)))); + h->ldr(data_ptr_regs[i], + ptr(reg_runtime_params, static_cast(GET_OFF(dst_ptrs) + (i - num_inputs) * sizeof(void*)))); init_ptr_with_offset(data_ptr_regs[i], data_offsets[i]); } } -jit_kernel_dynamic_emitter::jit_kernel_dynamic_emitter(dnnl::impl::cpu::aarch64::jit_generator* h, dnnl::impl::cpu::aarch64::cpu_isa_t isa, +jit_kernel_dynamic_emitter::jit_kernel_dynamic_emitter(dnnl::impl::cpu::aarch64::jit_generator* h, + dnnl::impl::cpu::aarch64::cpu_isa_t isa, const ov::snippets::lowered::ExpressionPtr& expr) : jit_kernel_emitter(h, isa, expr) { const auto kernel = ov::as_type_ptr(expr->get_node()); OV_CPU_JIT_EMITTER_ASSERT(kernel, "Expectes KernelDynamic expression"); - // - Reserve reg_runtime_params_idx, since it wll be used to pass runtime call args to all dynamic emitters that needs runtime args + // - Reserve reg_runtime_params_idx, since it wll be used to pass runtime call args to all dynamic emitters that + // needs runtime args // - We cannot assign this register to the body emitters since runtime params MUST be valid during whole execution // for all dynamic emitters init_body_regs({reg_runtime_params_idx}); @@ -247,16 +276,19 @@ void jit_kernel_dynamic_emitter::init_data_pointers(const std::vector& dat const auto num_params = num_inputs + num_outputs; for (size_t i = 0; i < num_unique_buffers; ++i) { - h->ldr(data_ptr_regs[num_params + i], ptr(reg_runtime_params, static_cast(GET_OFF(buffer_scratchpad_ptr)))); + h->ldr(data_ptr_regs[num_params + i], + ptr(reg_runtime_params, static_cast(GET_OFF(buffer_scratchpad_ptr)))); } for (size_t i = 0; i < num_params; i++) { if (i < num_inputs) - h->ldr(data_ptr_regs[i], ptr(reg_runtime_params, static_cast(GET_OFF(src_ptrs) + i * sizeof(void*)))); + h->ldr(data_ptr_regs[i], + ptr(reg_runtime_params, static_cast(GET_OFF(src_ptrs) + i * sizeof(void*)))); else - h->ldr(data_ptr_regs[i], ptr(reg_runtime_params, static_cast(GET_OFF(dst_ptrs) + (i - num_inputs) * sizeof(void*)))); + h->ldr(data_ptr_regs[i], + ptr(reg_runtime_params, static_cast(GET_OFF(dst_ptrs) + (i - num_inputs) * sizeof(void*)))); } } -} // namespace aarch64 -} // namespace intel_cpu -} // namespace ov +} // namespace aarch64 +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/emitters/snippets/aarch64/jit_kernel_emitter.hpp b/src/plugins/intel_cpu/src/emitters/snippets/aarch64/jit_kernel_emitter.hpp index 63bac54e5c1f26..0ede91f100f110 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/aarch64/jit_kernel_emitter.hpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/aarch64/jit_kernel_emitter.hpp @@ -5,8 +5,8 @@ #pragma once #include "emitters/plugin/aarch64/jit_emitter.hpp" -#include "emitters/snippets/jit_snippets_call_args.hpp" #include 
"emitters/snippets/jit_container_emitter.hpp" +#include "emitters/snippets/jit_snippets_call_args.hpp" namespace ov { namespace intel_cpu { @@ -15,8 +15,9 @@ namespace aarch64 { /// /// \brief Kernel is the only entry point to Codogen Jit compilation. Kernel perform abstract-to-physical register /// mapping and creates a pools of available gpr and vec registers. Kernel usually contains (at least one) -/// jit_loop_begin_emitter and jit_loop_end_emitter pair. In general the enclosed emitters should be organized in the following way: -/// jit_kernel_emitter { /* entry point, maps registers, creates pools of available registers */ +/// jit_loop_begin_emitter and jit_loop_end_emitter pair. In general the enclosed emitters should be organized in the +/// following way: jit_kernel_emitter { /* entry point, maps registers, creates pools of available +/// registers */ /// 1.S jit_loop_begin_emitter /* Scalar Loop over the outer dimension [START] */ /// 2.S jit_loop_begin_emitter /* inner vector loop [START] */ /// ... /* All the necessary Load/Strore/elementwise emitters */ @@ -31,21 +32,29 @@ namespace aarch64 { class jit_kernel_emitter : public jit_emitter, public jit_container_emitter { public: - jit_kernel_emitter(dnnl::impl::cpu::aarch64::jit_generator* h, dnnl::impl::cpu::aarch64::cpu_isa_t isa, const ov::snippets::lowered::ExpressionPtr& expr); - - size_t get_inputs_count() const override {return 0;} - void emit_code(const std::vector &in_idxs, const std::vector &out_idxs, - const std::vector &pool_vec_idxs = {}, const std::vector &pool_gpr_idxs = {}) const override; + jit_kernel_emitter(dnnl::impl::cpu::aarch64::jit_generator* h, + dnnl::impl::cpu::aarch64::cpu_isa_t isa, + const ov::snippets::lowered::ExpressionPtr& expr); + + size_t get_inputs_count() const override { + return 0; + } + void emit_code(const std::vector& in_idxs, + const std::vector& out_idxs, + const std::vector& pool_vec_idxs = {}, + const std::vector& pool_gpr_idxs = {}) const override; protected: void validate_arguments(const std::vector& in, const std::vector& out) const override; - void init_body_regs(const std::set& kernel_regs, const std::vector &pool_vec_idxs = {}, const std::vector &pool_gpr_idxs = {}); + void init_body_regs(const std::set& kernel_regs, + const std::vector& pool_vec_idxs = {}, + const std::vector& pool_gpr_idxs = {}); /** - * @brief populates physical registers pools for x86 (both vec and gp). + * @brief populates physical registers pools for x86 (both vec and gp). * Skips stack-related gprs and extra gprs passed as arguments. 
* @arg gpr_blacklist - set of gp registers that should not be added to register pool * @arg vec_blacklist - set of vec registers should not be added to register pool - */ + */ void init_reg_pools(const std::set& gpr_blacklist, const std::set& vec_blacklist); virtual void init_data_pointers(const std::vector& data_ptr_regs) const = 0; @@ -69,14 +78,15 @@ class jit_kernel_emitter : public jit_emitter, public jit_container_emitter { std::shared_ptr body; #ifdef SNIPPETS_DEBUG_CAPS - friend std::string init_info_jit_kernel_emitter(const jit_kernel_emitter *emitter); + friend std::string init_info_jit_kernel_emitter(const jit_kernel_emitter* emitter); #endif }; class jit_kernel_static_emitter : public jit_kernel_emitter { public: jit_kernel_static_emitter(dnnl::impl::cpu::aarch64::jit_generator* h, - dnnl::impl::cpu::aarch64::cpu_isa_t isa, const ov::snippets::lowered::ExpressionPtr& expr); + dnnl::impl::cpu::aarch64::cpu_isa_t isa, + const ov::snippets::lowered::ExpressionPtr& expr); private: void init_data_pointers(const std::vector& data_ptr_regs) const override; @@ -86,23 +96,24 @@ class jit_kernel_static_emitter : public jit_kernel_emitter { std::vector> data_offsets; #ifdef SNIPPETS_DEBUG_CAPS - friend std::string init_info_jit_kernel_static_emitter(const jit_kernel_static_emitter *emitter); + friend std::string init_info_jit_kernel_static_emitter(const jit_kernel_static_emitter* emitter); #endif }; class jit_kernel_dynamic_emitter : public jit_kernel_emitter { public: jit_kernel_dynamic_emitter(dnnl::impl::cpu::aarch64::jit_generator* h, - dnnl::impl::cpu::aarch64::cpu_isa_t isa, const ov::snippets::lowered::ExpressionPtr& expr); + dnnl::impl::cpu::aarch64::cpu_isa_t isa, + const ov::snippets::lowered::ExpressionPtr& expr); private: void init_data_pointers(const std::vector& data_ptr_regs) const override; #ifdef SNIPPETS_DEBUG_CAPS - friend std::string init_info_jit_kernel_dynamic_emitter(const jit_kernel_dynamic_emitter *emitter); + friend std::string init_info_jit_kernel_dynamic_emitter(const jit_kernel_dynamic_emitter* emitter); #endif }; -} // namespace aarch64 -} // namespace intel_cpu -} // namespace ov +} // namespace aarch64 +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/emitters/snippets/aarch64/jit_loop_emitters.cpp b/src/plugins/intel_cpu/src/emitters/snippets/aarch64/jit_loop_emitters.cpp index 2b5b41fb912606..0666505a6d31ab 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/aarch64/jit_loop_emitters.cpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/aarch64/jit_loop_emitters.cpp @@ -3,8 +3,9 @@ // #include "jit_loop_emitters.hpp" -#include "jit_kernel_emitter.hpp" + #include "emitters/utils.hpp" +#include "jit_kernel_emitter.hpp" using namespace Xbyak_aarch64; @@ -18,9 +19,11 @@ using ExpressionPtr = ov::snippets::lowered::ExpressionPtr; /* ================== jit_loop_begin_emitter ====================== */ -jit_loop_begin_emitter::jit_loop_begin_emitter(dnnl::impl::cpu::aarch64::jit_generator* h, dnnl::impl::cpu::aarch64::cpu_isa_t isa, +jit_loop_begin_emitter::jit_loop_begin_emitter(dnnl::impl::cpu::aarch64::jit_generator* h, + dnnl::impl::cpu::aarch64::cpu_isa_t isa, const ov::snippets::lowered::ExpressionPtr& expr) - : jit_emitter(h, isa), loop_begin_label{new Xbyak_aarch64::Label()} { + : jit_emitter(h, isa), + loop_begin_label{new Xbyak_aarch64::Label()} { const auto loop_begin = ov::as_type_ptr(expr->get_node()); OV_CPU_JIT_EMITTER_ASSERT(loop_begin, "expects LoopBegin expression"); const auto loop_end = 
loop_begin->get_loop_end(); @@ -31,15 +34,17 @@ jit_loop_begin_emitter::jit_loop_begin_emitter(dnnl::impl::cpu::aarch64::jit_gen in_out_type_ = emitter_in_out_map::gpr_to_gpr; } -void jit_loop_begin_emitter::validate_arguments(const std::vector &in, const std::vector &out) const { +void jit_loop_begin_emitter::validate_arguments(const std::vector& in, const std::vector& out) const { OV_CPU_JIT_EMITTER_ASSERT(in.empty(), "Invalid inputs size: expected 0 got " + std::to_string(in.size())); // Note: the only expected output is work amount register (communicated to jit_loop_end_emitter) OV_CPU_JIT_EMITTER_ASSERT(out.size() == 1, "Invalid outputs size: expected 1 got " + std::to_string(out.size())); OV_CPU_JIT_EMITTER_ASSERT(loop_begin_label != nullptr, "has not inited label!"); } -void jit_loop_begin_emitter::emit_code(const std::vector &in, const std::vector &out, - const std::vector &pool_vec_idxs, const std::vector &pool_gpr_idxs) const { +void jit_loop_begin_emitter::emit_code(const std::vector& in, + const std::vector& out, + const std::vector& pool_vec_idxs, + const std::vector& pool_gpr_idxs) const { validate_arguments(in, out); emit_impl(in, out); } @@ -56,9 +61,11 @@ void jit_loop_begin_emitter::emit_impl(const std::vector& in, const std: /* ================== jit_loop_end_emitter ====================== */ -jit_loop_end_emitter::jit_loop_end_emitter(dnnl::impl::cpu::aarch64::jit_generator* h, dnnl::impl::cpu::aarch64::cpu_isa_t isa, +jit_loop_end_emitter::jit_loop_end_emitter(dnnl::impl::cpu::aarch64::jit_generator* h, + dnnl::impl::cpu::aarch64::cpu_isa_t isa, const ov::snippets::lowered::ExpressionPtr& expr) - : jit_emitter(h, isa), loop_begin_label{nullptr} { + : jit_emitter(h, isa), + loop_begin_label{nullptr} { in_out_type_ = emitter_in_out_map::gpr_to_gpr; const auto loop_end = ov::as_type_ptr(expr->get_node()); OV_CPU_JIT_EMITTER_ASSERT(loop_end != nullptr, "expected LoopEnd expr"); @@ -79,27 +86,49 @@ jit_loop_end_emitter::jit_loop_end_emitter(dnnl::impl::cpu::aarch64::jit_generat loop_begin_label = loop_begin_emitter->get_begin_label(); } -ov::snippets::lowered::ExpressionPtr jit_loop_end_emitter::get_loop_begin_expr(const ov::snippets::lowered::ExpressionPtr& expr) { +ov::snippets::lowered::ExpressionPtr jit_loop_end_emitter::get_loop_begin_expr( + const ov::snippets::lowered::ExpressionPtr& expr) { const auto begin_expr = expr->get_input_port_connectors().back()->get_source().get_expr(); OV_CPU_JIT_EMITTER_ASSERT(ov::is_type(begin_expr->get_node()), "LoopEnd expression must have th last port connector to LoopBegin"); return begin_expr; } -void jit_loop_end_emitter::validate_arguments(const std::vector &in, const std::vector &out) const { -const auto io_size = num_inputs + num_outputs; +void jit_loop_end_emitter::validate_arguments(const std::vector& in, const std::vector& out) const { + const auto io_size = num_inputs + num_outputs; OV_CPU_JIT_EMITTER_ASSERT(out.size() == 0, "Invalid number of out arguments: expected ", 0, " got ", out.size()); - OV_CPU_JIT_EMITTER_ASSERT(in.size() == io_size + 1, "Invalid number of in arguments: expected ", io_size + 1, " got ", in.size()); - OV_CPU_JIT_EMITTER_ASSERT(is_incremented.size() == io_size, "Invalid is_incremented size: expected ", io_size, " got ", is_incremented.size()); - OV_CPU_JIT_EMITTER_ASSERT(ptr_increments.size() == io_size, "Invalid ptr_increments size: expected ", io_size, " got ", ptr_increments.size()); + OV_CPU_JIT_EMITTER_ASSERT(in.size() == io_size + 1, + "Invalid number of in arguments: expected ", + io_size + 
1, + " got ", + in.size()); + OV_CPU_JIT_EMITTER_ASSERT(is_incremented.size() == io_size, + "Invalid is_incremented size: expected ", + io_size, + " got ", + is_incremented.size()); + OV_CPU_JIT_EMITTER_ASSERT(ptr_increments.size() == io_size, + "Invalid ptr_increments size: expected ", + io_size, + " got ", + ptr_increments.size()); OV_CPU_JIT_EMITTER_ASSERT(finalization_offsets.size() == io_size, - "Invalid finalization_offsets size: expected: ", io_size, " got ", finalization_offsets.size()); - OV_CPU_JIT_EMITTER_ASSERT(data_sizes.size() == io_size, "Invalid data_sizes size: expected: ", io_size, " got ", data_sizes.size()); + "Invalid finalization_offsets size: expected: ", + io_size, + " got ", + finalization_offsets.size()); + OV_CPU_JIT_EMITTER_ASSERT(data_sizes.size() == io_size, + "Invalid data_sizes size: expected: ", + io_size, + " got ", + data_sizes.size()); OV_CPU_JIT_EMITTER_ASSERT(loop_begin_label != nullptr, "has not inited begin label!"); } -void jit_loop_end_emitter::emit_code(const std::vector &in, const std::vector &out, - const std::vector &pool_vec_idxs, const std::vector &pool_gpr_idxs) const { +void jit_loop_end_emitter::emit_code(const std::vector& in, + const std::vector& out, + const std::vector& pool_vec_idxs, + const std::vector& pool_gpr_idxs) const { validate_arguments(in, out); emit_impl(in, out); } @@ -118,7 +147,7 @@ void jit_loop_end_emitter::emit_impl(const std::vector& in, const std::v if (ptr_increments[idx] > 0) { h->add_imm(data_reg, data_reg, ptr_increments[idx] * wa_increment * data_sizes[idx], h->X_TMP_0); } else if (ptr_increments[idx] < 0) { - h->sub_imm(data_reg, data_reg, - ptr_increments[idx] * wa_increment * data_sizes[idx], h->X_TMP_0); + h->sub_imm(data_reg, data_reg, -ptr_increments[idx] * wa_increment * data_sizes[idx], h->X_TMP_0); } } h->sub_imm(reg_work_amount, reg_work_amount, wa_increment, h->X_TMP_0); @@ -133,13 +162,13 @@ void jit_loop_end_emitter::emit_impl(const std::vector& in, const std::v if (finalization_offsets[idx] > 0) { h->add_imm(data_reg, data_reg, finalization_offsets[idx] * data_sizes[idx], h->X_TMP_0); } else if (finalization_offsets[idx] < 0) { - h->sub_imm(data_reg, data_reg, - finalization_offsets[idx] * data_sizes[idx], h->X_TMP_0); + h->sub_imm(data_reg, data_reg, -finalization_offsets[idx] * data_sizes[idx], h->X_TMP_0); } } } /* ============================================================== */ -} // namespace aarch64 -} // namespace intel_cpu -} // namespace ov +} // namespace aarch64 +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/emitters/snippets/aarch64/jit_loop_emitters.hpp b/src/plugins/intel_cpu/src/emitters/snippets/aarch64/jit_loop_emitters.hpp index 6ec87835821df2..c89928353646cd 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/aarch64/jit_loop_emitters.hpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/aarch64/jit_loop_emitters.hpp @@ -5,7 +5,6 @@ #pragma once #include "emitters/plugin/aarch64/jit_emitter.hpp" - #include "snippets/op/loop.hpp" namespace ov { @@ -14,20 +13,27 @@ namespace aarch64 { /* ================== jit_loop_begin_emitter ====================== */ -class jit_loop_begin_emitter: public jit_emitter { +class jit_loop_begin_emitter : public jit_emitter { public: - jit_loop_begin_emitter(dnnl::impl::cpu::aarch64::jit_generator* h, dnnl::impl::cpu::aarch64::cpu_isa_t isa, + jit_loop_begin_emitter(dnnl::impl::cpu::aarch64::jit_generator* h, + dnnl::impl::cpu::aarch64::cpu_isa_t isa, const ov::snippets::lowered::ExpressionPtr& expr); - 
size_t get_inputs_count() const override { return 0; } + size_t get_inputs_count() const override { + return 0; + } - void emit_code(const std::vector &in_idxs, const std::vector &out_idxs, - const std::vector &pool_vec_idxs = {}, const std::vector &pool_gpr_idxs = {}) const override; + void emit_code(const std::vector& in_idxs, + const std::vector& out_idxs, + const std::vector& pool_vec_idxs = {}, + const std::vector& pool_gpr_idxs = {}) const override; - std::shared_ptr get_begin_label() { return loop_begin_label; } + std::shared_ptr get_begin_label() { + return loop_begin_label; + } protected: - void validate_arguments(const std::vector &in, const std::vector &out) const override; + void validate_arguments(const std::vector& in, const std::vector& out) const override; void emit_impl(const std::vector& in, const std::vector& out) const override; std::shared_ptr loop_begin_label; @@ -40,18 +46,23 @@ class jit_loop_begin_emitter: public jit_emitter { /* ================== jit_loop_end_emitter ====================== */ -class jit_loop_end_emitter: public jit_emitter { +class jit_loop_end_emitter : public jit_emitter { public: - jit_loop_end_emitter(dnnl::impl::cpu::aarch64::jit_generator* h, dnnl::impl::cpu::aarch64::cpu_isa_t isa, - const ov::snippets::lowered::ExpressionPtr& expr); + jit_loop_end_emitter(dnnl::impl::cpu::aarch64::jit_generator* h, + dnnl::impl::cpu::aarch64::cpu_isa_t isa, + const ov::snippets::lowered::ExpressionPtr& expr); - size_t get_inputs_count() const override { return 0; } + size_t get_inputs_count() const override { + return 0; + } - void emit_code(const std::vector &in_idxs, const std::vector &out_idxs, - const std::vector &pool_vec_idxs = {}, const std::vector &pool_gpr_idxs = {}) const override; + void emit_code(const std::vector& in_idxs, + const std::vector& out_idxs, + const std::vector& pool_vec_idxs = {}, + const std::vector& pool_gpr_idxs = {}) const override; protected: - void validate_arguments(const std::vector &in, const std::vector &out) const override; + void validate_arguments(const std::vector& in, const std::vector& out) const override; void emit_impl(const std::vector& in, const std::vector& out) const override; static ov::snippets::lowered::ExpressionPtr get_loop_begin_expr(const ov::snippets::lowered::ExpressionPtr& expr); @@ -70,6 +81,6 @@ class jit_loop_end_emitter: public jit_emitter { /* ============================================================== */ -} // namespace aarch64 -} // namespace intel_cpu -} // namespace ov +} // namespace aarch64 +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/emitters/snippets/aarch64/jit_memory_emitters.cpp b/src/plugins/intel_cpu/src/emitters/snippets/aarch64/jit_memory_emitters.cpp index d19843f395d2f3..9989f3431fb2a8 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/aarch64/jit_memory_emitters.cpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/aarch64/jit_memory_emitters.cpp @@ -3,6 +3,7 @@ // #include "jit_memory_emitters.hpp" + #include "emitters/utils.hpp" using namespace Xbyak_aarch64; @@ -15,15 +16,18 @@ using jit_generator = dnnl::impl::cpu::aarch64::jit_generator; using cpu_isa_t = dnnl::impl::cpu::aarch64::cpu_isa_t; using ExpressionPtr = ov::snippets::lowered::ExpressionPtr; -jit_memory_emitter::jit_memory_emitter(jit_generator* h, cpu_isa_t isa, const ExpressionPtr& expr) : jit_emitter(h, isa) { +jit_memory_emitter::jit_memory_emitter(jit_generator* h, cpu_isa_t isa, const ExpressionPtr& expr) + : jit_emitter(h, isa) { const auto n = 
expr->get_node(); src_prc = n->get_input_element_type(0); dst_prc = n->get_output_element_type(0); } -jit_load_memory_emitter::jit_load_memory_emitter(jit_generator* h, cpu_isa_t isa, const ExpressionPtr& expr) : jit_memory_emitter(h, isa, expr) { - bool is_supported_precision = one_of(src_prc, ov::element::f32, ov::element::i32, ov::element::f16, ov::element::i8, ov::element::u8) && - src_prc == dst_prc; +jit_load_memory_emitter::jit_load_memory_emitter(jit_generator* h, cpu_isa_t isa, const ExpressionPtr& expr) + : jit_memory_emitter(h, isa, expr) { + bool is_supported_precision = + one_of(src_prc, ov::element::f32, ov::element::i32, ov::element::f16, ov::element::i8, ov::element::u8) && + src_prc == dst_prc; OV_CPU_JIT_EMITTER_ASSERT(is_supported_precision, "Unsupported precision pair."); const auto load = std::dynamic_pointer_cast(expr->get_node()); @@ -34,8 +38,7 @@ jit_load_memory_emitter::jit_load_memory_emitter(jit_generator* h, cpu_isa_t isa load_emitter.reset(new jit_load_emitter(h, isa, src_prc, dst_prc, count, byte_offset)); } -void jit_load_memory_emitter::emit_impl(const std::vector& in, - const std::vector& out) const { +void jit_load_memory_emitter::emit_impl(const std::vector& in, const std::vector& out) const { if (host_isa_ == dnnl::impl::cpu::aarch64::asimd) { emit_isa(in, out); } else { @@ -44,7 +47,7 @@ void jit_load_memory_emitter::emit_impl(const std::vector& in, } template -void jit_load_memory_emitter::emit_isa(const std::vector &in, const std::vector &out) const { +void jit_load_memory_emitter::emit_isa(const std::vector& in, const std::vector& out) const { OV_CPU_JIT_EMITTER_ASSERT(load_emitter != nullptr, "Load CPU emitter isn't initialized!"); load_emitter->emit_code(in, out, aux_vec_idxs, aux_gpr_idxs); @@ -56,7 +59,8 @@ void jit_load_memory_emitter::emit_data() const { jit_load_broadcast_emitter::jit_load_broadcast_emitter(jit_generator* h, cpu_isa_t isa, const ExpressionPtr& expr) : jit_memory_emitter(h, isa, expr) { - OV_CPU_JIT_EMITTER_ASSERT(src_prc == dst_prc, "Only support equal input and output types but gets ", + OV_CPU_JIT_EMITTER_ASSERT(src_prc == dst_prc, + "Only support equal input and output types but gets ", src_prc.get_type_name(), " and ", dst_prc.get_type_name()); @@ -68,8 +72,7 @@ jit_load_broadcast_emitter::jit_load_broadcast_emitter(jit_generator* h, cpu_isa in_out_type_ = emitter_in_out_map::gpr_to_vec; } -void jit_load_broadcast_emitter::emit_impl(const std::vector& in, - const std::vector& out) const { +void jit_load_broadcast_emitter::emit_impl(const std::vector& in, const std::vector& out) const { if (host_isa_ == dnnl::impl::cpu::aarch64::asimd) { emit_isa(in, out); } else { @@ -78,7 +81,7 @@ void jit_load_broadcast_emitter::emit_impl(const std::vector& in, } template -void jit_load_broadcast_emitter::emit_isa(const std::vector &in, const std::vector &out) const { +void jit_load_broadcast_emitter::emit_isa(const std::vector& in, const std::vector& out) const { using TReg = typename dnnl::impl::cpu::aarch64::cpu_isa_traits::TReg; XReg src = XReg(in[0]); TReg dst = TReg(out[0]); @@ -86,9 +89,11 @@ void jit_load_broadcast_emitter::emit_isa(const std::vector &in, const s h->uni_ld1rw(dst.s, src, byte_offset); } -jit_store_memory_emitter::jit_store_memory_emitter(jit_generator* h, cpu_isa_t isa, const ExpressionPtr& expr) : jit_memory_emitter(h, isa, expr) { - bool is_supported_precision = one_of(dst_prc, ov::element::f32, ov::element::i32, ov::element::f16, ov::element::i8, ov::element::u8) && - src_prc == dst_prc; 
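The broadcast-load emitter shown here reduces to a single-element load replicated across every vector lane. As a hedged plain-C++ illustration of the f32 case only (the pointer types and explicit lane count are assumptions made for this example, not part of the emitter API):

    #include <cstddef>
    #include <cstring>

    // What h->uni_ld1rw(dst.s, src, byte_offset) computes, spelled out scalar-wise.
    void broadcast_load_f32(const char* src, std::size_t byte_offset, float* dst, std::size_t lanes) {
        float value;
        std::memcpy(&value, src + byte_offset, sizeof(float));  // load one f32 element
        for (std::size_t lane = 0; lane < lanes; ++lane)
            dst[lane] = value;                                   // replicate it into every lane
    }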
+jit_store_memory_emitter::jit_store_memory_emitter(jit_generator* h, cpu_isa_t isa, const ExpressionPtr& expr) + : jit_memory_emitter(h, isa, expr) { + bool is_supported_precision = + one_of(dst_prc, ov::element::f32, ov::element::i32, ov::element::f16, ov::element::i8, ov::element::u8) && + src_prc == dst_prc; OV_CPU_JIT_EMITTER_ASSERT(is_supported_precision, "Unsupported precision pair."); const auto store = ov::as_type_ptr(expr->get_node()); @@ -99,8 +104,7 @@ jit_store_memory_emitter::jit_store_memory_emitter(jit_generator* h, cpu_isa_t i store_emitter.reset(new jit_store_emitter(h, isa, src_prc, dst_prc, count, byte_offset)); } -void jit_store_memory_emitter::emit_impl(const std::vector& in, - const std::vector& out) const { +void jit_store_memory_emitter::emit_impl(const std::vector& in, const std::vector& out) const { if (host_isa_ == dnnl::impl::cpu::aarch64::asimd) { emit_isa(in, out); } else { @@ -109,7 +113,7 @@ void jit_store_memory_emitter::emit_impl(const std::vector& in, } template -void jit_store_memory_emitter::emit_isa(const std::vector &in, const std::vector &out) const { +void jit_store_memory_emitter::emit_isa(const std::vector& in, const std::vector& out) const { OV_CPU_JIT_EMITTER_ASSERT(store_emitter != nullptr, "Store CPU emitter isn't initialized!"); store_emitter->emit_code(in, out, aux_vec_idxs, aux_gpr_idxs); @@ -119,6 +123,6 @@ void jit_store_memory_emitter::emit_data() const { store_emitter->emit_data(); } -} // namespace aarch64 -} // namespace intel_cpu -} // namespace ov +} // namespace aarch64 +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/emitters/snippets/aarch64/jit_memory_emitters.hpp b/src/plugins/intel_cpu/src/emitters/snippets/aarch64/jit_memory_emitters.hpp index ba0b4e4acfedb4..edb85751f9086d 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/aarch64/jit_memory_emitters.hpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/aarch64/jit_memory_emitters.hpp @@ -11,11 +11,11 @@ namespace ov { namespace intel_cpu { namespace aarch64 { -class jit_memory_emitter : public jit_emitter { +class jit_memory_emitter : public jit_emitter { public: jit_memory_emitter(dnnl::impl::cpu::aarch64::jit_generator* h, - dnnl::impl::cpu::aarch64::cpu_isa_t isa, - const ov::snippets::lowered::ExpressionPtr& expr); + dnnl::impl::cpu::aarch64::cpu_isa_t isa, + const ov::snippets::lowered::ExpressionPtr& expr); protected: ov::element::Type src_prc; @@ -28,17 +28,18 @@ class jit_memory_emitter : public jit_emitter { class jit_load_memory_emitter : public jit_memory_emitter { public: jit_load_memory_emitter(dnnl::impl::cpu::aarch64::jit_generator* h, - dnnl::impl::cpu::aarch64::cpu_isa_t isa, - const ov::snippets::lowered::ExpressionPtr& expr); + dnnl::impl::cpu::aarch64::cpu_isa_t isa, + const ov::snippets::lowered::ExpressionPtr& expr); - size_t get_inputs_count() const override {return 1;} + size_t get_inputs_count() const override { + return 1; + } private: - void emit_impl(const std::vector& in, - const std::vector& out) const override; + void emit_impl(const std::vector& in, const std::vector& out) const override; template - void emit_isa(const std::vector &in, const std::vector &out) const; + void emit_isa(const std::vector& in, const std::vector& out) const; void emit_data() const override; private: @@ -48,39 +49,41 @@ class jit_load_memory_emitter : public jit_memory_emitter { class jit_load_broadcast_emitter : public jit_memory_emitter { public: jit_load_broadcast_emitter(dnnl::impl::cpu::aarch64::jit_generator* h, - 
dnnl::impl::cpu::aarch64::cpu_isa_t isa, - const ov::snippets::lowered::ExpressionPtr& expr); + dnnl::impl::cpu::aarch64::cpu_isa_t isa, + const ov::snippets::lowered::ExpressionPtr& expr); - size_t get_inputs_count() const override {return 1;} + size_t get_inputs_count() const override { + return 1; + } private: - void emit_impl(const std::vector& in, - const std::vector& out) const override; + void emit_impl(const std::vector& in, const std::vector& out) const override; template - void emit_isa(const std::vector &in, const std::vector &out) const; + void emit_isa(const std::vector& in, const std::vector& out) const; }; -class jit_store_memory_emitter : public jit_memory_emitter { +class jit_store_memory_emitter : public jit_memory_emitter { public: jit_store_memory_emitter(dnnl::impl::cpu::aarch64::jit_generator* h, - dnnl::impl::cpu::aarch64::cpu_isa_t isa, - const ov::snippets::lowered::ExpressionPtr& expr); + dnnl::impl::cpu::aarch64::cpu_isa_t isa, + const ov::snippets::lowered::ExpressionPtr& expr); - size_t get_inputs_count() const override {return 1;} + size_t get_inputs_count() const override { + return 1; + } private: - void emit_impl(const std::vector& in, - const std::vector& out) const override; + void emit_impl(const std::vector& in, const std::vector& out) const override; template - void emit_isa(const std::vector &in, const std::vector &out) const; + void emit_isa(const std::vector& in, const std::vector& out) const; void emit_data() const override; private: std::unique_ptr store_emitter = nullptr; }; -} // namespace aarch64 -} // namespace intel_cpu -} // namespace ov +} // namespace aarch64 +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/emitters/snippets/aarch64/jit_snippets_emitters.cpp b/src/plugins/intel_cpu/src/emitters/snippets/aarch64/jit_snippets_emitters.cpp index 6529312ae1095a..69fcc7a92fd259 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/aarch64/jit_snippets_emitters.cpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/aarch64/jit_snippets_emitters.cpp @@ -3,6 +3,7 @@ // #include "jit_snippets_emitters.hpp" + #include "cpu/aarch64/jit_generator.hpp" #include "cpu/aarch64/xbyak_aarch64/xbyak_aarch64/xbyak_aarch64_adr.h" #include "emitters/utils.hpp" @@ -17,7 +18,8 @@ using jit_generator = dnnl::impl::cpu::aarch64::jit_generator; using cpu_isa_t = dnnl::impl::cpu::aarch64::cpu_isa_t; using ExpressionPtr = ov::snippets::lowered::ExpressionPtr; -jit_nop_emitter::jit_nop_emitter(jit_generator* h, cpu_isa_t isa, const ExpressionPtr& expr) : aarch64::jit_emitter(h, isa) { +jit_nop_emitter::jit_nop_emitter(jit_generator* h, cpu_isa_t isa, const ExpressionPtr& expr) + : aarch64::jit_emitter(h, isa) { in_out_type_ = emitter_in_out_map::gpr_to_gpr; } @@ -29,14 +31,12 @@ jit_broadcast_move_emitter::jit_broadcast_move_emitter(jit_generator* h, cpu_isa n->get_input_element_type(0), " and ", n->get_output_element_type(0)); - OV_CPU_JIT_EMITTER_ASSERT(n->get_input_element_type(0) == ov::element::f32, - "Only supports FP32 precision."); + OV_CPU_JIT_EMITTER_ASSERT(n->get_input_element_type(0) == ov::element::f32, "Only supports FP32 precision."); byte_size = n->get_input_element_type(0).size(); } -void jit_broadcast_move_emitter::emit_impl(const std::vector& in, - const std::vector& out) const { +void jit_broadcast_move_emitter::emit_impl(const std::vector& in, const std::vector& out) const { if (host_isa_ == dnnl::impl::cpu::aarch64::asimd) { emit_isa(in, out); } else { @@ -45,42 +45,42 @@ void 
jit_broadcast_move_emitter::emit_impl(const std::vector& in, } template -void jit_broadcast_move_emitter::emit_isa(const std::vector &in, const std::vector &out) const { +void jit_broadcast_move_emitter::emit_isa(const std::vector& in, const std::vector& out) const { using TReg = typename dnnl::impl::cpu::aarch64::cpu_isa_traits::TReg; TReg src = TReg(in[0]); TReg dst = TReg(out[0]); switch (byte_size) { - case 4: - h->dup(dst.s, src.s[0]); - break; - default: - OV_CPU_JIT_EMITTER_THROW("Unsupported data size ", byte_size); + case 4: + h->dup(dst.s, src.s[0]); + break; + default: + OV_CPU_JIT_EMITTER_THROW("Unsupported data size ", byte_size); } } -jit_scalar_emitter::jit_scalar_emitter(jit_generator* h, cpu_isa_t isa, const ExpressionPtr& expr) : jit_emitter(h, isa) { +jit_scalar_emitter::jit_scalar_emitter(jit_generator* h, cpu_isa_t isa, const ExpressionPtr& expr) + : jit_emitter(h, isa) { const auto n = expr->get_node(); const auto& precision = n->get_output_element_type(0); switch (precision) { - case element::i32: { - value = ov::as_type_ptr(n)->cast_vector()[0]; - break; - } - case element::f32: { - value = dnnl::impl::float2int(ov::as_type_ptr(n)->cast_vector()[0]); - break; - } - default: { - OV_CPU_JIT_EMITTER_THROW("Doesn't support precision ", precision); - } + case element::i32: { + value = ov::as_type_ptr(n)->cast_vector()[0]; + break; + } + case element::f32: { + value = dnnl::impl::float2int(ov::as_type_ptr(n)->cast_vector()[0]); + break; + } + default: { + OV_CPU_JIT_EMITTER_THROW("Doesn't support precision ", precision); + } } push_arg_entry_of("scalar", value, true); prepare_table(); } -void jit_scalar_emitter::emit_impl(const std::vector& in, - const std::vector& out) const { +void jit_scalar_emitter::emit_impl(const std::vector& in, const std::vector& out) const { if (host_isa_ == dnnl::impl::cpu::aarch64::asimd) { emit_isa(in, out); } else { @@ -89,7 +89,7 @@ void jit_scalar_emitter::emit_impl(const std::vector& in, } template -void jit_scalar_emitter::emit_isa(const std::vector &in, const std::vector &out) const { +void jit_scalar_emitter::emit_isa(const std::vector& in, const std::vector& out) const { using TReg = typename dnnl::impl::cpu::aarch64::cpu_isa_traits::TReg; TReg dst = TReg(out[0]); AdrImm src = table_val("scalar"); @@ -97,6 +97,6 @@ void jit_scalar_emitter::emit_isa(const std::vector &in, const std::vect h->uni_ld1rw(dst.s, src.getXn(), src.getImm()); } -} // namespace aarch64 -} // namespace intel_cpu -} // namespace ov +} // namespace aarch64 +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/emitters/snippets/aarch64/jit_snippets_emitters.hpp b/src/plugins/intel_cpu/src/emitters/snippets/aarch64/jit_snippets_emitters.hpp index 0f05024ed12168..13f9aa70fb2c8e 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/aarch64/jit_snippets_emitters.hpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/aarch64/jit_snippets_emitters.hpp @@ -16,11 +16,12 @@ class jit_nop_emitter : public jit_emitter { dnnl::impl::cpu::aarch64::cpu_isa_t isa, const ov::snippets::lowered::ExpressionPtr& expr); - size_t get_inputs_count() const override {return 0;} + size_t get_inputs_count() const override { + return 0; + } private: - void emit_impl(const std::vector& in, - const std::vector& out) const override {} + void emit_impl(const std::vector& in, const std::vector& out) const override {} }; class jit_broadcast_move_emitter : public jit_emitter { @@ -29,14 +30,15 @@ class jit_broadcast_move_emitter : public jit_emitter { 
dnnl::impl::cpu::aarch64::cpu_isa_t isa, const ov::snippets::lowered::ExpressionPtr& expr); - size_t get_inputs_count() const override {return 1;} + size_t get_inputs_count() const override { + return 1; + } private: - void emit_impl(const std::vector& in, - const std::vector& out) const override; + void emit_impl(const std::vector& in, const std::vector& out) const override; template - void emit_isa(const std::vector &in, const std::vector &out) const; + void emit_isa(const std::vector& in, const std::vector& out) const; private: size_t byte_size = 0lu; @@ -48,22 +50,25 @@ class jit_scalar_emitter : public jit_emitter { dnnl::impl::cpu::aarch64::cpu_isa_t isa, const ov::snippets::lowered::ExpressionPtr& expr); - size_t get_inputs_count() const override {return 0;} + size_t get_inputs_count() const override { + return 0; + } protected: - size_t get_aux_gprs_count() const override {return 1;} + size_t get_aux_gprs_count() const override { + return 1; + } private: - void emit_impl(const std::vector& in, - const std::vector& out) const override; + void emit_impl(const std::vector& in, const std::vector& out) const override; template - void emit_isa(const std::vector &in, const std::vector &out) const; + void emit_isa(const std::vector& in, const std::vector& out) const; private: int32_t value; }; -} // namespace aarch64 -} // namespace intel_cpu -} // namespace ov +} // namespace aarch64 +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/emitters/snippets/cpu_kernel_executor_table.hpp b/src/plugins/intel_cpu/src/emitters/snippets/cpu_kernel_executor_table.hpp index 79e8dcafb218f6..cfe03d21eac19e 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/cpu_kernel_executor_table.hpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/cpu_kernel_executor_table.hpp @@ -4,34 +4,38 @@ #pragma once -#include "snippets/kernel_executor_table.hpp" #include "cache/multi_cache.h" +#include "snippets/kernel_executor_table.hpp" namespace ov { namespace intel_cpu { -template +template class CPUKernelExecutor : public snippets::KernelExecutor { public: - CPUKernelExecutor(ov::intel_cpu::MultiCacheWeakPtr kernel_cache, Conf c) : - snippets::KernelExecutor(std::move(c)), m_kernel_cache(std::move(kernel_cache)) {} + CPUKernelExecutor(ov::intel_cpu::MultiCacheWeakPtr kernel_cache, Conf c) + : snippets::KernelExecutor(std::move(c)), + m_kernel_cache(std::move(kernel_cache)) {} - void update_kernel(const Conf& config, std::shared_ptr& kernel) const override final { // NOLINT + void update_kernel(const Conf& config, std::shared_ptr& kernel) const override final { // NOLINT const auto& cache = m_kernel_cache.lock(); OPENVINO_ASSERT(cache, "Invalid kernel cache pointer in CPUKernelExecutor::update_kernel()"); - const auto& lookup_result = cache->getOrCreate(Key(config), - [this](const Key& k) { - return compile_kernel(k.config); - }); + const auto& lookup_result = cache->getOrCreate(Key(config), [this](const Key& k) { + return compile_kernel(k.config); + }); kernel = lookup_result.first; } protected: struct Key { explicit Key(Conf c) : config{std::move(c)} {} - const Conf config; - size_t hash() const { return config.hash(); } - bool operator==(const Key& rhs) const { return config == rhs.config; } + const Conf config; + size_t hash() const { + return config.hash(); + } + bool operator==(const Key& rhs) const { + return config == rhs.config; + } }; /** Compile kernel managed by KernelExecutor instance. 
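The Key type used by CPUKernelExecutor only needs hash() and operator== for the cache lookup. A minimal sketch of the getOrCreate pattern it relies on, backed by a plain std::unordered_map instead of the real MultiCache (which is not reproduced here), could look like this; compile_fn stands in for compile_kernel and is assumed to return a shared_ptr to the compiled kernel:

    #include <cstddef>
    #include <memory>
    #include <unordered_map>

    // Tiny stand-in for the kernel cache: compile_fn runs only on a cache miss.
    template <typename Key, typename Kernel>
    class tiny_kernel_cache {
    public:
        template <typename CompileFn>
        std::shared_ptr<Kernel> getOrCreate(const Key& key, CompileFn compile_fn) {
            auto it = m_storage.find(key);
            if (it == m_storage.end())
                it = m_storage.emplace(key, compile_fn(key)).first;  // compile once per unique config
            return it->second;
        }

    private:
        struct key_hash {
            std::size_t operator()(const Key& k) const { return k.hash(); }
        };
        std::unordered_map<Key, std::shared_ptr<Kernel>, key_hash> m_storage;
    };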
Will be called only if Kernel is not found in the cache */ virtual std::shared_ptr compile_kernel(const Conf& c) const = 0; @@ -39,5 +43,5 @@ class CPUKernelExecutor : public snippets::KernelExecutor { ov::intel_cpu::MultiCacheWeakPtr m_kernel_cache; }; -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.cpp b/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.cpp index b2758735b2d27a..65741d7031d289 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.cpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.cpp @@ -8,8 +8,8 @@ #include "snippets/utils/utils.hpp" #ifndef OPENVINO_ARCH_ARM64 -#include "transformations/snippets/x64/pass/lowered/brgemm_copy_b_loop_ports_adjuster.hpp" -#include "transformations/snippets/x64/pass/lowered/external_repacking_adjuster.hpp" +# include "transformations/snippets/x64/pass/lowered/brgemm_copy_b_loop_ports_adjuster.hpp" +# include "transformations/snippets/x64/pass/lowered/external_repacking_adjuster.hpp" #endif namespace ov { namespace intel_cpu { @@ -21,7 +21,8 @@ const size_t CPURuntimeConfigurator::rank6D = 6; std::string CPURuntimeConfig::to_string() const { std::stringstream out; out << RuntimeConfig::to_string(); - out << "Loop Parameters:" << "\n"; + out << "Loop Parameters:" + << "\n"; for (size_t i = 0; i < loop_args.size(); ++i) { const auto& loop = loop_args[i]; out << "\t[" << i << "] WA: " << loop.m_work_amount << "\n"; @@ -38,8 +39,8 @@ std::string CPURuntimeConfig::to_string() const { } #endif -CPURuntimeConfigurator::CPURuntimeConfigurator() : ov::snippets::RuntimeConfigurator(std::make_shared()) { -} +CPURuntimeConfigurator::CPURuntimeConfigurator() + : ov::snippets::RuntimeConfigurator(std::make_shared()) {} void CPURuntimeConfigurator::initialization(const ov::snippets::lowered::LinearIRCPtr& linear_ir) { RuntimeConfigurator::initialization(linear_ir); @@ -78,12 +79,14 @@ void CPURuntimeConfigurator::update_loop_args(const ov::snippets::lowered::Linea const auto& data_sizes = loop_info->get_data_sizes(); auto& loop_arg = cpu_config->loop_args[idx]; - loop_arg = jit_snippets_call_args::loop_args_t(loop_info->get_work_amount(), loop_info->get_ptr_increments(), loop_info->get_finalization_offsets()); + loop_arg = jit_snippets_call_args::loop_args_t(loop_info->get_work_amount(), + loop_info->get_ptr_increments(), + loop_info->get_finalization_offsets()); for (int64_t i = 0; i < loop_arg.m_num_data_ptrs; ++i) { loop_arg.m_ptr_increments[i] *= (increment * data_sizes[i]); loop_arg.m_finalization_offsets[i] *= data_sizes[i]; } } } -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.hpp b/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.hpp index 42ce35a3c66c2b..1706670ce870d1 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.hpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.hpp @@ -34,6 +34,7 @@ class CPURuntimeConfigurator : public ov::snippets::RuntimeConfigurator { * @param linear_ir LinearIR */ void update_loop_args(const ov::snippets::lowered::LinearIRCPtr& linear_ir) const; + protected: void update(const ov::snippets::lowered::LinearIRCPtr& linear_ir) override; void update_tensor_rank(const ov::snippets::VectorDims& master_shape) const override; @@ 
-43,5 +44,5 @@ class CPURuntimeConfigurator : public ov::snippets::RuntimeConfigurator { static const size_t rank6D; }; -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/emitters/snippets/jit_container_emitter.cpp b/src/plugins/intel_cpu/src/emitters/snippets/jit_container_emitter.cpp index 6f78c43fd54797..ceee57f3c0cd28 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/jit_container_emitter.cpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/jit_container_emitter.cpp @@ -3,15 +3,18 @@ // #include "jit_container_emitter.hpp" + #include "emitters/utils.hpp" #include "utils/general_utils.h" namespace ov { namespace intel_cpu { -void jit_container_emitter::map_abstract_registers(mapping_info& gpr_map_pool, mapping_info& vec_map_pool, +void jit_container_emitter::map_abstract_registers(mapping_info& gpr_map_pool, + mapping_info& vec_map_pool, snippets::lowered::LinearIR::container& expressions) const { - OV_CPU_JIT_EMITTER_ASSERT(!expressions.empty(), "Cannot map registers when there is no allocated_emitters provided"); + OV_CPU_JIT_EMITTER_ASSERT(!expressions.empty(), + "Cannot map registers when there is no allocated_emitters provided"); auto map_regs = [&](const std::vector& abstract_regs) { std::vector physical_regs = abstract_regs; @@ -19,13 +22,16 @@ void jit_container_emitter::map_abstract_registers(mapping_info& gpr_map_pool, m const auto& abstract_reg = abstract_regs[i]; const auto& type = abstract_reg.type; const auto& abstract = abstract_reg.idx; - OV_CPU_JIT_EMITTER_ASSERT(one_of(type, snippets::RegType::gpr, snippets::RegType::vec), "Incorrect reg type detected!"); + OV_CPU_JIT_EMITTER_ASSERT(one_of(type, snippets::RegType::gpr, snippets::RegType::vec), + "Incorrect reg type detected!"); auto& mapping = type == snippets::RegType::gpr ? gpr_map_pool : vec_map_pool; auto& abstract_to_physical = mapping.first; auto& regs_pool = mapping.second; auto& physical = physical_regs[i]; if (abstract_to_physical.count(abstract) == 0) { - OV_CPU_JIT_EMITTER_ASSERT(!regs_pool.empty(), "Cannot map registers for jit_container_emitter: not enough regs in the pool"); + OV_CPU_JIT_EMITTER_ASSERT( + !regs_pool.empty(), + "Cannot map registers for jit_container_emitter: not enough regs in the pool"); physical.idx = regs_pool.back(); regs_pool.pop_back(); abstract_to_physical[abstract] = physical.idx; @@ -48,5 +54,5 @@ void jit_container_emitter::map_abstract_registers(mapping_info& gpr_map_pool, m } } -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/emitters/snippets/jit_container_emitter.hpp b/src/plugins/intel_cpu/src/emitters/snippets/jit_container_emitter.hpp index 2325c6ef1a2eb3..7737e7e1150926 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/jit_container_emitter.hpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/jit_container_emitter.hpp @@ -20,8 +20,10 @@ class jit_container_emitter { protected: // maps gpr and vec abstract registers to physical ones. 
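The mapping routine in jit_container_emitter above follows a simple pool discipline: the first time an abstract index is seen it is bound to a physical register popped from the pool, and later uses of the same abstract index reuse that binding. A condensed sketch of that single step, with deliberately simplified types (the real code works on mapping_info pairs and snippets register descriptors):

    #include <cstddef>
    #include <map>
    #include <stdexcept>
    #include <vector>

    // First use of an abstract index takes a free physical register from the pool;
    // subsequent uses return the remembered physical index.
    inline std::size_t map_abstract_to_physical(std::size_t abstract,
                                                std::map<std::size_t, std::size_t>& abstract_to_physical,
                                                std::vector<std::size_t>& regs_pool) {
        const auto it = abstract_to_physical.find(abstract);
        if (it != abstract_to_physical.end())
            return it->second;
        if (regs_pool.empty())
            throw std::runtime_error("not enough regs in the pool");
        const std::size_t physical = regs_pool.back();
        regs_pool.pop_back();
        abstract_to_physical[abstract] = physical;
        return physical;
    }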
- void map_abstract_registers(mapping_info& gpr_map_pool, mapping_info& vec_map_pool, snippets::lowered::LinearIR::container& expressions) const; + void map_abstract_registers(mapping_info& gpr_map_pool, + mapping_info& vec_map_pool, + snippets::lowered::LinearIR::container& expressions) const; }; -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/emitters/snippets/jit_snippets_call_args.cpp b/src/plugins/intel_cpu/src/emitters/snippets/jit_snippets_call_args.cpp index 48f98c2ffb2450..20e19bcba7e4f4 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/jit_snippets_call_args.cpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/jit_snippets_call_args.cpp @@ -3,10 +3,11 @@ // #include "jit_snippets_call_args.hpp" -#include "emitters/utils.hpp" #include +#include "emitters/utils.hpp" + namespace ov { namespace intel_cpu { @@ -21,16 +22,19 @@ void jit_snippets_call_args::register_loops(const std::vector& loop std::copy(loops.begin(), loops.end(), loop_args); } -jit_snippets_call_args::loop_args_t::loop_args_t(int64_t work_amount, const std::vector& ptr_increments, +jit_snippets_call_args::loop_args_t::loop_args_t(int64_t work_amount, + const std::vector& ptr_increments, const std::vector& finalization_offsets) : m_work_amount(work_amount) { - OV_CPU_JIT_EMITTER_ASSERT(ptr_increments.size() == finalization_offsets.size(), "Inconsistent sizes of ptr_increments and finalization_offsets"); + OV_CPU_JIT_EMITTER_ASSERT(ptr_increments.size() == finalization_offsets.size(), + "Inconsistent sizes of ptr_increments and finalization_offsets"); m_num_data_ptrs = static_cast(ptr_increments.size()); init_pointers_and_copy_data(m_num_data_ptrs, ptr_increments.data(), finalization_offsets.data()); } jit_snippets_call_args::loop_args_t::loop_args_t(const loop_args_t& other) - : m_work_amount(other.m_work_amount), m_num_data_ptrs(other.m_num_data_ptrs) { + : m_work_amount(other.m_work_amount), + m_num_data_ptrs(other.m_num_data_ptrs) { init_pointers_and_copy_data(m_num_data_ptrs, other.m_ptr_increments, other.m_finalization_offsets); } @@ -44,7 +48,8 @@ jit_snippets_call_args::loop_args_t& jit_snippets_call_args::loop_args_t::operat return *this; } -void jit_snippets_call_args::loop_args_t::init_pointers_and_copy_data(const int64_t num_elements, const int64_t* ptr_increments, +void jit_snippets_call_args::loop_args_t::init_pointers_and_copy_data(const int64_t num_elements, + const int64_t* ptr_increments, const int64_t* finalization_offsets) { const size_t chunk_size = num_elements * sizeof(int64_t); m_ptr_increments = new int64_t[num_elements]; @@ -60,5 +65,5 @@ void swap(jit_snippets_call_args::loop_args_t& first, jit_snippets_call_args::lo std::swap(first.m_finalization_offsets, second.m_finalization_offsets); } -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/emitters/snippets/jit_snippets_call_args.hpp b/src/plugins/intel_cpu/src/emitters/snippets/jit_snippets_call_args.hpp index 027655d493784d..eb74190dd71676 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/jit_snippets_call_args.hpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/jit_snippets_call_args.hpp @@ -4,9 +4,9 @@ #pragma once -#include -#include #include +#include +#include #include "dnnl_types.h" #include "openvino/core/visibility.hpp" @@ -15,12 +15,12 @@ namespace ov { namespace intel_cpu { #if defined(OPENVINO_ARCH_ARM64) -#define SNIPPETS_MAX_DATA_PTR_COUNT 23 +# define 
SNIPPETS_MAX_DATA_PTR_COUNT 23 #else -#define SNIPPETS_MAX_DATA_PTR_COUNT 11 +# define SNIPPETS_MAX_DATA_PTR_COUNT 11 #endif -#define GET_OFF(field) offsetof(jit_snippets_call_args, field) +#define GET_OFF(field) offsetof(jit_snippets_call_args, field) #define GET_OFF_LOOP_ARGS(field) offsetof(jit_snippets_call_args::loop_args_t, field) struct amx_tile_config_t { @@ -37,9 +37,9 @@ struct jit_snippets_call_args { void register_loops(const std::vector& loops); - const void *src_ptrs[SNIPPETS_MAX_DATA_PTR_COUNT] = {}; - void *dst_ptrs[SNIPPETS_MAX_DATA_PTR_COUNT] = {}; - void *buffer_scratchpad_ptr = nullptr; + const void* src_ptrs[SNIPPETS_MAX_DATA_PTR_COUNT] = {}; + void* dst_ptrs[SNIPPETS_MAX_DATA_PTR_COUNT] = {}; + void* buffer_scratchpad_ptr = nullptr; // Note: Ideally loop_args must be private, since we manage this pointer manually. // However, standard-layout class definition (to use offset_of) requires the same access specifier @@ -51,14 +51,18 @@ struct jit_snippets_call_args { struct jit_snippets_call_args::loop_args_t { loop_args_t() = default; - loop_args_t(int64_t work_amount, const std::vector& ptr_increments, const std::vector& finalization_offsets); + loop_args_t(int64_t work_amount, + const std::vector& ptr_increments, + const std::vector& finalization_offsets); loop_args_t(const loop_args_t& other); ~loop_args_t(); loop_args_t& operator=(loop_args_t other); friend void swap(loop_args_t& first, loop_args_t& second); - void init_pointers_and_copy_data(const int64_t num_elements, const int64_t* ptr_increments, const int64_t* finalization_offsets); + void init_pointers_and_copy_data(const int64_t num_elements, + const int64_t* ptr_increments, + const int64_t* finalization_offsets); int64_t m_work_amount = 0; int64_t m_num_data_ptrs = 0; @@ -71,5 +75,5 @@ struct jit_snippets_compile_args { std::vector exec_domain = {}; }; -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/emitters/snippets/utils/debug_caps_config.cpp b/src/plugins/intel_cpu/src/emitters/snippets/utils/debug_caps_config.cpp index b7c51539861ff8..e4c3c40e1d8120 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/utils/debug_caps_config.cpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/utils/debug_caps_config.cpp @@ -3,7 +3,7 @@ // #ifdef SNIPPETS_DEBUG_CAPS -#include "debug_caps_config.hpp" +# include "debug_caps_config.hpp" namespace ov { namespace intel_cpu { @@ -20,7 +20,7 @@ void SnippetsDebugCapsConfig::readProperties() { enable_segfault_detector = readEnv("OV_CPU_SNIPPETS_SEGFAULT_DETECTOR") ? 
true : false; } -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov -#endif // SNIPPETS_DEBUG_CAPS +#endif // SNIPPETS_DEBUG_CAPS diff --git a/src/plugins/intel_cpu/src/emitters/snippets/utils/debug_caps_config.hpp b/src/plugins/intel_cpu/src/emitters/snippets/utils/debug_caps_config.hpp index 14dcae0ddf0c69..8f01e85063f5e9 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/utils/debug_caps_config.hpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/utils/debug_caps_config.hpp @@ -3,10 +3,10 @@ // #ifdef SNIPPETS_DEBUG_CAPS -#pragma once +# pragma once -#include -#include +# include +# include namespace ov { namespace intel_cpu { @@ -23,7 +23,7 @@ class SnippetsDebugCapsConfig { void readProperties(); }; -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov -#endif // SNIPPETS_DEBUG_CAPS +#endif // SNIPPETS_DEBUG_CAPS diff --git a/src/plugins/intel_cpu/src/emitters/snippets/x64/cpu_generator.cpp b/src/plugins/intel_cpu/src/emitters/snippets/x64/cpu_generator.cpp index c210782db8f91c..39e384837856a1 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/x64/cpu_generator.cpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/x64/cpu_generator.cpp @@ -4,63 +4,61 @@ #include "cpu_generator.hpp" -#include "snippets/snippets_isa.hpp" -#include "emitters/snippets/cpu_runtime_configurator.hpp" +#include +#include "emitters/plugin/x64/jit_conversion_emitters.hpp" +#include "emitters/plugin/x64/jit_dnnl_emitters.hpp" +#include "emitters/plugin/x64/jit_dnnl_ext_emitters.hpp" +#include "emitters/plugin/x64/jit_eltwise_emitters.hpp" +#include "emitters/snippets/cpu_kernel_executor_table.hpp" +#include "emitters/snippets/cpu_runtime_configurator.hpp" #include "emitters/snippets/x64/jit_brgemm_copy_b_emitter.hpp" #include "emitters/snippets/x64/jit_brgemm_emitter.hpp" -#include "emitters/snippets/x64/jit_memory_emitters.hpp" +#include "emitters/snippets/x64/jit_fill_emitter.hpp" +#include "emitters/snippets/x64/jit_horizon_emitter.hpp" #include "emitters/snippets/x64/jit_kernel_emitter.hpp" #include "emitters/snippets/x64/jit_loop_emitters.hpp" +#include "emitters/snippets/x64/jit_memory_emitters.hpp" #include "emitters/snippets/x64/jit_snippets_emitters.hpp" -#include "emitters/snippets/x64/jit_fill_emitter.hpp" -#include "emitters/snippets/x64/jit_horizon_emitter.hpp" -#include "emitters/plugin/x64/jit_eltwise_emitters.hpp" -#include "emitters/plugin/x64/jit_dnnl_emitters.hpp" -#include "emitters/plugin/x64/jit_dnnl_ext_emitters.hpp" -#include "emitters/plugin/x64/jit_conversion_emitters.hpp" - -#include "transformations/snippets/x64/op/load_convert.hpp" -#include "transformations/snippets/x64/op/store_convert.hpp" +#include "snippets/snippets_isa.hpp" +#include "transformations/cpu_opset/common/op/swish_cpu.hpp" #include "transformations/snippets/common/op/fused_mul_add.hpp" #include "transformations/snippets/x64/op/brgemm_copy_b.hpp" #include "transformations/snippets/x64/op/brgemm_cpu.hpp" +#include "transformations/snippets/x64/op/load_convert.hpp" #include "transformations/snippets/x64/op/perf_count_rdtsc.hpp" -#include "transformations/cpu_opset/common/op/swish_cpu.hpp" +#include "transformations/snippets/x64/op/store_convert.hpp" #include "transformations/snippets/x64/pass/lowered/fuse_load_store_and_convert.hpp" -#include -#include "emitters/snippets/cpu_kernel_executor_table.hpp" - #ifdef SNIPPETS_DEBUG_CAPS -#include "emitters/snippets/x64/jit_perf_count_chrono_emitters.hpp" -#include 
"emitters/snippets/x64/jit_perf_count_rdtsc_emitters.hpp" -#include "transformations/snippets/x64/op/perf_count_rdtsc.hpp" -#include "emitters/snippets/x64/jit_debug_emitter.hpp" -#include "emitters/snippets/x64/jit_segfault_detector_emitter.hpp" -#include "emitters/snippets/x64/verbose.hpp" +# include "emitters/snippets/x64/jit_debug_emitter.hpp" +# include "emitters/snippets/x64/jit_perf_count_chrono_emitters.hpp" +# include "emitters/snippets/x64/jit_perf_count_rdtsc_emitters.hpp" +# include "emitters/snippets/x64/jit_segfault_detector_emitter.hpp" +# include "emitters/snippets/x64/verbose.hpp" +# include "transformations/snippets/x64/op/perf_count_rdtsc.hpp" #endif #ifdef SNIPPETS_LIBXSMM_TPP -#include "transformations/tpp/x64/op/brgemm.hpp" -#include "transformations/tpp/x64/op/eltwise.hpp" -#include "transformations/tpp/x64/op/reduce.hpp" -#include "transformations/tpp/x64/op/modifiers.hpp" -#include "transformations/tpp/x64/op/scalar.hpp" -#include "transformations/tpp/x64/op/equation.hpp" -#include "emitters/tpp/x64/jit_eltwise_emitters.hpp" -#include "emitters/tpp/x64/jit_brgemm_emitter.hpp" -#include "emitters/tpp/x64/jit_scalar_emitter.hpp" -#include "emitters/tpp/x64/jit_equation_emitter.hpp" -#include "emitters/tpp/x64/jit_debug_emitter.hpp" +# include "emitters/tpp/x64/jit_brgemm_emitter.hpp" +# include "emitters/tpp/x64/jit_debug_emitter.hpp" +# include "emitters/tpp/x64/jit_eltwise_emitters.hpp" +# include "emitters/tpp/x64/jit_equation_emitter.hpp" +# include "emitters/tpp/x64/jit_scalar_emitter.hpp" +# include "transformations/tpp/x64/op/brgemm.hpp" +# include "transformations/tpp/x64/op/eltwise.hpp" +# include "transformations/tpp/x64/op/equation.hpp" +# include "transformations/tpp/x64/op/modifiers.hpp" +# include "transformations/tpp/x64/op/reduce.hpp" +# include "transformations/tpp/x64/op/scalar.hpp" // Note: for reference implementations -#include +# include #endif namespace ov { #ifdef SNIPPETS_DEBUG_CAPS -static bool is_load_emitter(const intel_cpu::jit_emitter *emitter) { +static bool is_load_emitter(const intel_cpu::jit_emitter* emitter) { bool ret = false; if (dynamic_cast(emitter) || dynamic_cast(emitter)) { @@ -69,7 +67,7 @@ static bool is_load_emitter(const intel_cpu::jit_emitter *emitter) { return ret; } -static bool is_store_emitter(const intel_cpu::jit_emitter *emitter) { +static bool is_store_emitter(const intel_cpu::jit_emitter* emitter) { bool ret = false; if (dynamic_cast(emitter)) { return true; @@ -77,72 +75,82 @@ static bool is_store_emitter(const intel_cpu::jit_emitter *emitter) { return ret; } -static bool is_segfault_detector_emitter(const intel_cpu::jit_emitter *emitter) { +static bool is_segfault_detector_emitter(const intel_cpu::jit_emitter* emitter) { // default active for typical tensor memory access emitters bool ret = false; - ret = is_load_emitter(emitter) || - is_store_emitter(emitter) || - dynamic_cast(emitter) || - dynamic_cast(emitter) || - dynamic_cast(emitter); + ret = is_load_emitter(emitter) || is_store_emitter(emitter) || + dynamic_cast(emitter) || + dynamic_cast(emitter) || + dynamic_cast(emitter); return ret; // use below code to active all emitters for extend usage // return !dynamic_cast(emitter); } -#define CREATE_SNIPPETS_EMITTER(e_type, ...) 
{ \ - [this](const snippets::lowered::ExpressionPtr& expr) -> std::shared_ptr { \ - auto emitter = std::make_shared(h.get(), isa, expr, ##__VA_ARGS__); \ - if (debug_config.enable_segfault_detector && is_segfault_detector_emitter(emitter.get())) { \ - auto segfault_emitter = std::make_shared(h.get(), isa, emitter.get(), \ - is_load_emitter(emitter.get()), is_store_emitter(emitter.get()), expr->get_node()->get_friendly_name()); \ - return std::make_shared(emitter, segfault_emitter, jit_debug_emitter::EmissionLocation::preamble); \ - } else { \ - return emitter; \ - } \ - }, \ - [](const std::shared_ptr& n) -> std::set> { \ - return e_type::get_supported_precisions(n); \ - } \ -} +# define CREATE_SNIPPETS_EMITTER(e_type, ...) \ + { \ + [this](const snippets::lowered::ExpressionPtr& expr) -> std::shared_ptr { \ + auto emitter = std::make_shared(h.get(), isa, expr, ##__VA_ARGS__); \ + if (debug_config.enable_segfault_detector && is_segfault_detector_emitter(emitter.get())) { \ + auto segfault_emitter = \ + std::make_shared(h.get(), \ + isa, \ + emitter.get(), \ + is_load_emitter(emitter.get()), \ + is_store_emitter(emitter.get()), \ + expr->get_node()->get_friendly_name()); \ + return std::make_shared(emitter, \ + segfault_emitter, \ + jit_debug_emitter::EmissionLocation::preamble); \ + } else { \ + return emitter; \ + } \ + }, \ + [](const std::shared_ptr& n) -> std::set> { \ + return e_type::get_supported_precisions(n); \ + } \ + } #else -#define CREATE_SNIPPETS_EMITTER(e_type, ...) { \ - [this](const snippets::lowered::ExpressionPtr& expr) -> std::shared_ptr { \ - return std::make_shared(h.get(), isa, expr, ##__VA_ARGS__); \ - }, \ - [](const std::shared_ptr& n) -> std::set> { \ - return e_type::get_supported_precisions(n); \ - } \ -} +# define CREATE_SNIPPETS_EMITTER(e_type, ...) 
\ + { \ + [this](const snippets::lowered::ExpressionPtr& expr) -> std::shared_ptr { \ + return std::make_shared(h.get(), isa, expr, ##__VA_ARGS__); \ + }, \ + [](const std::shared_ptr& n) -> std::set> { \ + return e_type::get_supported_precisions(n); \ + } \ + } #endif -#define CREATE_DEBUG_TPP_EMITTER(e_type) { \ - [this](const snippets::lowered::ExpressionPtr& expr) -> std::shared_ptr { \ - return std::make_shared(expr, std::make_shared(h.get(), isa, expr)); \ - }, \ - [](const std::shared_ptr& n) -> std::set> { \ - return e_type::get_supported_precisions(n); \ - } \ -} - +#define CREATE_DEBUG_TPP_EMITTER(e_type) \ + { \ + [this](const snippets::lowered::ExpressionPtr& expr) -> std::shared_ptr { \ + return std::make_shared(expr, std::make_shared(h.get(), isa, expr)); \ + }, \ + [](const std::shared_ptr& n) -> std::set> { \ + return e_type::get_supported_precisions(n); \ + } \ + } -#define CREATE_CPU_EMITTER(e_type) { \ - [this](const snippets::lowered::ExpressionPtr& expr) -> std::shared_ptr { \ - return std::make_shared(h.get(), isa, expr->get_node()); \ - }, \ - [](const std::shared_ptr& n) -> std::set> { \ - return e_type::get_supported_precisions(n); \ - } \ -} +#define CREATE_CPU_EMITTER(e_type) \ + { \ + [this](const snippets::lowered::ExpressionPtr& expr) -> std::shared_ptr { \ + return std::make_shared(h.get(), isa, expr->get_node()); \ + }, \ + [](const std::shared_ptr& n) -> std::set> { \ + return e_type::get_supported_precisions(n); \ + } \ + } -#define CREATE_UNDEFINED_EMITTER(supported_precisions) { \ - [](const snippets::lowered::ExpressionPtr& expr) -> std::shared_ptr { \ - return nullptr; \ - }, \ - [](const std::shared_ptr& n) -> std::set> { \ - return supported_precisions; \ - } \ -} +#define CREATE_UNDEFINED_EMITTER(supported_precisions) \ + { \ + [](const snippets::lowered::ExpressionPtr& expr) -> std::shared_ptr { \ + return nullptr; \ + }, \ + [](const std::shared_ptr& n) -> std::set> { \ + return supported_precisions; \ + } \ + } class jit_snippet : public dnnl::impl::cpu::x64::jit_generator { public: @@ -157,30 +165,43 @@ class jit_snippet : public dnnl::impl::cpu::x64::jit_generator { intel_cpu::CPUTargetMachine::CPUTargetMachine(dnnl::impl::cpu::x64::cpu_isa_t host_isa, ov::intel_cpu::MultiCacheWeakPtr cache) - : TargetMachine(std::make_shared()), h(new jit_snippet()), isa(host_isa), compiled_kernel_cache(std::move(cache)) { + : TargetMachine(std::make_shared()), + h(new jit_snippet()), + isa(host_isa), + compiled_kernel_cache(std::move(cache)) { // data movement jitters[op::v0::Parameter::get_type_info_static()] = CREATE_SNIPPETS_EMITTER(intel_cpu::jit_nop_emitter); jitters[op::v0::Result::get_type_info_static()] = CREATE_SNIPPETS_EMITTER(intel_cpu::jit_nop_emitter); jitters[snippets::op::Buffer::get_type_info_static()] = CREATE_SNIPPETS_EMITTER(intel_cpu::jit_nop_emitter); jitters[snippets::op::VectorBuffer::get_type_info_static()] = CREATE_SNIPPETS_EMITTER(intel_cpu::jit_nop_emitter); - jitters[snippets::op::RankNormalization::get_type_info_static()] = CREATE_SNIPPETS_EMITTER(intel_cpu::jit_nop_emitter); + jitters[snippets::op::RankNormalization::get_type_info_static()] = + CREATE_SNIPPETS_EMITTER(intel_cpu::jit_nop_emitter); jitters[snippets::op::Reshape::get_type_info_static()] = CREATE_SNIPPETS_EMITTER(intel_cpu::jit_nop_emitter); jitters[snippets::op::Load::get_type_info_static()] = CREATE_SNIPPETS_EMITTER(intel_cpu::jit_load_memory_emitter); - jitters[snippets::op::LoadReshape::get_type_info_static()] = 
CREATE_SNIPPETS_EMITTER(intel_cpu::jit_load_memory_emitter); - jitters[snippets::op::BroadcastLoad::get_type_info_static()] = CREATE_SNIPPETS_EMITTER(intel_cpu::jit_load_broadcast_emitter); - jitters[intel_cpu::LoadConvertSaturation::get_type_info_static()] = CREATE_SNIPPETS_EMITTER(intel_cpu::jit_load_memory_emitter); - jitters[intel_cpu::LoadConvertTruncation::get_type_info_static()] = CREATE_SNIPPETS_EMITTER(intel_cpu::jit_load_memory_emitter); + jitters[snippets::op::LoadReshape::get_type_info_static()] = + CREATE_SNIPPETS_EMITTER(intel_cpu::jit_load_memory_emitter); + jitters[snippets::op::BroadcastLoad::get_type_info_static()] = + CREATE_SNIPPETS_EMITTER(intel_cpu::jit_load_broadcast_emitter); + jitters[intel_cpu::LoadConvertSaturation::get_type_info_static()] = + CREATE_SNIPPETS_EMITTER(intel_cpu::jit_load_memory_emitter); + jitters[intel_cpu::LoadConvertTruncation::get_type_info_static()] = + CREATE_SNIPPETS_EMITTER(intel_cpu::jit_load_memory_emitter); jitters[snippets::op::Store::get_type_info_static()] = CREATE_SNIPPETS_EMITTER(intel_cpu::jit_store_memory_emitter); - jitters[intel_cpu::StoreConvertSaturation::get_type_info_static()] = CREATE_SNIPPETS_EMITTER(intel_cpu::jit_store_memory_emitter); - jitters[intel_cpu::StoreConvertTruncation::get_type_info_static()] = CREATE_SNIPPETS_EMITTER(intel_cpu::jit_store_memory_emitter); + jitters[intel_cpu::StoreConvertSaturation::get_type_info_static()] = + CREATE_SNIPPETS_EMITTER(intel_cpu::jit_store_memory_emitter); + jitters[intel_cpu::StoreConvertTruncation::get_type_info_static()] = + CREATE_SNIPPETS_EMITTER(intel_cpu::jit_store_memory_emitter); jitters[snippets::op::Scalar::get_type_info_static()] = CREATE_SNIPPETS_EMITTER(intel_cpu::jit_scalar_emitter); - jitters[snippets::op::BroadcastMove::get_type_info_static()] = CREATE_SNIPPETS_EMITTER(intel_cpu::jit_broadcast_move_emitter); + jitters[snippets::op::BroadcastMove::get_type_info_static()] = + CREATE_SNIPPETS_EMITTER(intel_cpu::jit_broadcast_move_emitter); - jitters[snippets::op::ConvertTruncation::get_type_info_static()] = CREATE_CPU_EMITTER(intel_cpu::jit_convert_truncation_emitter); - jitters[snippets::op::ConvertSaturation::get_type_info_static()] = CREATE_CPU_EMITTER(intel_cpu::jit_convert_saturation_emitter); + jitters[snippets::op::ConvertTruncation::get_type_info_static()] = + CREATE_CPU_EMITTER(intel_cpu::jit_convert_truncation_emitter); + jitters[snippets::op::ConvertSaturation::get_type_info_static()] = + CREATE_CPU_EMITTER(intel_cpu::jit_convert_saturation_emitter); // ternary jitters[op::v1::Select::get_type_info_static()] = CREATE_CPU_EMITTER(intel_cpu::jit_select_emitter); @@ -203,10 +224,12 @@ intel_cpu::CPUTargetMachine::CPUTargetMachine(dnnl::impl::cpu::x64::cpu_isa_t ho jitters[op::v1::Mod::get_type_info_static()] = CREATE_CPU_EMITTER(intel_cpu::jit_mod_emitter); jitters[op::v1::Multiply::get_type_info_static()] = CREATE_CPU_EMITTER(intel_cpu::jit_multiply_emitter); jitters[op::v1::NotEqual::get_type_info_static()] = CREATE_CPU_EMITTER(intel_cpu::jit_not_equal_emitter); - jitters[snippets::op::PowerStatic::get_type_info_static()] = CREATE_CPU_EMITTER(intel_cpu::jit_power_static_emitter); + jitters[snippets::op::PowerStatic::get_type_info_static()] = + CREATE_CPU_EMITTER(intel_cpu::jit_power_static_emitter); jitters[op::v1::Power::get_type_info_static()] = CREATE_CPU_EMITTER(intel_cpu::jit_power_dynamic_emitter); jitters[op::v0::PRelu::get_type_info_static()] = CREATE_CPU_EMITTER(intel_cpu::jit_prelu_emitter); - 
jitters[op::v0::SquaredDifference::get_type_info_static()] = CREATE_CPU_EMITTER(intel_cpu::jit_squared_difference_emitter); + jitters[op::v0::SquaredDifference::get_type_info_static()] = + CREATE_CPU_EMITTER(intel_cpu::jit_squared_difference_emitter); jitters[op::v1::Subtract::get_type_info_static()] = CREATE_CPU_EMITTER(intel_cpu::jit_subtract_emitter); jitters[op::v0::Xor::get_type_info_static()] = CREATE_CPU_EMITTER(intel_cpu::jit_logical_xor_emitter); @@ -235,25 +258,35 @@ intel_cpu::CPUTargetMachine::CPUTargetMachine(dnnl::impl::cpu::x64::cpu_isa_t ho jitters[snippets::op::HorizonMax::get_type_info_static()] = CREATE_SNIPPETS_EMITTER(intel_cpu::jit_horizon_emitter); jitters[snippets::op::HorizonSum::get_type_info_static()] = CREATE_SNIPPETS_EMITTER(intel_cpu::jit_horizon_emitter); - jitters[snippets::op::KernelStatic::get_type_info_static()] = CREATE_SNIPPETS_EMITTER(intel_cpu::jit_kernel_static_emitter); - jitters[snippets::op::KernelDynamic::get_type_info_static()] = CREATE_SNIPPETS_EMITTER(intel_cpu::jit_kernel_dynamic_emitter); - jitters[snippets::op::LoopBegin::get_type_info_static()] = CREATE_SNIPPETS_EMITTER(intel_cpu::jit_loop_begin_emitter); + jitters[snippets::op::KernelStatic::get_type_info_static()] = + CREATE_SNIPPETS_EMITTER(intel_cpu::jit_kernel_static_emitter); + jitters[snippets::op::KernelDynamic::get_type_info_static()] = + CREATE_SNIPPETS_EMITTER(intel_cpu::jit_kernel_dynamic_emitter); + jitters[snippets::op::LoopBegin::get_type_info_static()] = + CREATE_SNIPPETS_EMITTER(intel_cpu::jit_loop_begin_emitter); jitters[snippets::op::LoopEnd::get_type_info_static()] = CREATE_SNIPPETS_EMITTER(intel_cpu::jit_loop_end_emitter); - // Note: jit_brgemm_emitter and jit_brgemm_copy_b_emitter support runtime recompilation, so their constructor takes additional arguments - jitters[intel_cpu::BrgemmCPU::get_type_info_static()] = CREATE_SNIPPETS_EMITTER(intel_cpu::jit_brgemm_emitter, - configurator->get_kernel_executor_table(), - compiled_kernel_cache); - jitters[intel_cpu::BrgemmCopyB::get_type_info_static()] = CREATE_SNIPPETS_EMITTER(intel_cpu::jit_brgemm_copy_b_emitter, - configurator->get_kernel_executor_table(), - compiled_kernel_cache); + // Note: jit_brgemm_emitter and jit_brgemm_copy_b_emitter support runtime recompilation, so their constructor takes + // additional arguments + jitters[intel_cpu::BrgemmCPU::get_type_info_static()] = + CREATE_SNIPPETS_EMITTER(intel_cpu::jit_brgemm_emitter, + configurator->get_kernel_executor_table(), + compiled_kernel_cache); + jitters[intel_cpu::BrgemmCopyB::get_type_info_static()] = + CREATE_SNIPPETS_EMITTER(intel_cpu::jit_brgemm_copy_b_emitter, + configurator->get_kernel_executor_table(), + compiled_kernel_cache); jitters[snippets::op::ReduceMax::get_type_info_static()] = CREATE_UNDEFINED_EMITTER({{ov::element::f32}}); jitters[snippets::op::ReduceSum::get_type_info_static()] = CREATE_UNDEFINED_EMITTER({{ov::element::f32}}); #ifdef SNIPPETS_DEBUG_CAPS - jitters[snippets::op::PerfCountBegin::get_type_info_static()] = CREATE_CPU_EMITTER(ov::intel_cpu::jit_perf_count_chrono_start_emitter); - jitters[snippets::op::PerfCountEnd::get_type_info_static()] = CREATE_CPU_EMITTER(ov::intel_cpu::jit_perf_count_chrono_end_emitter); - jitters[ov::intel_cpu::PerfCountRdtscBegin::get_type_info_static()] = CREATE_CPU_EMITTER(ov::intel_cpu::jit_perf_count_rdtsc_start_emitter); - jitters[ov::intel_cpu::PerfCountRdtscEnd::get_type_info_static()] = CREATE_CPU_EMITTER(ov::intel_cpu::jit_perf_count_rdtsc_end_emitter); + 
jitters[snippets::op::PerfCountBegin::get_type_info_static()] = + CREATE_CPU_EMITTER(ov::intel_cpu::jit_perf_count_chrono_start_emitter); + jitters[snippets::op::PerfCountEnd::get_type_info_static()] = + CREATE_CPU_EMITTER(ov::intel_cpu::jit_perf_count_chrono_end_emitter); + jitters[ov::intel_cpu::PerfCountRdtscBegin::get_type_info_static()] = + CREATE_CPU_EMITTER(ov::intel_cpu::jit_perf_count_rdtsc_start_emitter); + jitters[ov::intel_cpu::PerfCountRdtscEnd::get_type_info_static()] = + CREATE_CPU_EMITTER(ov::intel_cpu::jit_perf_count_rdtsc_end_emitter); #endif #ifdef SNIPPETS_LIBXSMM_TPP @@ -267,8 +300,8 @@ intel_cpu::CPUTargetMachine::CPUTargetMachine(dnnl::impl::cpu::x64::cpu_isa_t ho // Note: you can register Debug emitter for Unary/Binary operations as shown below: // jitters[intel_cpu::tpp::op::Add::get_type_info_static()] = CREATE_DEBUG_TPP_EMITTER(UnaryEltwiseTppEmitter); // - // Note: you can register Reference emitter for Unary operations using std::function or lambda function as shown below: - // jitters[intel_cpu::tpp::op::Exp::get_type_info_static()] = + // Note: you can register Reference emitter for Unary operations using std::function or lambda function as shown + // below: jitters[intel_cpu::tpp::op::Exp::get_type_info_static()] = // CREATE_SNIPPETS_EMITTER(ReferenceUnaryEltwiseTppEmitter, static_cast(std::exp)); // jitters[intel_cpu::tpp::op::Reciprocal::get_type_info_static()] = // CREATE_SNIPPETS_EMITTER(ReferenceUnaryEltwiseTppEmitter, [](float x){ return 1.f/x; }); @@ -292,10 +325,14 @@ std::shared_ptr intel_cpu::CPUTargetMachine::clone() co size_t intel_cpu::CPUTargetMachine::get_lanes() const { switch (isa) { - case dnnl::impl::cpu::x64::avx2 : return dnnl::impl::cpu::x64::cpu_isa_traits::vlen / sizeof(float); - case dnnl::impl::cpu::x64::sse41 : return dnnl::impl::cpu::x64::cpu_isa_traits::vlen / sizeof(float); - case dnnl::impl::cpu::x64::avx512_core : return dnnl::impl::cpu::x64::cpu_isa_traits::vlen / sizeof(float); - default : OPENVINO_THROW("unknown isa ", isa); + case dnnl::impl::cpu::x64::avx2: + return dnnl::impl::cpu::x64::cpu_isa_traits::vlen / sizeof(float); + case dnnl::impl::cpu::x64::sse41: + return dnnl::impl::cpu::x64::cpu_isa_traits::vlen / sizeof(float); + case dnnl::impl::cpu::x64::avx512_core: + return dnnl::impl::cpu::x64::cpu_isa_traits::vlen / sizeof(float); + default: + OPENVINO_THROW("unknown isa ", isa); } } @@ -315,13 +352,15 @@ snippets::CompiledSnippetPtr intel_cpu::CPUTargetMachine::get_snippet() { if (h->create_kernel() != dnnl::impl::status::success) { OPENVINO_THROW("Failed to create jit_kernel in get_snippet()"); } - const auto& result = std::make_shared(std::unique_ptr(h.release())); + const auto& result = + std::make_shared(std::unique_ptr(h.release())); // Note that we reset all the generated code, since it was copied into CompiledSnippetCPU h.reset(new jit_snippet()); return result; } -intel_cpu::CompiledSnippetCPU::CompiledSnippetCPU(std::unique_ptr h) : h_compiled(std::move(h)) { +intel_cpu::CompiledSnippetCPU::CompiledSnippetCPU(std::unique_ptr h) + : h_compiled(std::move(h)) { OPENVINO_ASSERT(h_compiled && h_compiled->jit_ker(), "Got invalid jit generator or kernel was nopt compiled"); } @@ -337,15 +376,14 @@ bool intel_cpu::CompiledSnippetCPU::empty() const { return get_code_size() == 0; } -intel_cpu::CPUGenerator::CPUGenerator(dnnl::impl::cpu::x64::cpu_isa_t isa_, ov::intel_cpu::MultiCacheWeakPtr cache) : - Generator(std::make_shared(isa_, std::move(cache))) { -} -intel_cpu::CPUGenerator::CPUGenerator(const 
std::shared_ptr& target) : Generator(target) { -} +intel_cpu::CPUGenerator::CPUGenerator(dnnl::impl::cpu::x64::cpu_isa_t isa_, ov::intel_cpu::MultiCacheWeakPtr cache) + : Generator(std::make_shared(isa_, std::move(cache))) {} +intel_cpu::CPUGenerator::CPUGenerator(const std::shared_ptr& target) : Generator(target) {} std::shared_ptr intel_cpu::CPUGenerator::clone() const { const auto& cpu_target_machine = std::dynamic_pointer_cast(target->clone()); - OPENVINO_ASSERT(cpu_target_machine, "Failed to clone CPUGenerator: the instance contains incompatible TargetMachine type"); + OPENVINO_ASSERT(cpu_target_machine, + "Failed to clone CPUGenerator: the instance contains incompatible TargetMachine type"); return std::make_shared(cpu_target_machine); } @@ -358,12 +396,11 @@ ov::snippets::RegType intel_cpu::CPUGenerator::get_specific_op_out_reg_type(cons #endif std::dynamic_pointer_cast(op)) return ov::snippets::RegType::gpr; - else if ( - std::dynamic_pointer_cast(op) || - std::dynamic_pointer_cast(op)) + else if (std::dynamic_pointer_cast(op) || + std::dynamic_pointer_cast(op)) return ov::snippets::RegType::vec; else - return ov::snippets::RegType::undefined; + return ov::snippets::RegType::undefined; } bool intel_cpu::CPUGenerator::uses_precompiled_kernel(const std::shared_ptr& e) const { @@ -383,4 +420,4 @@ bool intel_cpu::CPUGenerator::uses_precompiled_kernel(const std::shared_ptr h_compiled; + public: const uint8_t* get_code() const override; size_t get_code_size() const override; @@ -31,8 +30,7 @@ class CompiledSnippetCPU : public snippets::CompiledSnippet { class CPUTargetMachine : public snippets::TargetMachine { public: - explicit CPUTargetMachine(dnnl::impl::cpu::x64::cpu_isa_t host_isa, - ov::intel_cpu::MultiCacheWeakPtr); + explicit CPUTargetMachine(dnnl::impl::cpu::x64::cpu_isa_t host_isa, ov::intel_cpu::MultiCacheWeakPtr); std::shared_ptr clone() const override; bool is_supported() const override; snippets::CompiledSnippetPtr get_snippet() override; @@ -60,5 +58,5 @@ class CPUGenerator : public snippets::Generator { bool uses_precompiled_kernel(const std::shared_ptr& emitter) const override; }; -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_brgemm_copy_b_emitter.cpp b/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_brgemm_copy_b_emitter.cpp index e68ab224407c7b..6df658d8d72d0c 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_brgemm_copy_b_emitter.cpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_brgemm_copy_b_emitter.cpp @@ -4,18 +4,15 @@ #include "jit_brgemm_copy_b_emitter.hpp" +#include +#include + #include "emitters/plugin/x64/utils.hpp" #include "emitters/snippets/x64/utils.hpp" - -#include "snippets/utils/utils.hpp" #include "snippets/lowered/expression.hpp" - +#include "snippets/utils/utils.hpp" #include "transformations/snippets/x64/op/brgemm_cpu.hpp" -#include -#include - - using namespace Xbyak; using namespace dnnl::impl; using namespace dnnl::impl::cpu::x64; @@ -34,7 +31,9 @@ bool get_is_transposed(const ov::snippets::lowered::ExpressionPtr& expr) { } } // namespace -jit_brgemm_copy_b_emitter::jit_brgemm_copy_b_emitter(jit_generator* h, cpu_isa_t isa, const ov::snippets::lowered::ExpressionPtr& expr, +jit_brgemm_copy_b_emitter::jit_brgemm_copy_b_emitter(jit_generator* h, + cpu_isa_t isa, + const ov::snippets::lowered::ExpressionPtr& expr, const snippets::KernelExecutorTablePtr& kernel_table, const ov::intel_cpu::MultiCacheWeakPtr& 
compiled_kernel_cache) : jit_emitter(h, isa) { @@ -48,29 +47,29 @@ jit_brgemm_copy_b_emitter::jit_brgemm_copy_b_emitter(jit_generator* h, cpu_isa_t OV_CPU_JIT_EMITTER_ASSERT(!snippets::utils::is_dynamic_vdims(expr->get_input_port_descriptor(0)->get_shape()), "Jit emitter is called when the shapes are unknown"); - const auto& in_subtensor = get_projected_subtensor(expr->get_input_port(0)); - const auto K_blk = *++in_subtensor.rbegin(); - const auto& src_prc = brgemm_repack->get_src_element_type(); const auto& wei_prc = brgemm_repack->get_input_element_type(0); const auto wei_N_blk = brgemm_utils::repacking::compute_inner_n_block(wei_prc); const auto is_transposed = get_is_transposed(expr); - const auto brgemm_type = get_brgemm_type(src_prc, K_blk, is_transposed); + const auto brgemm_type = get_brgemm_type(src_prc, is_transposed); const auto primitive_isa = brgemm_utils::get_primitive_isa(src_prc, with_amx(brgemm_type)); m_with_comp = with_compensations(brgemm_type); BrgemmCopyBKernelConfig kernel_config(src_prc, wei_prc, primitive_isa, m_with_comp, is_transposed, wei_N_blk); - m_kernel_executor = kernel_table->register_kernel(expr, compiled_kernel_cache, kernel_config); + m_kernel_executor = + kernel_table->register_kernel(expr, compiled_kernel_cache, kernel_config); m_memory_offsets = {brgemm_repack->get_offset_in(), brgemm_repack->get_offset_out()}; - m_buffer_ids = {utils::get_buffer_cluster_id(expr->get_input_port(0)), utils::get_buffer_cluster_id(expr->get_output_port(0))}; + m_buffer_ids = {utils::get_buffer_cluster_id(expr->get_input_port(0)), + utils::get_buffer_cluster_id(expr->get_output_port(0))}; if (m_with_comp) { m_memory_offsets.push_back(brgemm_repack->get_offset_compensations()); m_buffer_ids.push_back(utils::get_buffer_cluster_id(expr->get_output_port(1))); } } -void jit_brgemm_copy_b_emitter::validate_arguments(const std::vector &in, const std::vector &out) const { +void jit_brgemm_copy_b_emitter::validate_arguments(const std::vector& in, + const std::vector& out) const { OV_CPU_JIT_EMITTER_ASSERT(in.size() == 1, "expects 1 input"); OV_CPU_JIT_EMITTER_ASSERT((m_with_comp && out.size() == 2) || (!m_with_comp && out.size() == 1), "expects 2 outputs if there are compensations"); @@ -90,14 +89,20 @@ void jit_brgemm_copy_b_emitter::emit_impl(const std::vector& in, const s // Reserve memory on the stack h->sub(h->rsp, reserved_stack_size); - const bool is_dynamic_case = std::any_of(m_memory_offsets.cbegin(), m_memory_offsets.cend(), ov::snippets::utils::is_dynamic_value); + const bool is_dynamic_case = + std::any_of(m_memory_offsets.cbegin(), m_memory_offsets.cend(), ov::snippets::utils::is_dynamic_value); Xbyak::Reg64 aux_reg = is_dynamic_case ? 
ov::intel_cpu::utils::get_aux_gpr(mem_ptrs_idxs) : Xbyak::Reg64(); - const std::vector args_offsets {GET_OFF_BRGEMM_COPY_B_ARGS(src), GET_OFF_BRGEMM_COPY_B_ARGS(tr_src), GET_OFF_BRGEMM_COPY_B_ARGS(compensation_ptr)}; + const std::vector args_offsets{GET_OFF_BRGEMM_COPY_B_ARGS(src), + GET_OFF_BRGEMM_COPY_B_ARGS(tr_src), + GET_OFF_BRGEMM_COPY_B_ARGS(compensation_ptr)}; const auto& mem_ptrs = ov::intel_cpu::utils::transform_idxs_to_regs(mem_ptrs_idxs); for (size_t i = 0; i < mem_ptrs.size(); i++) { if (ov::snippets::utils::is_dynamic_value(m_memory_offsets[i])) - utils::push_ptr_with_runtime_offset_on_stack(h, args_offsets[i], mem_ptrs[i], aux_reg, + utils::push_ptr_with_runtime_offset_on_stack(h, + args_offsets[i], + mem_ptrs[i], + aux_reg, GET_OFF(buffer_offsets) + m_buffer_ids[i] * sizeof(size_t)); else utils::push_ptr_with_static_offset_on_stack(h, args_offsets[i], mem_ptrs[i], m_memory_offsets[i]); @@ -119,5 +124,5 @@ void jit_brgemm_copy_b_emitter::emit_impl(const std::vector& in, const s spill.postamble(); } -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_brgemm_copy_b_emitter.hpp b/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_brgemm_copy_b_emitter.hpp index ef53efe6081217..d937e646b603da 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_brgemm_copy_b_emitter.hpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_brgemm_copy_b_emitter.hpp @@ -5,38 +5,39 @@ #pragma once #include "emitters/plugin/x64/jit_emitter.hpp" - #include "kernel_executors/brgemm_copy_b.hpp" - namespace ov { namespace intel_cpu { class jit_brgemm_copy_b_emitter : public jit_emitter { public: - jit_brgemm_copy_b_emitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, + jit_brgemm_copy_b_emitter(dnnl::impl::cpu::x64::jit_generator* h, + dnnl::impl::cpu::x64::cpu_isa_t isa, const ov::snippets::lowered::ExpressionPtr& expr, const snippets::KernelExecutorTablePtr& kernel_table, const ov::intel_cpu::MultiCacheWeakPtr& compiled_kernel_cache); - - size_t get_inputs_num() const override {return 1;} - static std::set> get_supported_precisions(const std::shared_ptr& node = nullptr) { - return {{element::i8}, {element::bf16}, {element::f32}}; + size_t get_inputs_num() const override { + return 1; + } + static std::set> get_supported_precisions( + const std::shared_ptr& node = nullptr) { + return {{element::i8}, {element::bf16}, {element::f16}, {element::f32}}; } private: - void validate_arguments(const std::vector &in, const std::vector &out) const override; + void validate_arguments(const std::vector& in, const std::vector& out) const override; void emit_impl(const std::vector& in, const std::vector& out) const override; std::vector m_memory_offsets{}; std::vector m_buffer_ids{}; - std::shared_ptr m_kernel_executor {nullptr}; - bool m_with_comp {false}; + std::shared_ptr m_kernel_executor{nullptr}; + bool m_with_comp{false}; #ifdef SNIPPETS_DEBUG_CAPS - friend std::string init_info_jit_brgemm_copy_b_emitter(const jit_brgemm_copy_b_emitter *emitter); + friend std::string init_info_jit_brgemm_copy_b_emitter(const jit_brgemm_copy_b_emitter* emitter); #endif }; -} // namespace intel_cpu -} // namespace ov \ No newline at end of file +} // namespace intel_cpu +} // namespace ov \ No newline at end of file diff --git a/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_brgemm_emitter.cpp b/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_brgemm_emitter.cpp index 
057a3687ab8d16..8d343cec908732 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_brgemm_emitter.cpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_brgemm_emitter.cpp @@ -4,11 +4,13 @@ #include "jit_brgemm_emitter.hpp" -#include "transformations/snippets/x64/op/brgemm_cpu.hpp" -#include "snippets/utils/utils.hpp" #include "emitters/plugin/x64/utils.hpp" -#include "utils.hpp" +#include "emitters/snippets/x64/kernel_executors/brgemm.hpp" +#include "emitters/snippets/x64/kernel_executors/brgemm_amx.hpp" +#include "snippets/utils/utils.hpp" +#include "transformations/snippets/x64/op/brgemm_cpu.hpp" #include "transformations/snippets/x64/op/brgemm_utils.hpp" +#include "utils.hpp" using namespace Xbyak; using namespace dnnl::impl; @@ -17,30 +19,40 @@ using namespace dnnl::impl::cpu::x64; namespace ov { namespace intel_cpu { -jit_brgemm_emitter::jit_brgemm_emitter(jit_generator* h, cpu_isa_t isa, +jit_brgemm_emitter::jit_brgemm_emitter(jit_generator* h, + cpu_isa_t isa, const ov::snippets::lowered::ExpressionPtr& expr, const snippets::KernelExecutorTablePtr& kernel_table, - const ov::intel_cpu::MultiCacheWeakPtr& compiled_kernel_cache) : - jit_emitter(h, isa) { + const ov::intel_cpu::MultiCacheWeakPtr& compiled_kernel_cache) + : jit_emitter(h, isa) { in_out_type_ = emitter_in_out_map::gpr_to_gpr; const auto& brgemm_node = as_type_ptr(expr->get_node()); const auto& brg0Prc = brgemm_node->get_input_element_type(0); const auto& brg1Prc = brgemm_node->get_input_element_type(1); const auto brgemm_type = brgemm_node->get_type(); - BrgemmKernelConfig kernel_config(brg0Prc, brg1Prc, with_amx(brgemm_type), with_compensations(brgemm_type), - brgemm_utils::get_primitive_isa(brg0Prc, with_amx(brgemm_type))); - m_kernel_executor = kernel_table->register_kernel(expr, - compiled_kernel_cache, - kernel_config); + m_is_with_amx = brgemm_utils::with_amx(brgemm_type); + if (m_is_with_amx) { + BrgemmAMXKernelConfig kernel_config(brg0Prc, brg1Prc, brgemm_utils::get_primitive_isa(brg0Prc, true)); + m_kernel_executor = + kernel_table->register_kernel(expr, compiled_kernel_cache, kernel_config); + } else { + BrgemmKernelConfig kernel_config(brg0Prc, + brg1Prc, + with_compensations(brgemm_type), + brgemm_utils::get_primitive_isa(brg0Prc, false)); + m_kernel_executor = + kernel_table->register_kernel(expr, compiled_kernel_cache, kernel_config); + } // Note: even if the Brgemm node is dynamic, the first shapeInfer and RuntimeConfigurator::update() // are performed before the BrgemmKernelExecutor registration. So we have to trigger update() manually // for both static and the 1st dynamic shapes. 
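The hunk above now selects the brgemm kernel executor once, in the emitter constructor, and afterwards keeps only a pointer to the common base class. Below is a minimal standalone sketch of that pattern; BrgemmBaseExecutor, BrgemmExecutor and BrgemmAmxExecutor are simplified, hypothetical stand-ins for illustration only, not the real OpenVINO classes, and the kernel-table registration and caching are omitted.

    #include <iostream>
    #include <memory>

    // Hypothetical, simplified stand-ins for the kernel-executor hierarchy.
    struct BrgemmBaseExecutor {
        virtual ~BrgemmBaseExecutor() = default;
        virtual void execute() const = 0;
    };

    struct BrgemmExecutor : BrgemmBaseExecutor {
        void execute() const override { std::cout << "plain brgemm kernel\n"; }
    };

    struct BrgemmAmxExecutor : BrgemmBaseExecutor {
        void execute() const override { std::cout << "AMX brgemm kernel\n"; }
    };

    // The emitter decides the concrete executor type exactly once, at construction,
    // and stores it behind the base-class interface.
    struct Emitter {
        explicit Emitter(bool with_amx)
            : executor(with_amx ? std::unique_ptr<BrgemmBaseExecutor>(new BrgemmAmxExecutor())
                                : std::unique_ptr<BrgemmBaseExecutor>(new BrgemmExecutor())) {}

        void emit() const {
            // Code generation can recover the concrete type when the call-argument
            // layout differs between the two flavours.
            if (dynamic_cast<const BrgemmAmxExecutor*>(executor.get()))
                std::cout << "emit AMX-specific call args (tile config pointer)\n";
            executor->execute();
        }

        std::unique_ptr<BrgemmBaseExecutor> executor;
    };

    int main() {
        Emitter(true).emit();
        Emitter(false).emit();
    }

Keeping the branch in the constructor means emit_impl only has to recover the concrete type where the call-argument layout actually differs; in the hunks below the AMX path additionally passes the amx_tile_config pointer.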
OV_CPU_JIT_EMITTER_ASSERT(!snippets::utils::is_dynamic_vdims(expr->get_input_port_descriptor(0)->get_shape()) && - !snippets::utils::is_dynamic_vdims(expr->get_input_port_descriptor(1)->get_shape()), + !snippets::utils::is_dynamic_vdims(expr->get_input_port_descriptor(1)->get_shape()), "Jit emitter is called when the shapes are unknown"); m_memory_offsets = {brgemm_node->get_offset_a(), brgemm_node->get_offset_b(), brgemm_node->get_offset_c()}; - m_buffer_ids = {utils::get_buffer_cluster_id(expr->get_input_port(0)), utils::get_buffer_cluster_id(expr->get_input_port(1)), + m_buffer_ids = {utils::get_buffer_cluster_id(expr->get_input_port(0)), + utils::get_buffer_cluster_id(expr->get_input_port(1)), utils::get_buffer_cluster_id(expr->get_output_port(0))}; if (with_scratchpad(brgemm_type)) { m_memory_offsets.push_back(brgemm_node->get_offset_scratch()); @@ -48,7 +60,8 @@ jit_brgemm_emitter::jit_brgemm_emitter(jit_generator* h, cpu_isa_t isa, } } -std::set> jit_brgemm_emitter::get_supported_precisions(const std::shared_ptr& node) { +std::set> jit_brgemm_emitter::get_supported_precisions( + const std::shared_ptr& node) { const auto brgemm = as_type_ptr(node); OV_CPU_JIT_EMITTER_ASSERT(brgemm, "get_supported_precisions() expects BrgemmCPU node"); using brgemm_utils::BRGEMM_TYPE; @@ -66,12 +79,13 @@ std::set> jit_brgemm_emitter::get_supported_precision } else if (brgemm->get_type() == BRGEMM_TYPE::WITH_AMX) { return {{element::i8, element::i8, element::u8}, {element::u8, element::i8, element::u8}, - {element::bf16, element::bf16, element::u8}}; + {element::bf16, element::bf16, element::u8}, + {element::f16, element::f16, element::u8}}; } OV_CPU_JIT_EMITTER_THROW("got BrgemmCPU node with unsupported type"); } -void jit_brgemm_emitter::validate_arguments(const std::vector &in, const std::vector &out) const { +void jit_brgemm_emitter::validate_arguments(const std::vector& in, const std::vector& out) const { OV_CPU_JIT_EMITTER_ASSERT(m_memory_offsets.size() == in.size() + 1 && (out.size() == 1), "expects 3 inputs if there are compensations/wsp"); } @@ -82,22 +96,42 @@ void jit_brgemm_emitter::emit_impl(const std::vector& in, const std::vec if (in.size() > 2) mem_ptrs_idxs.emplace_back(in[2]); + if (std::dynamic_pointer_cast(m_kernel_executor)) + emit_call(mem_ptrs_idxs); + else if (std::dynamic_pointer_cast(m_kernel_executor)) + emit_call(mem_ptrs_idxs); + else + OV_CPU_JIT_EMITTER_THROW("uknown execuor type"); +} + +template ::value, bool>::type> +void jit_brgemm_emitter::emit_call(const std::vector& mem_ptrs_idxs) const { EmitABIRegSpills spill(h); spill.preamble(); - h->mov(h->rbp, reinterpret_cast(BrgemmKernelExecutor::execute)); - auto reserved_stack_size = sizeof(BrgemmKernelExecutor::call_args); + h->mov(h->rbp, reinterpret_cast(T::execute)); + auto reserved_stack_size = sizeof(typename T::call_args); // Reserve memory on the stack h->sub(h->rsp, reserved_stack_size); - const bool is_dynamic_case = std::any_of(m_memory_offsets.cbegin(), m_memory_offsets.cend(), ov::snippets::utils::is_dynamic_value); + const bool is_dynamic_case = + std::any_of(m_memory_offsets.cbegin(), m_memory_offsets.cend(), ov::snippets::utils::is_dynamic_value); Xbyak::Reg64 aux_reg = is_dynamic_case ? 
ov::intel_cpu::utils::get_aux_gpr(mem_ptrs_idxs) : Xbyak::Reg64(); - const std::vector brgemm_args_offsets {GET_OFF_BRGEMM_ARGS(A), GET_OFF_BRGEMM_ARGS(B), GET_OFF_BRGEMM_ARGS(C), GET_OFF_BRGEMM_ARGS(scratch)}; +#define GET_OFF_CALL_ARGS(field) offsetof(typename T::call_args, field) + const std::vector brgemm_args_offsets = {GET_OFF_CALL_ARGS(A), + GET_OFF_CALL_ARGS(B), + GET_OFF_CALL_ARGS(C), + GET_OFF_CALL_ARGS(scratch)}; +#undef GET_OFF_CALL_ARGS + const auto& mem_ptrs = utils::transform_idxs_to_regs(mem_ptrs_idxs); for (size_t i = 0; i < mem_ptrs.size(); i++) { if (ov::snippets::utils::is_dynamic_value(m_memory_offsets[i])) - utils::push_ptr_with_runtime_offset_on_stack(h, brgemm_args_offsets[i], mem_ptrs[i], aux_reg, + utils::push_ptr_with_runtime_offset_on_stack(h, + brgemm_args_offsets[i], + mem_ptrs[i], + aux_reg, GET_OFF(buffer_offsets) + m_buffer_ids[i] * sizeof(size_t)); else utils::push_ptr_with_static_offset_on_stack(h, brgemm_args_offsets[i], mem_ptrs[i], m_memory_offsets[i]); @@ -108,8 +142,10 @@ void jit_brgemm_emitter::emit_impl(const std::vector& in, const std::vec h->mov(h->qword[h->rsp + brgemm_args_offsets.back()], reinterpret_cast(nullptr)); // abi_param1 always contains jit_snippets_call_args which has amx tile config for each thread - h->lea(h->r10, h->ptr[abi_param1 + GET_OFF(amx_tile_config)]); - h->mov(h->qword[h->rsp + GET_OFF_BRGEMM_ARGS(amx_tile_config)], h->r10); + if (std::is_same()) { + h->lea(h->r10, h->ptr[abi_param1 + GET_OFF(amx_tile_config)]); + h->mov(h->qword[h->rsp + GET_OFF_BRGEMM_AMX_ARGS(amx_tile_config)], h->r10); + } h->mov(abi_param1, reinterpret_cast(m_kernel_executor.get())); h->mov(abi_param2, h->rsp); @@ -123,5 +159,5 @@ void jit_brgemm_emitter::emit_impl(const std::vector& in, const std::vec spill.postamble(); } -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_brgemm_emitter.hpp b/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_brgemm_emitter.hpp index baa6ed95473034..9d072065c0fe52 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_brgemm_emitter.hpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_brgemm_emitter.hpp @@ -5,35 +5,46 @@ #pragma once #include "emitters/plugin/x64/jit_emitter.hpp" -#include "emitters/snippets/x64/kernel_executors/brgemm.hpp" +#include "emitters/snippets/x64/kernel_executors/brgemm_base.hpp" namespace ov { namespace intel_cpu { class jit_brgemm_emitter : public jit_emitter { public: - jit_brgemm_emitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, + jit_brgemm_emitter(dnnl::impl::cpu::x64::jit_generator* h, + dnnl::impl::cpu::x64::cpu_isa_t isa, const ov::snippets::lowered::ExpressionPtr& expr, const snippets::KernelExecutorTablePtr& kernel_table, const ov::intel_cpu::MultiCacheWeakPtr& compiled_kernel_cache); - size_t get_inputs_num() const override { return m_memory_offsets.size() - 1; } - static std::set> get_supported_precisions(const std::shared_ptr& node = nullptr); + size_t get_inputs_num() const override { + return m_memory_offsets.size() - 1; + } + static std::set> get_supported_precisions( + const std::shared_ptr& node = nullptr); private: - void validate_arguments(const std::vector &in, const std::vector &out) const override; + void validate_arguments(const std::vector& in, const std::vector& out) const override; void emit_impl(const std::vector& in, const std::vector& out) const override; - // Note: offsets order: A, B, C (+ 
scratchpad, if needed). Values can be dynamic_value if offset is calculated in runtime + template ::value, bool>::type = true> + void emit_call(const std::vector& mem_ptrs_idxs) const; + + // Note: offsets order: A, B, C (+ scratchpad, if needed). Values can be dynamic_value if offset is calculated in + // runtime std::vector m_memory_offsets{}; // Note: cluster ids order: A, B, C (+ scratchpad, if needed). Values can be dynamic_value if there is no buffer std::vector m_buffer_ids{}; - std::shared_ptr m_kernel_executor = nullptr; + std::shared_ptr m_kernel_executor = nullptr; #ifdef SNIPPETS_DEBUG_CAPS - friend std::string init_info_jit_brgemm_emitter(const jit_brgemm_emitter *emitter); + friend std::string init_info_jit_brgemm_emitter(const jit_brgemm_emitter* emitter); #endif + + bool m_is_with_amx{false}; }; -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_debug_emitter.cpp b/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_debug_emitter.cpp index 05b9d15786157b..45ebfc83899dba 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_debug_emitter.cpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_debug_emitter.cpp @@ -4,9 +4,11 @@ #ifdef SNIPPETS_DEBUG_CAPS -#include "jit_debug_emitter.hpp" -#include -#include "utils/general_utils.h" +# include "jit_debug_emitter.hpp" + +# include + +# include "utils/general_utils.h" using namespace dnnl::impl::cpu; using namespace dnnl::impl; @@ -27,8 +29,10 @@ size_t jit_debug_emitter::aux_gprs_count() const { return m_target_emitter->aux_gprs_count(); } -void jit_debug_emitter::emitter_preamble(const std::vector &in_idxs, const std::vector &out_idxs, - const std::vector &pool_vec_idxs, const std::vector &pool_gpr_idxs) const { +void jit_debug_emitter::emitter_preamble(const std::vector& in_idxs, + const std::vector& out_idxs, + const std::vector& pool_vec_idxs, + const std::vector& pool_gpr_idxs) const { m_target_emitter->emitter_preamble(in_idxs, out_idxs, pool_vec_idxs, pool_gpr_idxs); } @@ -52,12 +56,14 @@ void jit_debug_emitter::register_table_entries() { m_target_emitter->register_table_entries(); } -void jit_debug_emitter::emit_impl(const std::vector &in_idxs, const std::vector &out_idxs) const { +void jit_debug_emitter::emit_impl(const std::vector& in_idxs, const std::vector& out_idxs) const { m_target_emitter->emit_impl(in_idxs, out_idxs); } -void jit_debug_emitter::emit_code(const std::vector &in_idxs, const std::vector &out_idxs, - const std::vector &pool_vec_idxs, const std::vector &pool_gpr_idxs) const { +void jit_debug_emitter::emit_code(const std::vector& in_idxs, + const std::vector& out_idxs, + const std::vector& pool_vec_idxs, + const std::vector& pool_gpr_idxs) const { if (m_decorator_emit_loc == EmissionLocation::preamble || m_decorator_emit_loc == EmissionLocation::both) m_decorator_emitter->emit_code(in_idxs, out_idxs, pool_vec_idxs, pool_gpr_idxs); @@ -67,7 +73,7 @@ void jit_debug_emitter::emit_code(const std::vector &in_idxs, const std: m_decorator_emitter->emit_code(in_idxs, out_idxs, pool_vec_idxs, pool_gpr_idxs); } -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov #endif \ No newline at end of file diff --git a/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_debug_emitter.hpp b/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_debug_emitter.hpp index fe7cc527418587..2591af119cc3b5 100644 --- 
a/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_debug_emitter.hpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_debug_emitter.hpp @@ -4,29 +4,33 @@ #ifdef SNIPPETS_DEBUG_CAPS -#pragma once - -#include "emitters/plugin/x64/jit_emitter.hpp" +# pragma once +# include "emitters/plugin/x64/jit_emitter.hpp" namespace ov { namespace intel_cpu { class jit_debug_emitter : public jit_emitter { public: - enum class EmissionLocation { - preamble, - postamble, - both - }; - jit_debug_emitter(const std::shared_ptr& target_emitter, const std::shared_ptr& decorator_emitter, const EmissionLocation& loc) - : jit_emitter(target_emitter->h, target_emitter->host_isa_, target_emitter->exec_prc_, target_emitter->in_out_type_), - m_target_emitter(target_emitter), m_decorator_emitter(decorator_emitter), m_decorator_emit_loc(loc) { - prepare_table(); - } - - void emit_code(const std::vector &in_idxs, const std::vector &out_idxs, - const std::vector &pool_vec_idxs = {}, const std::vector &pool_gpr_idxs = {}) const override; + enum class EmissionLocation { preamble, postamble, both }; + jit_debug_emitter(const std::shared_ptr& target_emitter, + const std::shared_ptr& decorator_emitter, + const EmissionLocation& loc) + : jit_emitter(target_emitter->h, + target_emitter->host_isa_, + target_emitter->exec_prc_, + target_emitter->in_out_type_), + m_target_emitter(target_emitter), + m_decorator_emitter(decorator_emitter), + m_decorator_emit_loc(loc) { + prepare_table(); + } + + void emit_code(const std::vector& in_idxs, + const std::vector& out_idxs, + const std::vector& pool_vec_idxs = {}, + const std::vector& pool_gpr_idxs = {}) const override; void emit_data() const override; size_t get_inputs_num() const override; @@ -38,10 +42,12 @@ class jit_debug_emitter : public jit_emitter { void prepare_table() override; void register_table_entries() override; - void emit_impl(const std::vector &in_idxs, const std::vector &out_idxs) const override; + void emit_impl(const std::vector& in_idxs, const std::vector& out_idxs) const override; - void emitter_preamble(const std::vector &in_idxs, const std::vector &out_idxs, - const std::vector &pool_vec_idxs, const std::vector &pool_gpr_idxs) const override; + void emitter_preamble(const std::vector& in_idxs, + const std::vector& out_idxs, + const std::vector& pool_vec_idxs, + const std::vector& pool_gpr_idxs) const override; void emitter_postamble() const override; private: @@ -54,7 +60,7 @@ class jit_debug_emitter : public jit_emitter { EmissionLocation m_decorator_emit_loc; }; -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov #endif \ No newline at end of file diff --git a/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_fill_emitter.cpp b/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_fill_emitter.cpp index 1c05100317ae5f..687917acbabc5a 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_fill_emitter.cpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_fill_emitter.cpp @@ -4,16 +4,15 @@ #include "jit_fill_emitter.hpp" - using namespace Xbyak; using namespace dnnl::impl; using namespace dnnl::impl::cpu::x64; - namespace ov { namespace intel_cpu { -jit_fill_emitter::jit_fill_emitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, +jit_fill_emitter::jit_fill_emitter(dnnl::impl::cpu::x64::jit_generator* h, + dnnl::impl::cpu::x64::cpu_isa_t isa, const ov::snippets::lowered::ExpressionPtr& expr) : jit_emitter(h, isa, ov::element::f32, emitter_in_out_map::vec_to_vec) { const 
auto fill = ov::as_type_ptr(expr->get_node()); @@ -52,9 +51,9 @@ void jit_fill_emitter::emit_impl(const std::vector& in, const std::vecto } template -void jit_fill_emitter::emit_isa(const std::vector &in, const std::vector &out) const { - using Vmm = typename dnnl::impl::utils::conditional3::type; +void jit_fill_emitter::emit_isa(const std::vector& in, const std::vector& out) const { + using Vmm = typename dnnl::impl::utils:: + conditional3::type; Vmm src_vmm = Vmm(in[0]); Vmm dst_vmm = Vmm(out[0]); @@ -62,7 +61,8 @@ void jit_fill_emitter::emit_isa(const std::vector &in, const std::vector const size_t supported_et_size = 4; const auto register_capacity = (src_vmm.getBit() / 8) / supported_et_size; if (offset == register_capacity) { - // WA: since AssignRegisters doesn't support inplace logic, Fill ops with offset = register_capacity can't be removed from the LIR + // WA: since AssignRegisters doesn't support inplace logic, Fill ops with offset = register_capacity can't be + // removed from the LIR // TODO: when inplace is supported, remove such Fill ops from the LIR and remove this logic. // Ticket: 126270 if (src_vmm.getIdx() != dst_vmm.getIdx()) @@ -105,5 +105,5 @@ void jit_fill_emitter::fill_tail(const Vmm& src_vmm, const Vmm& dst_vmm) const { } } -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_fill_emitter.hpp b/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_fill_emitter.hpp index 79e9a0e4027a5d..23b929cc161ca7 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_fill_emitter.hpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_fill_emitter.hpp @@ -6,15 +6,18 @@ #include "emitters/plugin/x64/jit_emitter.hpp" - namespace ov { namespace intel_cpu { class jit_fill_emitter : public jit_emitter { public: - jit_fill_emitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, const ov::snippets::lowered::ExpressionPtr& expr); + jit_fill_emitter(dnnl::impl::cpu::x64::jit_generator* h, + dnnl::impl::cpu::x64::cpu_isa_t isa, + const ov::snippets::lowered::ExpressionPtr& expr); - size_t get_inputs_num() const override {return 1;} + size_t get_inputs_num() const override { + return 1; + } protected: size_t aux_gprs_count() const override; @@ -23,18 +26,22 @@ class jit_fill_emitter : public jit_emitter { void emit_impl(const std::vector& in, const std::vector& out) const override; template - void emit_isa(const std::vector &in, const std::vector &out) const; + void emit_isa(const std::vector& in, const std::vector& out) const; template void fill_full(const Vmm& vmm_dst) const; template void fill_tail(const Vmm& vmm_src, const Vmm& vmm_dst) const; - bool is_full_reg() const { return offset == 0; } - bool is_optimized() const { return is_full_reg() && fill_value == uint32_t(0x0); } + bool is_full_reg() const { + return offset == 0; + } + bool is_optimized() const { + return is_full_reg() && fill_value == uint32_t(0x0); + } size_t offset = 0; uint32_t fill_value = 0x0; }; -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_horizon_emitter.cpp b/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_horizon_emitter.cpp index a4f5cbe16d7e1f..34e9c2f71fd148 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_horizon_emitter.cpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_horizon_emitter.cpp @@ -4,7 +4,6 @@ #include 
"jit_horizon_emitter.hpp" - using namespace Xbyak; using namespace dnnl::impl; using namespace dnnl::impl::cpu::x64; @@ -12,7 +11,8 @@ using namespace dnnl::impl::cpu::x64; namespace ov { namespace intel_cpu { -jit_horizon_emitter::jit_horizon_emitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, +jit_horizon_emitter::jit_horizon_emitter(dnnl::impl::cpu::x64::jit_generator* h, + dnnl::impl::cpu::x64::cpu_isa_t isa, const ov::snippets::lowered::ExpressionPtr& expr) : jit_emitter(h, isa, ov::element::f32, emitter_in_out_map::vec_to_vec) { if (ov::is_type(expr->get_node())) { @@ -24,8 +24,7 @@ jit_horizon_emitter::jit_horizon_emitter(dnnl::impl::cpu::x64::jit_generator* h, } } -void jit_horizon_emitter::emit_impl(const std::vector& in, - const std::vector& out) const { +void jit_horizon_emitter::emit_impl(const std::vector& in, const std::vector& out) const { if (host_isa_ == dnnl::impl::cpu::x64::sse41) { emit_isa(in, out); } else if (host_isa_ == dnnl::impl::cpu::x64::avx2) { @@ -38,9 +37,12 @@ void jit_horizon_emitter::emit_impl(const std::vector& in, } template -void jit_horizon_emitter::emit_isa(const std::vector &in, const std::vector &out) const { +void jit_horizon_emitter::emit_isa(const std::vector& in, const std::vector& out) const { using Vmm = typename dnnl::impl::utils::conditional3::type; + Xbyak::Xmm, + isa == dnnl::impl::cpu::x64::avx2, + Xbyak::Ymm, + Xbyak::Zmm>::type; Vmm src_vmm = Vmm(in[0]); Vmm dst_vmm = Vmm(out[0]); @@ -67,19 +69,19 @@ void jit_horizon_emitter::emit_isa(const std::vector &in, const std::vec perform_op(dst_vmm, dst_vmm, aux_vmm); } -template -void jit_horizon_emitter::perform_op(const Vmm &vmm1, const Vmm &vmm2, const Vmm &vmm3) const { +template +void jit_horizon_emitter::perform_op(const Vmm& vmm1, const Vmm& vmm2, const Vmm& vmm3) const { switch (m_op_type) { - case OpType::max: - h->uni_vmaxps(vmm1, vmm2, vmm3); - break; - case OpType::sum: - h->uni_vaddps(vmm1, vmm2, vmm3); - break; - default: - OV_CPU_JIT_EMITTER_THROW("Unsupported horizontal operation."); + case OpType::max: + h->uni_vmaxps(vmm1, vmm2, vmm3); + break; + case OpType::sum: + h->uni_vaddps(vmm1, vmm2, vmm3); + break; + default: + OV_CPU_JIT_EMITTER_THROW("Unsupported horizontal operation."); } } -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_horizon_emitter.hpp b/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_horizon_emitter.hpp index 1b222cb2a86776..df74b2ad9783a4 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_horizon_emitter.hpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_horizon_emitter.hpp @@ -6,34 +6,40 @@ #include "emitters/plugin/x64/jit_emitter.hpp" - namespace ov { namespace intel_cpu { class jit_horizon_emitter : public jit_emitter { public: - jit_horizon_emitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, const ov::snippets::lowered::ExpressionPtr& expr); + jit_horizon_emitter(dnnl::impl::cpu::x64::jit_generator* h, + dnnl::impl::cpu::x64::cpu_isa_t isa, + const ov::snippets::lowered::ExpressionPtr& expr); - size_t get_inputs_num() const override {return 1;} - static std::set> get_supported_precisions(const std::shared_ptr& node = nullptr) { + size_t get_inputs_num() const override { + return 1; + } + static std::set> get_supported_precisions( + const std::shared_ptr& node = nullptr) { return {{element::f32}}; } protected: - size_t aux_vecs_count() const override 
{return 1;} + size_t aux_vecs_count() const override { + return 1; + } private: void emit_impl(const std::vector& in, const std::vector& out) const override; template - void emit_isa(const std::vector &in, const std::vector &out) const; + void emit_isa(const std::vector& in, const std::vector& out) const; - template - void perform_op(const Vmm &vmm1, const Vmm &vmm2, const Vmm &vmm3) const; + template + void perform_op(const Vmm& vmm1, const Vmm& vmm2, const Vmm& vmm3) const; enum class OpType { max, sum }; OpType m_op_type = OpType::max; }; -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_kernel_emitter.cpp b/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_kernel_emitter.cpp index 476123355abe70..bd5a3227e1e125 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_kernel_emitter.cpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_kernel_emitter.cpp @@ -14,8 +14,11 @@ using namespace dnnl::impl::cpu::x64; namespace ov { namespace intel_cpu { -jit_kernel_emitter::jit_kernel_emitter(jit_generator* h, cpu_isa_t isa, const ov::snippets::lowered::ExpressionPtr& expr) - : jit_emitter(h, isa), reg_runtime_params_idx(abi_param1.getIdx()) { +jit_kernel_emitter::jit_kernel_emitter(jit_generator* h, + cpu_isa_t isa, + const ov::snippets::lowered::ExpressionPtr& expr) + : jit_emitter(h, isa), + reg_runtime_params_idx(abi_param1.getIdx()) { const auto kernel = ov::as_type_ptr(expr->get_node()); OV_CPU_JIT_EMITTER_ASSERT(kernel != nullptr, "invoked with invalid op argument"); OV_CPU_JIT_EMITTER_ASSERT(!kernel->region->empty(), "invoked with empty body"); @@ -59,8 +62,12 @@ void jit_kernel_emitter::init_reg_pools(const std::set& gpr_blacklist, c gp_regs_pool[i] = vec_regs_pool[i] = 15 - i; auto remove_regs_from_pool = [](std::vector& pool, const std::set& to_remove) { // It's important to keep the order of other elements - pool.erase(std::remove_if(pool.begin(), pool.end(), - [&](size_t x) {return to_remove.count(x) != 0;}), pool.end()); + pool.erase(std::remove_if(pool.begin(), + pool.end(), + [&](size_t x) { + return to_remove.count(x) != 0; + }), + pool.end()); }; // Reserve stack base and pointer for push(...) and pop(...) 
operations std::set gprs_blacklist_extended{Xbyak::Operand::RSP, Xbyak::Operand::RBP}; @@ -70,25 +77,31 @@ void jit_kernel_emitter::init_reg_pools(const std::set& gpr_blacklist, c remove_regs_from_pool(vec_regs_pool, vec_blacklist); } -void jit_kernel_emitter::emit_code(const std::vector &in, const std::vector &out, - const std::vector &pool_vec_idxs, const std::vector &pool_gpr_idxs) const { +void jit_kernel_emitter::emit_code(const std::vector& in, + const std::vector& out, + const std::vector& pool_vec_idxs, + const std::vector& pool_gpr_idxs) const { validate_arguments(in, out); emit_impl(in, out); } -void jit_kernel_emitter::validate_arguments(const std::vector &in, const std::vector &out) const { +void jit_kernel_emitter::validate_arguments(const std::vector& in, const std::vector& out) const { OV_CPU_JIT_EMITTER_ASSERT(in.empty() && out.empty(), ": expects 0 registers on input and output"); const auto num_params = num_inputs + num_outputs + num_unique_buffers; // The number of used gpr may be >= num_params since LoopBegin+LoopEnd could also use gpr to store work_amount OV_CPU_JIT_EMITTER_ASSERT(data_ptr_regs_idx.size() == num_params, - "number of inputs and outputs is inconsistent with the number of allocated registers ", num_params, - " data_ptr_regs_idx.size() = ", data_ptr_regs_idx.size()); + "number of inputs and outputs is inconsistent with the number of allocated registers ", + num_params, + " data_ptr_regs_idx.size() = ", + data_ptr_regs_idx.size()); } void jit_kernel_emitter::init_body_regs(const std::set& kernel_regs, - const std::vector &pool_vec_idxs, const std::vector &pool_gpr_idxs) { + const std::vector& pool_vec_idxs, + const std::vector& pool_gpr_idxs) { // Initialize pools of gp and vec registers - // Reserve kernel regs (abi_param1 and, if there is, abi_param2), since they'll be used to pass runtime call args to kernel + // Reserve kernel regs (abi_param1 and, if there is, abi_param2), since they'll be used to pass runtime call args to + // kernel init_reg_pools(kernel_regs, {}); mapping_info gpr_map_pool({}, gp_regs_pool); @@ -122,9 +135,11 @@ void jit_kernel_emitter::emit_impl(const std::vector& in, const std::vec h->postamble(); } -jit_kernel_static_emitter::jit_kernel_static_emitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, +jit_kernel_static_emitter::jit_kernel_static_emitter(dnnl::impl::cpu::x64::jit_generator* h, + dnnl::impl::cpu::x64::cpu_isa_t isa, const ov::snippets::lowered::ExpressionPtr& expr) - : jit_kernel_emitter(h, isa, expr), reg_indexes_idx(abi_param2.getIdx()) { + : jit_kernel_emitter(h, isa, expr), + reg_indexes_idx(abi_param2.getIdx()) { const auto kernel = ov::as_type_ptr(expr->get_node()); OV_CPU_JIT_EMITTER_ASSERT(kernel != nullptr, "expectes KernelStatic expression"); jcp = *reinterpret_cast(kernel->compile_params); @@ -158,12 +173,12 @@ void jit_kernel_static_emitter::init_data_pointers(const std::vector(*spare_corruptable_gpr)); + Reg64 reg_tmp = + last_iter_explicitly ? data_ptr_regs[num_params - 1] : Reg64(static_cast(*spare_corruptable_gpr)); // Vector "data_ptr_regs" is sorted by abstract regs. 
// It means that the vector contains the physical registers in order [src, .., src, dst, .., dst, buffer] // So we can initialize buffer register firstly as last value of vector "data_ptr_regs" @@ -193,13 +208,15 @@ void jit_kernel_static_emitter::init_data_pointers(const std::vector(expr->get_node()); OV_CPU_JIT_EMITTER_ASSERT(kernel, "expectes KernelDynamic expression"); - // - Reserve abi_param1, since it wll be used to pass runtime call args to all dynamic emitters that needs runtime args + // - Reserve abi_param1, since it wll be used to pass runtime call args to all dynamic emitters that needs runtime + // args // - We cannot assign this register to the body emitters since runtime params MUST be valid during whole execution // for all dynamic emitters init_body_regs({reg_runtime_params_idx}); @@ -220,5 +237,5 @@ void jit_kernel_dynamic_emitter::init_data_pointers(const std::vector &in_idxs, const std::vector &out_idxs, - const std::vector &pool_vec_idxs = {}, const std::vector &pool_gpr_idxs = {}) const override; + jit_kernel_emitter(dnnl::impl::cpu::x64::jit_generator* h, + dnnl::impl::cpu::x64::cpu_isa_t isa, + const ov::snippets::lowered::ExpressionPtr& expr); + + size_t get_inputs_num() const override { + return 0; + } + void emit_code(const std::vector& in_idxs, + const std::vector& out_idxs, + const std::vector& pool_vec_idxs = {}, + const std::vector& pool_gpr_idxs = {}) const override; protected: void validate_arguments(const std::vector& in, const std::vector& out) const override; - void init_body_regs(const std::set& kernel_regs, const std::vector &pool_vec_idxs = {}, const std::vector &pool_gpr_idxs = {}); + void init_body_regs(const std::set& kernel_regs, + const std::vector& pool_vec_idxs = {}, + const std::vector& pool_gpr_idxs = {}); /** - * @brief populates physical registers pools for x86 (both vec and gp). + * @brief populates physical registers pools for x86 (both vec and gp). * Skips stack-related gprs and extra gprs passed as arguments. 
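init_reg_pools, documented here, relies on the erase/remove_if idiom from the .cpp hunk above to drop blacklisted registers while keeping the relative order of the survivors. A self-contained sketch follows; the blacklist indices (RSP=4, RBP=5 in Xbyak-style encoding, plus 7 standing in for abi_param1) are illustrative assumptions, not values taken from the sources.

    #include <algorithm>
    #include <cstddef>
    #include <iostream>
    #include <set>
    #include <vector>

    int main() {
        // Full gpr index pool, 15 down to 0, as in init_reg_pools above.
        std::vector<size_t> gp_regs_pool(16);
        for (size_t i = 0; i < gp_regs_pool.size(); ++i)
            gp_regs_pool[i] = 15 - i;

        // Drop stack-related registers and the register assumed to carry runtime
        // call args. remove_if compacts the kept elements without reordering them.
        const std::set<size_t> blacklist{4 /*RSP*/, 5 /*RBP*/, 7 /*abi_param1, assumed*/};
        gp_regs_pool.erase(std::remove_if(gp_regs_pool.begin(),
                                          gp_regs_pool.end(),
                                          [&](size_t x) { return blacklist.count(x) != 0; }),
                           gp_regs_pool.end());

        for (size_t r : gp_regs_pool)
            std::cout << r << ' ';
        std::cout << '\n';  // 15 14 13 12 11 10 9 8 6 3 2 1 0
    }

The .cpp comment stresses keeping the order of the remaining elements; the output above shows that the remove_if/erase pair does exactly that.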
* @arg gpr_blacklist - set of gp registers that should not be added to register pool * @arg vec_blacklist - set of vec registers should not be added to register pool - */ + */ void init_reg_pools(const std::set& gpr_blacklist, const std::set& vec_blacklist); virtual void init_data_pointers(const std::vector& data_ptr_regs) const = 0; @@ -70,13 +77,15 @@ class jit_kernel_emitter : public jit_emitter, public jit_container_emitter { std::shared_ptr body; #ifdef SNIPPETS_DEBUG_CAPS - friend std::string init_info_jit_kernel_emitter(const jit_kernel_emitter *emitter); + friend std::string init_info_jit_kernel_emitter(const jit_kernel_emitter* emitter); #endif }; class jit_kernel_static_emitter : public jit_kernel_emitter { public: - jit_kernel_static_emitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, const ov::snippets::lowered::ExpressionPtr& expr); + jit_kernel_static_emitter(dnnl::impl::cpu::x64::jit_generator* h, + dnnl::impl::cpu::x64::cpu_isa_t isa, + const ov::snippets::lowered::ExpressionPtr& expr); private: void init_data_pointers(const std::vector& data_ptr_regs) const override; @@ -86,21 +95,23 @@ class jit_kernel_static_emitter : public jit_kernel_emitter { std::vector> data_offsets; #ifdef SNIPPETS_DEBUG_CAPS - friend std::string init_info_jit_kernel_static_emitter(const jit_kernel_static_emitter *emitter); + friend std::string init_info_jit_kernel_static_emitter(const jit_kernel_static_emitter* emitter); #endif }; class jit_kernel_dynamic_emitter : public jit_kernel_emitter { public: - jit_kernel_dynamic_emitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, const ov::snippets::lowered::ExpressionPtr& expr); + jit_kernel_dynamic_emitter(dnnl::impl::cpu::x64::jit_generator* h, + dnnl::impl::cpu::x64::cpu_isa_t isa, + const ov::snippets::lowered::ExpressionPtr& expr); private: void init_data_pointers(const std::vector& data_ptr_regs) const override; #ifdef SNIPPETS_DEBUG_CAPS - friend std::string init_info_jit_kernel_dynamic_emitter(const jit_kernel_dynamic_emitter *emitter); + friend std::string init_info_jit_kernel_dynamic_emitter(const jit_kernel_dynamic_emitter* emitter); #endif }; -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_loop_emitters.cpp b/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_loop_emitters.cpp index f3151d0df4ccb1..86421678a29011 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_loop_emitters.cpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_loop_emitters.cpp @@ -18,8 +18,11 @@ namespace intel_cpu { namespace { class jit_aux_gpr_holder { public: - jit_aux_gpr_holder(dnnl::impl::cpu::x64::jit_generator* host, std::vector& pool_gpr_idxs, const std::vector& used_gpr_idxs) - : m_h(host), m_pool_gpr_idxs(pool_gpr_idxs) { + jit_aux_gpr_holder(dnnl::impl::cpu::x64::jit_generator* host, + std::vector& pool_gpr_idxs, + const std::vector& used_gpr_idxs) + : m_h(host), + m_pool_gpr_idxs(pool_gpr_idxs) { // If the pool is empty, let's manualy allocate the gpr and push original vlaue on stack if (m_pool_gpr_idxs.empty()) { m_aux_gpr_idx = ov::intel_cpu::utils::get_aux_gpr(used_gpr_idxs); @@ -39,21 +42,26 @@ class jit_aux_gpr_holder { } } - const Reg64& get_reg() const { return m_aux_gpr_idx; } + const Reg64& get_reg() const { + return m_aux_gpr_idx; + } private: dnnl::impl::cpu::x64::jit_generator* m_h; std::vector& m_pool_gpr_idxs; - Reg64 m_aux_gpr_idx {}; + Reg64 
m_aux_gpr_idx{}; bool m_is_preserved = false; }; } // namespace /* ================== jit_loop_begin_emitter ====================== */ -jit_loop_begin_emitter::jit_loop_begin_emitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, +jit_loop_begin_emitter::jit_loop_begin_emitter(dnnl::impl::cpu::x64::jit_generator* h, + dnnl::impl::cpu::x64::cpu_isa_t isa, const ov::snippets::lowered::ExpressionPtr& expr) - : jit_emitter(h, isa), loop_begin_label{new Xbyak::Label()}, loop_end_label(nullptr) { + : jit_emitter(h, isa), + loop_begin_label{new Xbyak::Label()}, + loop_end_label(nullptr) { const auto loop_begin = ov::as_type_ptr(expr->get_node()); OV_CPU_JIT_EMITTER_ASSERT(loop_begin, "expects LoopBegin expression"); const auto loop_end = loop_begin->get_loop_end(); @@ -65,7 +73,7 @@ jit_loop_begin_emitter::jit_loop_begin_emitter(dnnl::impl::cpu::x64::jit_generat in_out_type_ = emitter_in_out_map::gpr_to_gpr; } -void jit_loop_begin_emitter::validate_arguments(const std::vector &in, const std::vector &out) const { +void jit_loop_begin_emitter::validate_arguments(const std::vector& in, const std::vector& out) const { OV_CPU_JIT_EMITTER_ASSERT(in.empty(), "Invalid inputs size: expected 0 got " + std::to_string(in.size())); // Note: the only expected output is work amount register (communicated to jit_loop_end_emitter) OV_CPU_JIT_EMITTER_ASSERT(out.size() == 1, "Invalid outputs size: expected 1 got " + std::to_string(out.size())); @@ -74,21 +82,24 @@ void jit_loop_begin_emitter::validate_arguments(const std::vector &in, c "loop increment might be dynamic only if loop evaluates once!"); } -void jit_loop_begin_emitter::emit_code(const std::vector &in, const std::vector &out, - const std::vector &pool_vec_idxs, const std::vector &pool_gpr_idxs) const { +void jit_loop_begin_emitter::emit_code(const std::vector& in, + const std::vector& out, + const std::vector& pool_vec_idxs, + const std::vector& pool_gpr_idxs) const { validate_arguments(in, out); jit_emitter::emit_code(in, out, pool_vec_idxs, pool_gpr_idxs); } void jit_loop_begin_emitter::emit_impl(const std::vector& in, const std::vector& out) const { // If the loop evaulate once, we can skip loop begin code emission - // If work_amount is dynamic, we should get runtime `work_amount` - it might be `zero` and we should skip loop evaluation + // If work_amount is dynamic, we should get runtime `work_amount` - it might be `zero` and we should skip loop + // evaluation if (evaluate_once && !is_work_amount_dynamic) return; Reg64 reg_work_amount = Reg64(static_cast(out.back())); if (is_work_amount_dynamic) { - jit_aux_gpr_holder gpr_holder(h, aux_gpr_idxs, out); // loop_begin has only output registers + jit_aux_gpr_holder gpr_holder(h, aux_gpr_idxs, out); // loop_begin has only output registers Reg64 reg_loop_args_ptr = gpr_holder.get_reg(); const auto id_offset = loop_id * sizeof(jit_snippets_call_args::loop_args_t); h->mov(reg_loop_args_ptr, h->ptr[abi_param1 + GET_OFF(loop_args)]); @@ -113,9 +124,12 @@ void jit_loop_begin_emitter::emit_impl(const std::vector& in, const std: /* ================== jit_loop_end_emitter ====================== */ -jit_loop_end_emitter::jit_loop_end_emitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, +jit_loop_end_emitter::jit_loop_end_emitter(dnnl::impl::cpu::x64::jit_generator* h, + dnnl::impl::cpu::x64::cpu_isa_t isa, const ov::snippets::lowered::ExpressionPtr& expr) - : jit_emitter(h, isa), loop_begin_label{nullptr}, loop_end_label{new Xbyak::Label()} { + : 
jit_emitter(h, isa), + loop_begin_label{nullptr}, + loop_end_label{new Xbyak::Label()} { in_out_type_ = emitter_in_out_map::gpr_to_gpr; const auto loop_end = ov::as_type_ptr(expr->get_node()); OV_CPU_JIT_EMITTER_ASSERT(loop_end != nullptr, "expected LoopEnd expr"); @@ -132,8 +146,9 @@ jit_loop_end_emitter::jit_loop_end_emitter(dnnl::impl::cpu::x64::jit_generator* are_ptr_increments_dynamic = std::any_of(ptr_increments.cbegin(), ptr_increments.cend(), ov::snippets::utils::is_dynamic_value); - are_final_offsets_dynamic = - std::any_of(finalization_offsets.cbegin(), finalization_offsets.cend(), ov::snippets::utils::is_dynamic_value); + are_final_offsets_dynamic = std::any_of(finalization_offsets.cbegin(), + finalization_offsets.cend(), + ov::snippets::utils::is_dynamic_value); are_ptr_shifts_dynamic = are_ptr_increments_dynamic || are_final_offsets_dynamic; const auto begin_expr = get_loop_begin_expr(expr); @@ -143,29 +158,51 @@ jit_loop_end_emitter::jit_loop_end_emitter(dnnl::impl::cpu::x64::jit_generator* loop_begin_label = loop_begin_emitter->get_begin_label(); } -ov::snippets::lowered::ExpressionPtr jit_loop_end_emitter::get_loop_begin_expr(const ov::snippets::lowered::ExpressionPtr& expr) { +ov::snippets::lowered::ExpressionPtr jit_loop_end_emitter::get_loop_begin_expr( + const ov::snippets::lowered::ExpressionPtr& expr) { const auto begin_expr = expr->get_input_port_connectors().back()->get_source().get_expr(); OV_CPU_JIT_EMITTER_ASSERT(ov::is_type(begin_expr->get_node()), "LoopEnd expression must have th last port connector to LoopBegin"); return begin_expr; } -void jit_loop_end_emitter::validate_arguments(const std::vector &in, const std::vector &out) const { +void jit_loop_end_emitter::validate_arguments(const std::vector& in, const std::vector& out) const { const auto io_size = num_inputs + num_outputs; OV_CPU_JIT_EMITTER_ASSERT(out.size() == 0, "Invalid number of out arguments: expected ", 0, " got ", out.size()); - OV_CPU_JIT_EMITTER_ASSERT(in.size() == io_size + 1, "Invalid number of in arguments: expected ", io_size + 1, " got ", in.size()); - OV_CPU_JIT_EMITTER_ASSERT(is_incremented.size() == io_size, "Invalid is_incremented size: expected ", io_size, " got ", is_incremented.size()); - OV_CPU_JIT_EMITTER_ASSERT(ptr_increments.size() == io_size, "Invalid ptr_increments size: expected ", io_size, " got ", ptr_increments.size()); + OV_CPU_JIT_EMITTER_ASSERT(in.size() == io_size + 1, + "Invalid number of in arguments: expected ", + io_size + 1, + " got ", + in.size()); + OV_CPU_JIT_EMITTER_ASSERT(is_incremented.size() == io_size, + "Invalid is_incremented size: expected ", + io_size, + " got ", + is_incremented.size()); + OV_CPU_JIT_EMITTER_ASSERT(ptr_increments.size() == io_size, + "Invalid ptr_increments size: expected ", + io_size, + " got ", + ptr_increments.size()); OV_CPU_JIT_EMITTER_ASSERT(finalization_offsets.size() == io_size, - "Invalid finalization_offsets size: expected: ", io_size, " got ", finalization_offsets.size()); - OV_CPU_JIT_EMITTER_ASSERT(data_sizes.size() == io_size, "Invalid data_sizes size: expected: ", io_size, " got ", data_sizes.size()); + "Invalid finalization_offsets size: expected: ", + io_size, + " got ", + finalization_offsets.size()); + OV_CPU_JIT_EMITTER_ASSERT(data_sizes.size() == io_size, + "Invalid data_sizes size: expected: ", + io_size, + " got ", + data_sizes.size()); OV_CPU_JIT_EMITTER_ASSERT(loop_end_label != nullptr && loop_begin_label != nullptr, "has not inited labels!"); 
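The constructor in this hunk probes ptr_increments and finalization_offsets with std::any_of over is_dynamic_value to decide whether the emitted loop tail must read its increments from the runtime loop_args structure instead of using immediates. A compilable sketch of that scheme is below; the sentinel encoding and the runtime_increments values are chosen purely for illustration and do not reproduce the actual snippets implementation.

    #include <algorithm>
    #include <cstddef>
    #include <cstdint>
    #include <iostream>
    #include <limits>
    #include <vector>

    // Assumption for this sketch: a runtime-resolved increment is encoded as a sentinel,
    // loosely mirroring ov::snippets::utils::is_dynamic_value.
    constexpr int64_t kDynamic = std::numeric_limits<int64_t>::min();
    bool is_dynamic_value(int64_t v) { return v == kDynamic; }

    int main() {
        // Per-port pointer increments, known at compile time except for port 1.
        const std::vector<int64_t> ptr_increments{1, kDynamic, 1};
        const std::vector<int64_t> data_sizes{4, 4, 4};  // element sizes in bytes
        const int64_t wa_increment = 16;                 // work-amount step (scale)

        // Mirrors the any_of check above: any dynamic increment forces the generated
        // code to fetch real values from loop_args at run time.
        const bool need_runtime_args =
            std::any_of(ptr_increments.cbegin(), ptr_increments.cend(), is_dynamic_value);

        // Values a runtime configurator would fill in for the dynamic port.
        const std::vector<int64_t> runtime_increments{0, 128, 0};

        std::vector<int64_t> data_ptrs{0, 0, 0};  // stand-ins for the data pointer registers
        for (size_t idx = 0; idx < data_ptrs.size(); ++idx) {
            const int64_t inc = ptr_increments[idx];
            if (inc == 0)
                continue;
            if (is_dynamic_value(inc))
                data_ptrs[idx] += runtime_increments[idx];               // read from loop_args
            else
                data_ptrs[idx] += inc * wa_increment * data_sizes[idx];  // folded into an immediate
        }

        std::cout << std::boolalpha << need_runtime_args << '\n';  // true
        for (int64_t p : data_ptrs)
            std::cout << p << ' ';                                 // 64 128 64
        std::cout << '\n';
    }

Static increments collapse into a single immediate (increment * wa_increment * data_size), which corresponds to the add with an immediate constant in the non-dynamic branch of apply_increments in the hunk below.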
OV_CPU_JIT_EMITTER_ASSERT(!snippets::utils::is_dynamic_value(wa_increment) || evaluate_once, "loop increment might be dynamic only if loop evaluates once!"); } -void jit_loop_end_emitter::emit_code(const std::vector &in, const std::vector &out, - const std::vector &pool_vec_idxs, const std::vector &pool_gpr_idxs) const { +void jit_loop_end_emitter::emit_code(const std::vector& in, + const std::vector& out, + const std::vector& pool_vec_idxs, + const std::vector& pool_gpr_idxs) const { validate_arguments(in, out); jit_emitter::emit_code(in, out, pool_vec_idxs, pool_gpr_idxs); } @@ -176,34 +213,38 @@ void jit_loop_end_emitter::emit_impl(const std::vector& in, const std::v data_ptr_reg_idxs.reserve(num_inputs + num_outputs); std::copy(in.begin(), in.end() - 1, std::back_inserter(data_ptr_reg_idxs)); - auto apply_increments = [&](bool use_runtime_args, size_t field_offset, const std::vector& increments, size_t scale) { - Reg64 reg_increments; - auto add_increments = [&]() { - for (size_t idx = 0; idx < data_ptr_reg_idxs.size(); idx++) { - const auto& increment = increments[idx]; - if (is_incremented[idx] && increment != 0) { - if (ov::snippets::utils::is_dynamic_value(increment)) { - OV_CPU_JIT_EMITTER_ASSERT(use_runtime_args, "Loop argument structure cannot be pushed to aux GPR"); - h->add(Reg64(static_cast(data_ptr_reg_idxs[idx])), h->ptr[reg_increments + idx * sizeof(int64_t)]); - } else { - h->add(Reg64(static_cast(data_ptr_reg_idxs[idx])), increment * scale * data_sizes[idx]); + auto apply_increments = + [&](bool use_runtime_args, size_t field_offset, const std::vector& increments, size_t scale) { + Reg64 reg_increments; + auto add_increments = [&]() { + for (size_t idx = 0; idx < data_ptr_reg_idxs.size(); idx++) { + const auto& increment = increments[idx]; + if (is_incremented[idx] && increment != 0) { + if (ov::snippets::utils::is_dynamic_value(increment)) { + OV_CPU_JIT_EMITTER_ASSERT(use_runtime_args, + "Loop argument structure cannot be pushed to aux GPR"); + h->add(Reg64(static_cast(data_ptr_reg_idxs[idx])), + h->ptr[reg_increments + idx * sizeof(int64_t)]); + } else { + h->add(Reg64(static_cast(data_ptr_reg_idxs[idx])), + increment * scale * data_sizes[idx]); + } } } + }; + + const auto id_offset = loop_id * sizeof(jit_snippets_call_args::loop_args_t); + if (use_runtime_args) { + jit_aux_gpr_holder gpr_holder(h, aux_gpr_idxs, in); // loop_end has only input registers + reg_increments = gpr_holder.get_reg(); + h->mov(reg_increments, h->ptr[abi_param1 + GET_OFF(loop_args)]); + h->mov(reg_increments, h->ptr[reg_increments + id_offset + field_offset]); + add_increments(); + } else { + add_increments(); } }; - const auto id_offset = loop_id * sizeof(jit_snippets_call_args::loop_args_t); - if (use_runtime_args) { - jit_aux_gpr_holder gpr_holder(h, aux_gpr_idxs, in); // loop_end has only input registers - reg_increments = gpr_holder.get_reg(); - h->mov(reg_increments, h->ptr[abi_param1 + GET_OFF(loop_args)]); - h->mov(reg_increments, h->ptr[reg_increments + id_offset + field_offset]); - add_increments(); - } else { - add_increments(); - } - }; - if (!evaluate_once) { apply_increments(are_ptr_increments_dynamic, GET_OFF_LOOP_ARGS(m_ptr_increments), ptr_increments, wa_increment); @@ -220,5 +261,5 @@ void jit_loop_end_emitter::emit_impl(const std::vector& in, const std::v /* ============================================================== */ -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git 
a/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_loop_emitters.hpp b/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_loop_emitters.hpp index 262bba39b7d74c..c0a2b53b100c62 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_loop_emitters.hpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_loop_emitters.hpp @@ -5,7 +5,6 @@ #pragma once #include "emitters/plugin/x64/jit_emitter.hpp" - #include "snippets/op/loop.hpp" #include "snippets/utils/utils.hpp" @@ -14,25 +13,36 @@ namespace intel_cpu { /* ================== jit_loop_begin_emitter ====================== */ -class jit_loop_begin_emitter: public jit_emitter { +class jit_loop_begin_emitter : public jit_emitter { public: - jit_loop_begin_emitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, + jit_loop_begin_emitter(dnnl::impl::cpu::x64::jit_generator* h, + dnnl::impl::cpu::x64::cpu_isa_t isa, const ov::snippets::lowered::ExpressionPtr& expr); - size_t get_inputs_num() const override { return 0; } + size_t get_inputs_num() const override { + return 0; + } - void emit_code(const std::vector &in_idxs, const std::vector &out_idxs, - const std::vector &pool_vec_idxs = {}, const std::vector &pool_gpr_idxs = {}) const override; + void emit_code(const std::vector& in_idxs, + const std::vector& out_idxs, + const std::vector& pool_vec_idxs = {}, + const std::vector& pool_gpr_idxs = {}) const override; - void set_loop_end_label(const std::shared_ptr& label) { loop_end_label = label; } - std::shared_ptr get_begin_label() { return loop_begin_label; } + void set_loop_end_label(const std::shared_ptr& label) { + loop_end_label = label; + } + std::shared_ptr get_begin_label() { + return loop_begin_label; + } protected: - void validate_arguments(const std::vector &in, const std::vector &out) const override; + void validate_arguments(const std::vector& in, const std::vector& out) const override; void emit_impl(const std::vector& in, const std::vector& out) const override; // `jit_loop_begin_emitter` handles manually aux_gpr allocation using `jit_aux_gpr_holder` - size_t aux_gprs_count() const override { return 0; } + size_t aux_gprs_count() const override { + return 0; + } std::shared_ptr loop_begin_label = nullptr; std::shared_ptr loop_end_label = nullptr; @@ -43,27 +53,33 @@ class jit_loop_begin_emitter: public jit_emitter { bool is_work_amount_dynamic = false; }; - /* ============================================================== */ /* ================== jit_loop_end_emitter ====================== */ -class jit_loop_end_emitter: public jit_emitter { +class jit_loop_end_emitter : public jit_emitter { public: - jit_loop_end_emitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, - const ov::snippets::lowered::ExpressionPtr& expr); + jit_loop_end_emitter(dnnl::impl::cpu::x64::jit_generator* h, + dnnl::impl::cpu::x64::cpu_isa_t isa, + const ov::snippets::lowered::ExpressionPtr& expr); - size_t get_inputs_num() const override { return 0; } + size_t get_inputs_num() const override { + return 0; + } - void emit_code(const std::vector &in_idxs, const std::vector &out_idxs, - const std::vector &pool_vec_idxs = {}, const std::vector &pool_gpr_idxs = {}) const override; + void emit_code(const std::vector& in_idxs, + const std::vector& out_idxs, + const std::vector& pool_vec_idxs = {}, + const std::vector& pool_gpr_idxs = {}) const override; protected: - void validate_arguments(const std::vector &in, const std::vector &out) const override; + void validate_arguments(const 
std::vector& in, const std::vector& out) const override; void emit_impl(const std::vector& in, const std::vector& out) const override; // `jit_loop_end_emitter` handles manually aux_gpr allocation using `jit_aux_gpr_holder` - size_t aux_gprs_count() const override { return 0; } + size_t aux_gprs_count() const override { + return 0; + } static ov::snippets::lowered::ExpressionPtr get_loop_begin_expr(const ov::snippets::lowered::ExpressionPtr& expr); @@ -86,5 +102,5 @@ class jit_loop_end_emitter: public jit_emitter { /* ============================================================== */ -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_memory_emitters.cpp b/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_memory_emitters.cpp index b7a5fc2e993398..307ef63a8e6a2e 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_memory_emitters.cpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_memory_emitters.cpp @@ -5,10 +5,9 @@ #include "jit_memory_emitters.hpp" #include "emitters/snippets/jit_snippets_call_args.hpp" +#include "snippets/op/buffer.hpp" #include "transformations/snippets/x64/op/load_convert.hpp" #include "transformations/snippets/x64/op/store_convert.hpp" -#include "snippets/op/buffer.hpp" - using namespace Xbyak; using namespace dnnl::impl; @@ -21,7 +20,10 @@ using jit_generator = dnnl::impl::cpu::x64::jit_generator; using cpu_isa_t = dnnl::impl::cpu::x64::cpu_isa_t; using ExpressionPtr = ov::snippets::lowered::ExpressionPtr; -jit_memory_emitter::jit_memory_emitter(jit_generator* h, cpu_isa_t isa, const ExpressionPtr& expr, emitter_in_out_map in_out_type) +jit_memory_emitter::jit_memory_emitter(jit_generator* h, + cpu_isa_t isa, + const ExpressionPtr& expr, + emitter_in_out_map in_out_type) : jit_emitter(h, isa) { in_out_type_ = in_out_type; @@ -36,7 +38,8 @@ jit_memory_emitter::jit_memory_emitter(jit_generator* h, cpu_isa_t isa, const Ex compiled_byte_offset = memory_access->get_input_offset(); buffer_cluster_id = get_parent_buffer_cluster_id(expr); } else if (in_out_type_ == emitter_in_out_map::vec_to_gpr) { - OV_CPU_JIT_EMITTER_ASSERT(memory_access->is_memory_access_output_port(0), "must be output port - memory access"); + OV_CPU_JIT_EMITTER_ASSERT(memory_access->is_memory_access_output_port(0), + "must be output port - memory access"); count = memory_access->get_output_count(); compiled_byte_offset = memory_access->get_output_offset(); buffer_cluster_id = get_consumer_buffer_cluster_id(expr); @@ -46,7 +49,8 @@ jit_memory_emitter::jit_memory_emitter(jit_generator* h, cpu_isa_t isa, const Ex if (ov::snippets::utils::is_dynamic_value(compiled_byte_offset)) { is_offset_runtime = true; - // Compiled byte offset is zero to manually `add` runtime offset before operation and `sub` after to reset pointer in the register + // Compiled byte offset is zero to manually `add` runtime offset before operation and `sub` after to reset + // pointer in the register compiled_byte_offset = 0; OV_CPU_JIT_EMITTER_ASSERT(buffer_cluster_id != SIZE_MAX, "Incorrect buffer offset in call_args"); } @@ -84,8 +88,10 @@ std::vector jit_memory_emitter::get_available_aux_gprs() const { return available_aux_gprs; } -void jit_memory_emitter::emit_code(const std::vector &in_idxs, const std::vector &out_idxs, - const std::vector &pool_vec_idxs, const std::vector &pool_gpr_idxs) const { +void jit_memory_emitter::emit_code(const std::vector& in_idxs, + const std::vector& out_idxs, + const 
std::vector& pool_vec_idxs, + const std::vector& pool_gpr_idxs) const { emitter_preamble(in_idxs, out_idxs, pool_vec_idxs, pool_gpr_idxs); Reg64 reg_runtime_params = abi_param1; // defined by jit_kernel_emitter @@ -152,19 +158,26 @@ void jit_load_broadcast_emitter::emit_impl(const std::vector& in, const } template -void jit_load_broadcast_emitter::emit_isa(const std::vector &in, const std::vector &out) const { - using Vmm = typename dnnl::impl::utils::conditional3::type; +void jit_load_broadcast_emitter::emit_isa(const std::vector& in, const std::vector& out) const { + using Vmm = typename dnnl::impl::utils:: + conditional3::type; Reg64 in_reg(in[0]); Vmm vmm_dst = Vmm(out[0]); - // It doesn't really matter if we broadcast or `movss` for vector tails so keep only one version for `BroadcastLoad`, - // key point here is not to add post-increment, it might be fixed by some other approach in future + // It doesn't really matter if we broadcast or `movss` for vector tails so keep only one version for + // `BroadcastLoad`, key point here is not to add post-increment, it might be fixed by some other approach in future switch (src_prc.size()) { - case 4: h->uni_vbroadcastss(vmm_dst, h->ptr[in_reg + compiled_byte_offset]); break; - case 2: h->vpbroadcastw(vmm_dst, h->ptr[in_reg + compiled_byte_offset]); break; - case 1: h->vpbroadcastb(vmm_dst, h->ptr[in_reg + compiled_byte_offset]); break; - default: OV_CPU_JIT_EMITTER_THROW("Unsupported data type"); + case 4: + h->uni_vbroadcastss(vmm_dst, h->ptr[in_reg + compiled_byte_offset]); + break; + case 2: + h->vpbroadcastw(vmm_dst, h->ptr[in_reg + compiled_byte_offset]); + break; + case 1: + h->vpbroadcastb(vmm_dst, h->ptr[in_reg + compiled_byte_offset]); + break; + default: + OV_CPU_JIT_EMITTER_THROW("Unsupported data type"); } } @@ -190,5 +203,5 @@ void jit_store_memory_emitter::emit_data() const { store_emitter->emit_data(); } -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_memory_emitters.hpp b/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_memory_emitters.hpp index 55a41c977dd67c..d21e85d53e7193 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_memory_emitters.hpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_memory_emitters.hpp @@ -7,17 +7,20 @@ #include "emitters/plugin/x64/jit_emitter.hpp" #include "emitters/plugin/x64/jit_load_store_emitters.hpp" - namespace ov { namespace intel_cpu { -class jit_memory_emitter : public jit_emitter { +class jit_memory_emitter : public jit_emitter { public: - jit_memory_emitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, - const ov::snippets::lowered::ExpressionPtr& expr, emitter_in_out_map in_out_type); + jit_memory_emitter(dnnl::impl::cpu::x64::jit_generator* h, + dnnl::impl::cpu::x64::cpu_isa_t isa, + const ov::snippets::lowered::ExpressionPtr& expr, + emitter_in_out_map in_out_type); - void emit_code(const std::vector &in_idxs, const std::vector &out_idxs, - const std::vector &pool_vec_idxs = {}, const std::vector &pool_gpr_idxs = {}) const override; + void emit_code(const std::vector& in_idxs, + const std::vector& out_idxs, + const std::vector& pool_vec_idxs = {}, + const std::vector& pool_gpr_idxs = {}) const override; protected: static size_t get_parent_buffer_cluster_id(const ov::snippets::lowered::ExpressionPtr& expr); @@ -36,16 +39,19 @@ class jit_memory_emitter : public jit_emitter { bool is_offset_runtime = false; #ifdef 
SNIPPETS_DEBUG_CAPS - friend std::string init_info_jit_memory_emitter(const jit_memory_emitter *emitter); + friend std::string init_info_jit_memory_emitter(const jit_memory_emitter* emitter); #endif }; class jit_load_memory_emitter : public jit_memory_emitter { public: - jit_load_memory_emitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, + jit_load_memory_emitter(dnnl::impl::cpu::x64::jit_generator* h, + dnnl::impl::cpu::x64::cpu_isa_t isa, const ov::snippets::lowered::ExpressionPtr& expr); - size_t get_inputs_num() const override {return 0;} + size_t get_inputs_num() const override { + return 0; + } private: void emit_impl(const std::vector& in, const std::vector& out) const override; @@ -58,24 +64,30 @@ class jit_load_memory_emitter : public jit_memory_emitter { class jit_load_broadcast_emitter : public jit_memory_emitter { public: - jit_load_broadcast_emitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, + jit_load_broadcast_emitter(dnnl::impl::cpu::x64::jit_generator* h, + dnnl::impl::cpu::x64::cpu_isa_t isa, const ov::snippets::lowered::ExpressionPtr& expr); - size_t get_inputs_num() const override {return 0;} + size_t get_inputs_num() const override { + return 0; + } private: void emit_impl(const std::vector& in, const std::vector& out) const override; template - void emit_isa(const std::vector &in, const std::vector &out) const; + void emit_isa(const std::vector& in, const std::vector& out) const; }; -class jit_store_memory_emitter : public jit_memory_emitter { +class jit_store_memory_emitter : public jit_memory_emitter { public: - jit_store_memory_emitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, + jit_store_memory_emitter(dnnl::impl::cpu::x64::jit_generator* h, + dnnl::impl::cpu::x64::cpu_isa_t isa, const ov::snippets::lowered::ExpressionPtr& expr); - size_t get_inputs_num() const override {return 1;} + size_t get_inputs_num() const override { + return 1; + } private: void emit_impl(const std::vector& in, const std::vector& out) const override; @@ -86,5 +98,5 @@ class jit_store_memory_emitter : public jit_memory_emitter { std::unique_ptr store_emitter = nullptr; }; -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_perf_count_chrono_emitters.cpp b/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_perf_count_chrono_emitters.cpp index f89e906ce57593..ccb4da742e38d6 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_perf_count_chrono_emitters.cpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_perf_count_chrono_emitters.cpp @@ -3,9 +3,9 @@ // #ifdef SNIPPETS_DEBUG_CAPS -#include "jit_perf_count_chrono_emitters.hpp" +# include "jit_perf_count_chrono_emitters.hpp" -#include "emitters/plugin/x64/utils.hpp" +# include "emitters/plugin/x64/utils.hpp" using namespace dnnl::impl; using namespace dnnl::impl::utils; @@ -17,8 +17,10 @@ using namespace Xbyak::util; namespace ov { namespace intel_cpu { -jit_perf_count_chrono_start_emitter::jit_perf_count_chrono_start_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, - const std::shared_ptr& n) : jit_emitter(host, host_isa) { +jit_perf_count_chrono_start_emitter::jit_perf_count_chrono_start_emitter(dnnl::impl::cpu::x64::jit_generator* host, + dnnl::impl::cpu::x64::cpu_isa_t host_isa, + const std::shared_ptr& n) + : jit_emitter(host, host_isa) { m_start_node = ov::as_type_ptr(n); 
} @@ -30,11 +32,12 @@ void jit_perf_count_chrono_start_emitter::set_start_time(snippets::op::PerfCount start_node->set_start_time(); } -void jit_perf_count_chrono_start_emitter::emit_impl(const std::vector &in_idxs, const std::vector &out_idxs) const { +void jit_perf_count_chrono_start_emitter::emit_impl(const std::vector& in_idxs, + const std::vector& out_idxs) const { EmitABIRegSpills spill(h); spill.preamble(); - const auto &set_start_time_overload = static_cast(set_start_time); + const auto& set_start_time_overload = static_cast(set_start_time); h->mov(h->rax, reinterpret_cast(set_start_time_overload)); h->mov(abi_param1, reinterpret_cast(m_start_node.get())); @@ -46,8 +49,10 @@ void jit_perf_count_chrono_start_emitter::emit_impl(const std::vector &i } ///////////////////jit_perf_count_chrono_end_emitter//////////////////////////////////// -jit_perf_count_chrono_end_emitter::jit_perf_count_chrono_end_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, - const std::shared_ptr& n) : jit_emitter(host, host_isa) { +jit_perf_count_chrono_end_emitter::jit_perf_count_chrono_end_emitter(dnnl::impl::cpu::x64::jit_generator* host, + dnnl::impl::cpu::x64::cpu_isa_t host_isa, + const std::shared_ptr& n) + : jit_emitter(host, host_isa) { m_end_node = ov::as_type_ptr(n); } @@ -59,11 +64,13 @@ void jit_perf_count_chrono_end_emitter::set_accumulated_time(snippets::op::PerfC end_node->set_accumulated_time(); } -void jit_perf_count_chrono_end_emitter::emit_impl(const std::vector &in_idxs, const std::vector &out_idxs) const { +void jit_perf_count_chrono_end_emitter::emit_impl(const std::vector& in_idxs, + const std::vector& out_idxs) const { EmitABIRegSpills spill(h); spill.preamble(); - const auto &set_accumulated_time_overload = static_cast(set_accumulated_time); + const auto& set_accumulated_time_overload = + static_cast(set_accumulated_time); h->mov(h->rax, reinterpret_cast(set_accumulated_time_overload)); h->mov(abi_param1, reinterpret_cast(m_end_node.get())); @@ -74,6 +81,6 @@ void jit_perf_count_chrono_end_emitter::emit_impl(const std::vector &in_ spill.postamble(); } -} // namespace intel_cpu -} // namespace ov -#endif // SNIPPETS_DEBUG_CAPS +} // namespace intel_cpu +} // namespace ov +#endif // SNIPPETS_DEBUG_CAPS diff --git a/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_perf_count_chrono_emitters.hpp b/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_perf_count_chrono_emitters.hpp index e8608afc7f1428..817c0583609778 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_perf_count_chrono_emitters.hpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_perf_count_chrono_emitters.hpp @@ -3,24 +3,23 @@ // #ifdef SNIPPETS_DEBUG_CAPS -#pragma once - -#include "emitters/plugin/x64/jit_emitter.hpp" - -#include "snippets/op/perf_count.hpp" +# pragma once +# include "emitters/plugin/x64/jit_emitter.hpp" +# include "snippets/op/perf_count.hpp" namespace ov { namespace intel_cpu { class jit_perf_count_chrono_start_emitter : public jit_emitter { public: - jit_perf_count_chrono_start_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, + jit_perf_count_chrono_start_emitter(dnnl::impl::cpu::x64::jit_generator* host, + dnnl::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n); size_t get_inputs_num() const override; private: - void emit_impl(const std::vector &in_idxs, const std::vector &out_idxs) const override; + void emit_impl(const std::vector& in_idxs, const std::vector& out_idxs) const 
override; static void set_start_time(snippets::op::PerfCountBegin* start_node); std::shared_ptr m_start_node = nullptr; @@ -28,17 +27,18 @@ class jit_perf_count_chrono_start_emitter : public jit_emitter { class jit_perf_count_chrono_end_emitter : public jit_emitter { public: - jit_perf_count_chrono_end_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, + jit_perf_count_chrono_end_emitter(dnnl::impl::cpu::x64::jit_generator* host, + dnnl::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n); size_t get_inputs_num() const override; private: - void emit_impl(const std::vector &in_idxs, const std::vector &out_idxs) const override; + void emit_impl(const std::vector& in_idxs, const std::vector& out_idxs) const override; static void set_accumulated_time(snippets::op::PerfCountEnd* end_node); std::shared_ptr m_end_node = nullptr; }; -} // namespace intel_cpu -} // namespace ov -#endif // SNIPPETS_DEBUG_CAPS +} // namespace intel_cpu +} // namespace ov +#endif // SNIPPETS_DEBUG_CAPS diff --git a/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_perf_count_rdtsc_emitters.cpp b/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_perf_count_rdtsc_emitters.cpp index c469c052ce3ef6..e951f8042ad762 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_perf_count_rdtsc_emitters.cpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_perf_count_rdtsc_emitters.cpp @@ -3,7 +3,7 @@ // #ifdef SNIPPETS_DEBUG_CAPS -#include "jit_perf_count_rdtsc_emitters.hpp" +# include "jit_perf_count_rdtsc_emitters.hpp" using namespace dnnl::impl; using namespace dnnl::impl::utils; @@ -15,8 +15,10 @@ using namespace Xbyak::util; namespace ov { namespace intel_cpu { -jit_perf_count_rdtsc_start_emitter::jit_perf_count_rdtsc_start_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, - const std::shared_ptr& n) : jit_emitter(host, host_isa) { +jit_perf_count_rdtsc_start_emitter::jit_perf_count_rdtsc_start_emitter(dnnl::impl::cpu::x64::jit_generator* host, + dnnl::impl::cpu::x64::cpu_isa_t host_isa, + const std::shared_ptr& n) + : jit_emitter(host, host_isa) { m_start_node = ov::as_type_ptr(n); } @@ -24,16 +26,18 @@ size_t jit_perf_count_rdtsc_start_emitter::get_inputs_num() const { return 0; } -void jit_perf_count_rdtsc_start_emitter::emit_impl(const std::vector &in_idxs, const std::vector &out_idxs) const { +void jit_perf_count_rdtsc_start_emitter::emit_impl(const std::vector& in_idxs, + const std::vector& out_idxs) const { h->push(h->rax); h->push(h->rdx); - // The EDX register is loaded with the high-order 32 bits of the MSR and the EAX register is loaded with the low-order 32 bits. + // The EDX register is loaded with the high-order 32 bits of the MSR and the EAX register is loaded with the + // low-order 32 bits. 
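// Editor's note (not part of the diff): the instruction sequence emitted just below is roughly
// equivalent to this C++ sketch. The intrinsic names are assumptions used for illustration only
// (__rdtsc() and _mm_lfence() from <x86intrin.h>/<emmintrin.h> on x86-64); the emitter generates
// the raw instructions itself rather than calling any intrinsic.
//
//     _mm_lfence();                        // ensure earlier instructions have completed
//     uint64_t tsc = __rdtsc();            // compiler composes EDX:EAX into one 64-bit counter
//     _mm_lfence();                        // keep later instructions from starting before the read
//     m_start_node->start_count = tsc;     // matches: shl rdx, 0x20; or rdx, rax; store via rax
//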
h->lfence(); h->rdtsc(); h->lfence(); - h->shl(h->rdx, 0x20); // shift to higher half of rdx 0x20(32) - h->or_(h->rdx, h->rax); // rdx has current tsc + h->shl(h->rdx, 0x20); // shift to higher half of rdx 0x20(32) + h->or_(h->rdx, h->rax); // rdx has current tsc h->mov(h->rax, reinterpret_cast(&m_start_node->start_count)); h->mov(qword[h->rax], h->rdx); @@ -43,16 +47,19 @@ void jit_perf_count_rdtsc_start_emitter::emit_impl(const std::vector &in } ///////////////////jit_perf_count_rdtsc_end_emitter//////////////////////////////////// -jit_perf_count_rdtsc_end_emitter::jit_perf_count_rdtsc_end_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, - const std::shared_ptr& n) : jit_emitter(host, host_isa) { - m_end_node = ov::as_type_ptr(n); +jit_perf_count_rdtsc_end_emitter::jit_perf_count_rdtsc_end_emitter(dnnl::impl::cpu::x64::jit_generator* host, + dnnl::impl::cpu::x64::cpu_isa_t host_isa, + const std::shared_ptr& n) + : jit_emitter(host, host_isa) { + m_end_node = ov::as_type_ptr(n); } size_t jit_perf_count_rdtsc_end_emitter::get_inputs_num() const { return 0; } -void jit_perf_count_rdtsc_end_emitter::emit_impl(const std::vector &in_idxs, const std::vector &out_idxs) const { +void jit_perf_count_rdtsc_end_emitter::emit_impl(const std::vector& in_idxs, + const std::vector& out_idxs) const { h->push(h->rax); h->push(h->rdx); @@ -79,6 +86,6 @@ void jit_perf_count_rdtsc_end_emitter::emit_impl(const std::vector &in_i h->pop(h->rax); } -} // namespace intel_cpu -} // namespace ov -#endif // SNIPPETS_DEBUG_CAPS +} // namespace intel_cpu +} // namespace ov +#endif // SNIPPETS_DEBUG_CAPS diff --git a/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_perf_count_rdtsc_emitters.hpp b/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_perf_count_rdtsc_emitters.hpp index c3ae1aac01ab9d..343807bdfcd076 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_perf_count_rdtsc_emitters.hpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_perf_count_rdtsc_emitters.hpp @@ -3,40 +3,40 @@ // #ifdef SNIPPETS_DEBUG_CAPS -#pragma once - -#include "emitters/plugin/x64/jit_emitter.hpp" - -#include "transformations/snippets/x64/op/perf_count_rdtsc.hpp" +# pragma once +# include "emitters/plugin/x64/jit_emitter.hpp" +# include "transformations/snippets/x64/op/perf_count_rdtsc.hpp" namespace ov { namespace intel_cpu { class jit_perf_count_rdtsc_start_emitter : public jit_emitter { public: - jit_perf_count_rdtsc_start_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, - const std::shared_ptr& n); + jit_perf_count_rdtsc_start_emitter(dnnl::impl::cpu::x64::jit_generator* host, + dnnl::impl::cpu::x64::cpu_isa_t host_isa, + const std::shared_ptr& n); size_t get_inputs_num() const override; private: - void emit_impl(const std::vector &in_idxs, const std::vector &out_idxs) const override; + void emit_impl(const std::vector& in_idxs, const std::vector& out_idxs) const override; std::shared_ptr m_start_node = nullptr; }; class jit_perf_count_rdtsc_end_emitter : public jit_emitter { public: - jit_perf_count_rdtsc_end_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, - const std::shared_ptr& n); + jit_perf_count_rdtsc_end_emitter(dnnl::impl::cpu::x64::jit_generator* host, + dnnl::impl::cpu::x64::cpu_isa_t host_isa, + const std::shared_ptr& n); size_t get_inputs_num() const override; private: - void emit_impl(const std::vector &in_idxs, const std::vector &out_idxs) const override; + 
void emit_impl(const std::vector& in_idxs, const std::vector& out_idxs) const override; std::shared_ptr m_end_node = nullptr; }; -} // namespace intel_cpu -} // namespace ov -#endif // SNIPPETS_DEBUG_CAPS +} // namespace intel_cpu +} // namespace ov +#endif // SNIPPETS_DEBUG_CAPS diff --git a/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_segfault_detector_emitter.cpp b/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_segfault_detector_emitter.cpp index f88c345ff055b5..c513e969144d1c 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_segfault_detector_emitter.cpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_segfault_detector_emitter.cpp @@ -4,8 +4,9 @@ #ifdef SNIPPETS_DEBUG_CAPS -#include "jit_segfault_detector_emitter.hpp" -#include "emitters/plugin/x64/utils.hpp" +# include "jit_segfault_detector_emitter.hpp" + +# include "emitters/plugin/x64/utils.hpp" using namespace dnnl::impl::utils; using namespace dnnl::impl; @@ -18,22 +19,28 @@ namespace intel_cpu { std::shared_ptr> g_custom_segfault_handler = std::make_shared>(); -jit_uni_segfault_detector_emitter::jit_uni_segfault_detector_emitter(dnnl::impl::cpu::x64::jit_generator* host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, - jit_emitter* target_emitter, bool is_load, bool is_store, std::string target_node_name) : - jit_emitter(host, host_isa), - m_target_emitter(target_emitter), - is_target_use_load_emitter(is_load), - is_target_use_store_emitter(is_store), - m_target_node_name(target_node_name) { +jit_uni_segfault_detector_emitter::jit_uni_segfault_detector_emitter(dnnl::impl::cpu::x64::jit_generator* host, + dnnl::impl::cpu::x64::cpu_isa_t host_isa, + jit_emitter* target_emitter, + bool is_load, + bool is_store, + std::string target_node_name) + : jit_emitter(host, host_isa), + m_target_emitter(target_emitter), + is_target_use_load_emitter(is_load), + is_target_use_store_emitter(is_store), + m_target_node_name(target_node_name) {} + +size_t jit_uni_segfault_detector_emitter::get_inputs_num() const { + return 1; } -size_t jit_uni_segfault_detector_emitter::get_inputs_num() const { return 1; } - const jit_emitter* jit_uni_segfault_detector_emitter::get_target_emitter() const { return m_target_emitter; } -void jit_uni_segfault_detector_emitter::emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const { +void jit_uni_segfault_detector_emitter::emit_impl(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { save_target_emitter(); if (is_target_use_load_emitter) { memory_track(in_vec_idxs[0]); @@ -47,7 +54,8 @@ void jit_uni_segfault_detector_emitter::save_target_emitter() const { EmitABIRegSpills spill(h); spill.preamble(); - const auto &set_local_handler_overload = static_cast(set_local_handler); + const auto& set_local_handler_overload = + static_cast(set_local_handler); h->mov(h->rax, reinterpret_cast(set_local_handler_overload)); h->mov(abi_param1, reinterpret_cast(this)); @@ -85,7 +93,7 @@ void jit_uni_segfault_detector_emitter::memory_track(size_t gpr_idx_for_mem_addr h->pop(h->r15); } -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov #endif diff --git a/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_segfault_detector_emitter.hpp b/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_segfault_detector_emitter.hpp index 21ffaa84cf3db8..86191ae865fe38 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_segfault_detector_emitter.hpp +++ 
b/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_segfault_detector_emitter.hpp @@ -4,11 +4,12 @@ #ifdef SNIPPETS_DEBUG_CAPS -#pragma once +# pragma once -#include -#include "emitters/plugin/x64/jit_emitter.hpp" -#include "openvino/runtime/threading/thread_local.hpp" +# include + +# include "emitters/plugin/x64/jit_emitter.hpp" +# include "openvino/runtime/threading/thread_local.hpp" using namespace ov::threading; @@ -20,18 +21,22 @@ extern std::shared_ptr> g_custom class jit_uni_segfault_detector_emitter : public jit_emitter { public: - jit_uni_segfault_detector_emitter(dnnl::impl::cpu::x64::jit_generator* host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, - jit_emitter* target_emitter, bool is_load, bool is_store, std::string target_node_name); + jit_uni_segfault_detector_emitter(dnnl::impl::cpu::x64::jit_generator* host, + dnnl::impl::cpu::x64::cpu_isa_t host_isa, + jit_emitter* target_emitter, + bool is_load, + bool is_store, + std::string target_node_name); size_t get_inputs_num() const override; const jit_emitter* get_target_emitter() const; private: - // emit code is to save "this" pointer(jit_uni_segfault_detector_emitter) to global handler, then print info w/ it's target_emitter. - // and to save tracked memory address, iteration, etc to print + // emit code is to save "this" pointer(jit_uni_segfault_detector_emitter) to global handler, then print info w/ it's + // target_emitter. and to save tracked memory address, iteration, etc to print void emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const override; - jit_emitter *m_target_emitter = nullptr; + jit_emitter* m_target_emitter = nullptr; bool is_target_use_load_emitter = false; bool is_target_use_store_emitter = false; std::string m_target_node_name = ""; @@ -44,10 +49,10 @@ class jit_uni_segfault_detector_emitter : public jit_emitter { mutable size_t current_address = 0; mutable size_t iteration = 0; - friend std::string init_info_jit_uni_segfault_detector_emitter(const jit_uni_segfault_detector_emitter *emitter); + friend std::string init_info_jit_uni_segfault_detector_emitter(const jit_uni_segfault_detector_emitter* emitter); }; -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov #endif \ No newline at end of file diff --git a/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_snippets_emitters.cpp b/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_snippets_emitters.cpp index d8066f9a126543..ba4012de86d83d 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_snippets_emitters.cpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_snippets_emitters.cpp @@ -15,7 +15,10 @@ using jit_generator = dnnl::impl::cpu::x64::jit_generator; using cpu_isa_t = dnnl::impl::cpu::x64::cpu_isa_t; using ExpressionPtr = ov::snippets::lowered::ExpressionPtr; -jit_nop_emitter::jit_nop_emitter(jit_generator* h, cpu_isa_t isa, const ExpressionPtr& expr, emitter_in_out_map emitter_type) +jit_nop_emitter::jit_nop_emitter(jit_generator* h, + cpu_isa_t isa, + const ExpressionPtr& expr, + emitter_in_out_map emitter_type) : jit_emitter(h, isa) { in_out_type_ = emitter_type; } @@ -25,7 +28,8 @@ jit_parameter_emitter::jit_parameter_emitter(jit_generator* h, cpu_isa_t isa, co in_out_type_ = emitter_in_out_map::gpr_to_gpr; } -jit_result_emitter::jit_result_emitter(jit_generator* h, cpu_isa_t isa, const ExpressionPtr& expr) : jit_nop_emitter(h, isa, expr) { +jit_result_emitter::jit_result_emitter(jit_generator* h, cpu_isa_t isa, const ExpressionPtr& expr) + : 
jit_nop_emitter(h, isa, expr) { in_out_type_ = emitter_in_out_map::gpr_to_gpr; } @@ -34,14 +38,13 @@ jit_broadcast_move_emitter::jit_broadcast_move_emitter(jit_generator* h, cpu_isa const auto n = expr->get_node(); if (n->get_input_element_type(0) != n->get_output_element_type(0)) OV_CPU_JIT_EMITTER_THROW("supports only equal input and output types but gets: ", - n->get_input_element_type(0), - " and ", - n->get_output_element_type(0)); + n->get_input_element_type(0), + " and ", + n->get_output_element_type(0)); byte_size = n->get_input_element_type(0).size(); } -void jit_broadcast_move_emitter::emit_impl(const std::vector& in, - const std::vector& out) const { +void jit_broadcast_move_emitter::emit_impl(const std::vector& in, const std::vector& out) const { if (host_isa_ == dnnl::impl::cpu::x64::sse41) { emit_isa(in, out); } else if (host_isa_ == dnnl::impl::cpu::x64::avx2) { @@ -54,17 +57,24 @@ void jit_broadcast_move_emitter::emit_impl(const std::vector& in, } template -void jit_broadcast_move_emitter::emit_isa(const std::vector &in, const std::vector &out) const { - using Vmm = typename dnnl::impl::utils::conditional3::type; +void jit_broadcast_move_emitter::emit_isa(const std::vector& in, const std::vector& out) const { + using Vmm = typename dnnl::impl::utils:: + conditional3::type; Xmm xmm_src0 = Xmm(in[0]); - Vmm vmm_dst = Vmm(out[0]); + Vmm vmm_dst = Vmm(out[0]); switch (byte_size) { - case 4: h->uni_vbroadcastss(vmm_dst, xmm_src0); break; - case 2: h->vpbroadcastw(vmm_dst, xmm_src0); break; - case 1: h->vpbroadcastb(vmm_dst, xmm_src0); break; - default: OV_CPU_JIT_EMITTER_THROW("unsupported data type"); + case 4: + h->uni_vbroadcastss(vmm_dst, xmm_src0); + break; + case 2: + h->vpbroadcastw(vmm_dst, xmm_src0); + break; + case 1: + h->vpbroadcastb(vmm_dst, xmm_src0); + break; + default: + OV_CPU_JIT_EMITTER_THROW("unsupported data type"); } } @@ -74,14 +84,20 @@ int32_t jit_scalar_emitter::read_value(const ov::snippets::lowered::ExpressionPt const auto& precision = n->get_output_element_type(0); int32_t res = INT_MIN; switch (precision) { - case element::i32: res = n->cast_vector(1)[0]; break; - case element::f32: res = dnnl::impl::cpu::x64::float2int(n->cast_vector(1)[0]); break; - default: OV_CPU_JIT_EMITTER_THROW("doesn't support ", precision); + case element::i32: + res = n->cast_vector(1)[0]; + break; + case element::f32: + res = dnnl::impl::cpu::x64::float2int(n->cast_vector(1)[0]); + break; + default: + OV_CPU_JIT_EMITTER_THROW("doesn't support ", precision); } return res; } -jit_scalar_emitter::jit_scalar_emitter(jit_generator* h, cpu_isa_t isa, const ExpressionPtr& expr) : jit_emitter(h, isa) { +jit_scalar_emitter::jit_scalar_emitter(jit_generator* h, cpu_isa_t isa, const ExpressionPtr& expr) + : jit_emitter(h, isa) { push_arg_entry_of("scalar", read_value(expr), true); prepare_table(); } @@ -89,21 +105,27 @@ jit_scalar_emitter::jit_scalar_emitter(jit_generator* h, cpu_isa_t isa, const Ex void jit_scalar_emitter::emit_impl(const std::vector& in, const std::vector& out) const { using isa = cpu_isa_t; switch (host_isa_) { - case isa::sse41: emit_isa(in, out); break; - case isa::avx2: emit_isa(in, out); break; - case isa::avx512_core: emit_isa(in, out); break; - default: OV_CPU_JIT_EMITTER_THROW("Unsupported isa ", host_isa_); + case isa::sse41: + emit_isa(in, out); + break; + case isa::avx2: + emit_isa(in, out); + break; + case isa::avx512_core: + emit_isa(in, out); + break; + default: + OV_CPU_JIT_EMITTER_THROW("Unsupported isa ", host_isa_); } } template -void 
jit_scalar_emitter::emit_isa(const std::vector &in, const std::vector &out) const { - using Vmm = typename dnnl::impl::utils::conditional3::type; - Vmm vmm_dst = Vmm(out[0]); +void jit_scalar_emitter::emit_isa(const std::vector& in, const std::vector& out) const { + using Vmm = typename dnnl::impl::utils:: + conditional3::type; + Vmm vmm_dst = Vmm(out[0]); h->uni_vbroadcastss(vmm_dst, table_val("scalar")); } - } // namespace intel_cpu } // namespace ov diff --git a/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_snippets_emitters.hpp b/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_snippets_emitters.hpp index c75f071c4ec7e0..6a91e3b7c47d3d 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_snippets_emitters.hpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_snippets_emitters.hpp @@ -6,16 +6,19 @@ #include "emitters/plugin/x64/jit_emitter.hpp" - namespace ov { namespace intel_cpu { class jit_nop_emitter : public jit_emitter { public: - jit_nop_emitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, - const ov::snippets::lowered::ExpressionPtr& expr, emitter_in_out_map emitter_type = gpr_to_gpr); + jit_nop_emitter(dnnl::impl::cpu::x64::jit_generator* h, + dnnl::impl::cpu::x64::cpu_isa_t isa, + const ov::snippets::lowered::ExpressionPtr& expr, + emitter_in_out_map emitter_type = gpr_to_gpr); - size_t get_inputs_num() const override {return 0;} + size_t get_inputs_num() const override { + return 0; + } private: void emit_impl(const std::vector& in, const std::vector& out) const override {} @@ -23,31 +26,40 @@ class jit_nop_emitter : public jit_emitter { class jit_parameter_emitter : public jit_nop_emitter { public: - jit_parameter_emitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, + jit_parameter_emitter(dnnl::impl::cpu::x64::jit_generator* h, + dnnl::impl::cpu::x64::cpu_isa_t isa, const ov::snippets::lowered::ExpressionPtr& expr); - size_t get_inputs_num() const override { return 0; } + size_t get_inputs_num() const override { + return 0; + } }; class jit_result_emitter : public jit_nop_emitter { public: - jit_result_emitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, + jit_result_emitter(dnnl::impl::cpu::x64::jit_generator* h, + dnnl::impl::cpu::x64::cpu_isa_t isa, const ov::snippets::lowered::ExpressionPtr& expr); - size_t get_inputs_num() const override {return 1;} + size_t get_inputs_num() const override { + return 1; + } }; class jit_broadcast_move_emitter : public jit_emitter { public: - jit_broadcast_move_emitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, + jit_broadcast_move_emitter(dnnl::impl::cpu::x64::jit_generator* h, + dnnl::impl::cpu::x64::cpu_isa_t isa, const ov::snippets::lowered::ExpressionPtr& expr); - size_t get_inputs_num() const override {return 1;} + size_t get_inputs_num() const override { + return 1; + } private: void emit_impl(const std::vector& in, const std::vector& out) const override; template - void emit_isa(const std::vector &in, const std::vector &out) const; + void emit_isa(const std::vector& in, const std::vector& out) const; private: size_t byte_size = 0lu; @@ -55,18 +67,23 @@ class jit_broadcast_move_emitter : public jit_emitter { class jit_scalar_emitter : public jit_emitter { public: - jit_scalar_emitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, + jit_scalar_emitter(dnnl::impl::cpu::x64::jit_generator* h, + dnnl::impl::cpu::x64::cpu_isa_t isa, const 
ov::snippets::lowered::ExpressionPtr& expr); - size_t get_inputs_num() const override {return 0;} - size_t aux_gprs_count() const override {return 1;} + size_t get_inputs_num() const override { + return 0; + } + size_t aux_gprs_count() const override { + return 1; + } static int32_t read_value(const ov::snippets::lowered::ExpressionPtr& expr); private: void emit_impl(const std::vector& in, const std::vector& out) const override; template - void emit_isa(const std::vector &in, const std::vector &out) const; + void emit_isa(const std::vector& in, const std::vector& out) const; }; -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/emitters/snippets/x64/kernel_executors/brgemm.cpp b/src/plugins/intel_cpu/src/emitters/snippets/x64/kernel_executors/brgemm.cpp index fad1be5a5d1289..58a31a1804782a 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/x64/kernel_executors/brgemm.cpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/x64/kernel_executors/brgemm.cpp @@ -1,135 +1,57 @@ -// Copyright (C) 2020-2023 Intel Corporation +// Copyright (C) 2020-2024 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // #include "brgemm.hpp" -#include - #include "common/utils.hpp" #include "dnnl_extension_utils.h" -#include "snippets/lowered/loop_manager.hpp" #include "snippets/lowered/pass/insert_specific_iterations.hpp" #include "transformations/snippets/x64/op/brgemm_cpu.hpp" #include "transformations/snippets/x64/op/brgemm_utils.hpp" -#define DIM_CAST(X) static_cast(X) -#define DTYPE_CAST(X) static_cast(DnnlExtensionUtils::ElementTypeToDataType(X)) - using namespace Xbyak; using namespace dnnl::impl; using namespace dnnl::impl::cpu::x64; -namespace { -size_t init_hash(dnnl_data_type_t dt_in0, dnnl_data_type_t dt_in1, bool is_with_amx, - bool is_with_comp, dnnl::impl::cpu::x64::cpu_isa_t isa) { - size_t seed = 0; -#define HASH(X) seed = hash_combine(seed, X) - HASH(dt_in0); HASH(dt_in1); - HASH(is_with_amx); HASH(is_with_comp); - HASH(isa); -#undef HASH - return seed; -} -} // namespace - namespace ov { namespace intel_cpu { -BrgemmKernelConfig::BrgemmKernelConfig(const element::Type& in0_dtype, const element::Type& in1_dtype, - bool is_with_amx, bool is_with_comp, - dnnl::impl::cpu::x64::cpu_isa_t primitive_isa) : - m_static_params(std::make_shared(in0_dtype, in1_dtype, - is_with_amx, is_with_comp, - primitive_isa)) { - m_hash = compute_hash(); -} -bool BrgemmKernelConfig::is_completed() const { - return !utils::one_of(0, m_M, m_N, m_K, m_LDA, m_LDB, m_LDC) || is_empty(); -} - -bool BrgemmKernelConfig::operator==(const BrgemmKernelConfig& rhs) const { -#define EQ(X) X == rhs.X - return EQ(m_hash) && EQ(m_beta) && - EQ(m_M) && EQ(m_N) && EQ(m_K) && - EQ(m_LDA) && EQ(m_LDB) && EQ(m_LDC) && - (EQ(m_static_params.get()) || *m_static_params == *(rhs.m_static_params)); -#undef EQ -} - -void BrgemmKernelConfig::update(dnnl_dim_t M, dnnl_dim_t N, dnnl_dim_t K, dnnl_dim_t LDA, dnnl_dim_t LDB, dnnl_dim_t LDC, float beta) { - // If M is zero, it means that Brgemm won't be executed (in Loop with work_amount = 0, for example) - // To process this case, we have to make this Config as empty (nullify runtime parameters) - if (utils::one_of(0, M, N, K)) { - m_M = 0; m_N = 0; m_K = 0; - m_LDA = 0; m_LDB = 0; m_LDC = 0; - m_beta = 0; - } else { - m_M = M; m_N = N; m_K = K; - m_LDA = LDA; m_LDB = LDB; m_LDC = LDC; - m_beta = beta; - } +BrgemmKernelConfig::BrgemmKernelConfig(const element::Type& in0_dtype, + const element::Type& in1_dtype, + 
bool is_with_comp, + dnnl::impl::cpu::x64::cpu_isa_t primitive_isa) + : BrgemmBaseKernelConfig(), + m_static_params(std::make_shared(in0_dtype, in1_dtype, is_with_comp, primitive_isa)) { m_hash = compute_hash(); } -bool BrgemmKernelConfig::is_empty() const { - return everyone_is(0, m_M, m_N, m_K, m_LDA, m_LDB, m_LDC, m_beta); -} - -BrgemmKernelConfig::operator amx_tile_config_t() const { - amx_tile_config_t res; - res.M = m_M; res.N = m_N; res.K = m_K; - return res; -} - -BrgemmKernelConfig::StaticParams::StaticParams(const element::Type& in0_dtype, const element::Type& in1_dtype, - bool is_with_amx, bool is_with_comp, - dnnl::impl::cpu::x64::cpu_isa_t primitive_isa) : - dt_in0(DTYPE_CAST(in0_dtype)), dt_in1(DTYPE_CAST(in1_dtype)), - is_with_amx(is_with_amx), is_with_comp(is_with_comp), - isa(primitive_isa), - hash(init_hash(dt_in0, dt_in1, is_with_amx, is_with_comp, isa)) { -} +BrgemmKernelConfig::StaticParams::StaticParams(const element::Type& in0_dtype, + const element::Type& in1_dtype, + bool is_with_comp, + dnnl::impl::cpu::x64::cpu_isa_t primitive_isa) + : StaticBaseParams(in0_dtype, in1_dtype, primitive_isa, compute_hash(is_with_comp)), + is_with_comp(is_with_comp) {} bool BrgemmKernelConfig::StaticParams::operator==(const StaticParams& rhs) const { -#define EQ(X) X == rhs.X - return EQ(hash) && EQ(dt_in0) && EQ(dt_in1)&& EQ(is_with_amx) && EQ(is_with_comp) && EQ(isa); -#undef EQ + return StaticBaseParams::operator==(rhs) && is_with_comp == rhs.is_with_comp; } -size_t BrgemmKernelConfig::compute_hash() const { - size_t seed = m_static_params->hash; -#define HASH(X) seed = hash_combine(seed, X) - HASH(m_M); HASH(m_N); HASH(m_K); - HASH(m_LDA); HASH(m_LDB); HASH(m_LDC); - HASH(m_beta); -#undef HASH - return seed; + +size_t BrgemmKernelConfig::StaticParams::compute_hash(bool is_with_comp) { + return hash_combine(0, is_with_comp); } #ifdef SNIPPETS_DEBUG_CAPS -#define PRINT(X) ss << #X << " = " << X << "\n" std::string BrgemmKernelConfig::StaticParams::to_string() const { std::stringstream ss; - PRINT(dt_in0); PRINT(dt_in1); - PRINT(is_with_amx); PRINT(is_with_comp); - PRINT(isa); + ss << StaticBaseParams::to_string(); + ss << "is_with_comp = " << is_with_comp << "\n"; return ss.str(); } - -std::string BrgemmKernelConfig::to_string() const { - std::stringstream ss; - ss << m_static_params->to_string() << "\n"; - PRINT(m_M); PRINT(m_N); PRINT(m_K); - PRINT(m_LDA); PRINT(m_LDB); PRINT(m_LDC); - PRINT(m_beta); - return ss.str(); -} -#undef PRINT #endif -BrgemmKernelExecutor::BrgemmKernelExecutor(ov::intel_cpu::MultiCacheWeakPtr kernel_cache, BrgemmKernelConfig config) : - CPUKernelExecutor(std::move(kernel_cache), std::move(config)) { } - +BrgemmKernelExecutor::BrgemmKernelExecutor(ov::intel_cpu::MultiCacheWeakPtr kernel_cache, BrgemmKernelConfig config) + : CPUKernelExecutor(std::move(kernel_cache), std::move(config)) {} std::shared_ptr BrgemmKernelExecutor::compile_kernel(const BrgemmKernelConfig& config) const { std::shared_ptr compiled_kernel = std::make_shared(); @@ -138,206 +60,54 @@ std::shared_ptr BrgemmKernelExecutor::compile_kernel(const if (config.is_empty()) return compiled_kernel; - cpu::x64::brgemm_desc_t desc; - auto status = brgemm_desc_init(&desc, config.get_isa(), cpu::x64::brgemm_strd, - config.get_dt_in0(), config.get_dt_in1(), - false, false, cpu::x64::brgemm_row_major, 1.f, - config.get_beta(), - config.get_LDA(), config.get_LDB(), config.get_LDC(), - config.get_M(), config.get_N(), config.get_K(), nullptr); - OV_CPU_JIT_EMITTER_ASSERT(status == dnnl_success, "Cannot 
initialize brgemm descriptor due to invalid params"); - - if (config.is_with_amx()) { - status = brgemm_init_tiles(desc, compiled_kernel->palette); - OV_CPU_JIT_EMITTER_ASSERT(status == dnnl_success, "Cannot initialize brgemm tiles due to invalid params"); - } - - cpu::x64::brgemm_kernel_t* kernel_ = nullptr; - status = brgemm_kernel_create(&kernel_, desc); - OV_CPU_JIT_EMITTER_ASSERT(status == dnnl_success, "Cannot create brgemm kernel due to invalid params"); - compiled_kernel->compiled_kernel = std::unique_ptr(kernel_); + create_brgemm_kernel(compiled_kernel->brgemm_kernel, + config.get_dt_in0(), + config.get_dt_in1(), + config.get_isa(), + config.get_M(), + config.get_N(), + config.get_K(), + config.get_LDA(), + config.get_LDB(), + config.get_LDC(), + config.get_beta()); return compiled_kernel; } -float BrgemmKernelExecutor::get_beta(const ov::snippets::lowered::LoopManagerPtr& loop_manager, int loop_id, - const ov::snippets::lowered::ExpandedLoopInfoPtr& current_expanded_loop_info) { - // Find all Expanded loops with the same Unified loop information -> they were decomposed from this Unified Loop. - // Note that LoopInfo are normalized and sorted (due to NormalizedLoopIDs pass). - // It means that previous executed Loops have Loop ID less the current Loop ID. - // - If there is executed Loop (work_amount > 0) and evaluated before the current -> the current Brgemm should have `beta = 1`. - // - If there is not this Loop -> the current executed Brgemm should have `beta = 0`. - if (loop_id > 0) { - const auto& current_unified_loop_info = current_expanded_loop_info->get_unified_loop_info(); - // Check the previous Loops - --loop_id; - while (loop_id >= 0) { - const auto& expanded_loop_info = loop_manager->get_loop_info(loop_id); - if (expanded_loop_info->get_unified_loop_info() != current_unified_loop_info) - return 0; - if (expanded_loop_info->get_work_amount() > 0) { - // there is previous executed Brgemm with `beta = 0` -> the current Brgemm should have `beta = 1` - return 1; - } - --loop_id; - } - } - return 0; -} + void BrgemmKernelExecutor::update_config(const ov::snippets::lowered::ExpressionPtr& expr, const ov::snippets::lowered::LinearIRCPtr& linear_ir, BrgemmKernelConfig& config) const { - const auto& input_pds = expr->get_input_port_descriptors(); - const auto& output_pds = expr->get_output_port_descriptors(); - OV_CPU_JIT_EMITTER_ASSERT((input_pds.size() == 2 || input_pds.size() == 3) && output_pds.size() == 1, - "Invalid number of in/out port descriptors"); - - const auto in0_shape = snippets::utils::get_planar_vdims(input_pds[0]->get_shape(), input_pds[0]->get_layout()); - const auto in1_shape = snippets::utils::get_planar_vdims(input_pds[1]->get_shape(), input_pds[1]->get_layout()); - auto in0_subtensor = input_pds[0]->get_subtensor(); - auto in1_subtensor = input_pds[1]->get_subtensor(); - - // Need to update M, K, N - // 1. If the original value in subtensor is `FULL_DIM`, it means that - // Brgemm block should process full tensor by this dim -> take dimension from shape - // 2. 
Otherwise, Brgemm block processes part of the tensor by this dim - // (there is blocking by this dimension) -> take from Loop increment - - auto M = *++in0_subtensor.rbegin(); - auto K = *in0_subtensor.rbegin(); - auto N = *in1_subtensor.rbegin(); - - size_t loop_idx = 0; - const auto& loop_ids = expr->get_loop_ids(); - const auto& loop_manager = linear_ir->get_loop_manager(); - auto get_loop_info = [&](){ - OPENVINO_ASSERT(loop_idx < loop_ids.size(), "Loop is missed"); - return loop_manager->get_loop_info(loop_ids[loop_idx++]); - }; - - /* ------- Dimension M ----------*/ - if (ov::snippets::utils::is_full_dim_value(M)) { - M = *++in0_shape.rbegin(); - } else { - const auto& current_expanded_loop_info = get_loop_info(); - const auto& in_ports = current_expanded_loop_info->get_input_ports(); - const auto& out_ports = current_expanded_loop_info->get_output_ports(); - // Quick validation check: Should we check that port is really Brgemm port? - // If BrgemmCopyB in the Loop by M -> first input port will be BrgemmCopyB with `incremented=false` - // to avoid extra checks, we validate only first input port - // Note: We check `is_incremented` attribute only for not incremented ports because - // this `is_incremented = true` can be changed by `CleanRepeatedDataPointerShifts` optimization - auto check_port = [&](const ov::snippets::lowered::LoopPort& p) { return p.dim_idx == 1; }; - OPENVINO_ASSERT(in_ports.size() > 1 && std::all_of(in_ports.cbegin(), in_ports.cend(), check_port) && - out_ports.size() == 1 && check_port(out_ports.back()), - "Incorrect Loop by Brgemm dimension M"); - M = current_expanded_loop_info->get_increment(); - input_pds[0]->set_subtensor_dim(1, M); - output_pds[0]->set_subtensor_dim(1, M); - } - - /* ------- Dimension N ----------*/ - if (ov::snippets::utils::is_full_dim_value(N)) { - N = *in1_shape.rbegin(); - } else { - const auto& current_expanded_loop_info = get_loop_info(); - const auto& in_ports = current_expanded_loop_info->get_input_ports(); - const auto& out_ports = current_expanded_loop_info->get_output_ports(); - // Quick validation check: Should we check that port is really Brgemm port? - // Note: We check `is_incremented` attribute only for not incremented ports because - // this `is_incremented = true` can be changed by `CleanRepeatedDataPointerShifts` optimization - auto check_port = [&](const ov::snippets::lowered::LoopPort& p) { return p.dim_idx == 0; }; - OPENVINO_ASSERT(in_ports.size() >= 2 && !in_ports.front().is_incremented && std::all_of(in_ports.cbegin(), in_ports.cend(), check_port) && - out_ports.size() == 1 && check_port(out_ports.back()), - "Incorrect Loop by Brgemm dimension N"); - N = current_expanded_loop_info->get_increment(); - input_pds[1]->set_subtensor_dim(0, N); - output_pds[0]->set_subtensor_dim(0, N); - } - - /* ------- Dimension K ----------*/ - // 1. If Brgemm block processes full dimension K -> `beta = 0` - // 2. If Brgemm block processes part of the dimension K (there is blocking), need to find - // the most first executed Brgemm Block in Loops which iterate through dimension K (work_amount > 0). 
- // First of them will have `beta = 0`, other - `beta = 1` - float beta = 0; - if (ov::snippets::utils::is_full_dim_value(K)) { - K = *in0_shape.rbegin(); - } else { - const auto& current_expanded_loop_info = get_loop_info(); - const auto& in_ports = current_expanded_loop_info->get_input_ports(); - const auto& out_ports = current_expanded_loop_info->get_output_ports(); - // Quick validation check: Should we check that port is really Brgemm port? - // Note: We check `is_incremented` attribute only for not incremented ports because - // this `is_incremented = true` can be changed by `CleanRepeatedDataPointerShifts` optimization - OPENVINO_ASSERT(in_ports.size() >= 2 && in_ports.front().dim_idx == 0 && in_ports.back().dim_idx == 1 && - out_ports.size() == 1 && !out_ports.front().is_incremented, - "Incorrect Loop by Brgemm dimension K"); - K = current_expanded_loop_info->get_increment(); - input_pds[0]->set_subtensor_dim(0, K); - input_pds[1]->set_subtensor_dim(1, K); - if (K > 0) - beta = get_beta(loop_manager, static_cast(loop_ids.back()), current_expanded_loop_info); - } - - const auto LDA = DIM_CAST(snippets::utils::get_dim_stride(expr->get_input_port(0))); - const auto LDC = DIM_CAST(snippets::utils::get_dim_stride(expr->get_output_port(0))); - auto LDB = DIM_CAST(snippets::utils::get_dim_stride(expr->get_input_port(1))); - const auto& brgemm_node = as_type_ptr(expr->get_node()); - OV_CPU_JIT_EMITTER_ASSERT(brgemm_node, "Got invalid node type in update_config"); - // In case of data repacking LDB is chosen in accordance with repacking buffer size - if (with_repacking(brgemm_node->get_type())) - LDB = brgemm_utils::repacking::compute_LDB(LDB, brgemm_node->get_input_element_type(1)); - - config.update(DIM_CAST(M), DIM_CAST(N), DIM_CAST(K), LDA, LDB, LDC, beta); + return BrgemmBaseKernelExecutor::update_config(expr, linear_ir, config); } void BrgemmKernelExecutor::execute(const BrgemmKernelExecutor* executor, call_args* args) { + OV_CPU_JIT_EMITTER_ASSERT(executor, "has nullptr executor"); auto kernel = executor->get_kernel(); const auto& config = static_cast(executor->get_config()); OV_CPU_JIT_EMITTER_ASSERT(kernel, "has nullptr compiler kernel or invalid config"); - const auto tile_config = args->amx_tile_config; - if (config.is_with_amx() && tile_config && !config.compatible(tile_config)) { - *tile_config = static_cast(config); - cpu::x64::amx_tile_configure(kernel->palette); - } - - cpu::x64::brgemm_kernel_params_t brgemm_p; // Note: compensations should be applied only once, so we do it only on the first iteration, when beta == 0 - size_t is_with_comp = config.get_beta() == 0 && config.is_with_comp(); - - brgemm_p.batch = nullptr; // default value - brgemm_p.ptr_A = args->A; - brgemm_p.ptr_B = args->B; - brgemm_p.ptr_C = args->C; - brgemm_p.ptr_D = args->C; - brgemm_p.ptr_buf = args->scratch; - brgemm_p.ptr_bias = nullptr; - brgemm_p.do_post_ops = is_with_comp; - brgemm_p.do_apply_comp = is_with_comp; - brgemm_p.skip_accm = 0; - brgemm_p.BS = 1; // default value - OV_CPU_JIT_EMITTER_ASSERT(kernel->compiled_kernel, "has nullptr kernel"); - (*kernel->compiled_kernel)(&brgemm_p); + const auto is_with_comp = config.get_beta() == 0 && config.is_with_comp(); + execute_brgemm_kernel(kernel->brgemm_kernel, args->A, args->B, args->C, args->scratch, is_with_comp); } #ifdef SNIPPETS_DEBUG_CAPS -BrgemmKernelReferenceExecutor::BrgemmKernelReferenceExecutor(ov::intel_cpu::MultiCacheWeakPtr kernel_cache, BrgemmKernelConfig config) : - BrgemmKernelExecutor(std::move(kernel_cache), std::move(config)) { 
-} +BrgemmKernelReferenceExecutor::BrgemmKernelReferenceExecutor(ov::intel_cpu::MultiCacheWeakPtr kernel_cache, + BrgemmKernelConfig config) + : BrgemmKernelExecutor(std::move(kernel_cache), std::move(config)) {} std::shared_ptr BrgemmKernelReferenceExecutor::compile_kernel(const BrgemmKernelConfig& c) const { const auto& res = std::make_shared(); - res->compiled_kernel.reset(new brgemm_ref_kernel(c)); + res->brgemm_kernel.reset(new brgemm_ref_kernel(c)); return res; } brgemm_ref_kernel::brgemm_ref_kernel(BrgemmKernelConfig c) : m_config(std::move(c)) { - OV_CPU_JIT_EMITTER_ASSERT(!m_config.is_with_comp() && !m_config.is_with_amx(), - "brgemm_ref_kernel doesn't currently support compensations or amx"); - OV_CPU_JIT_EMITTER_ASSERT(m_config.get_dt_in0() == m_config.get_dt_in1() && - m_config.get_dt_in0() == dnnl_data_type_t::dnnl_f32, - "brgemm_ref_kernel currently supports only fp32 inputs"); + OV_CPU_JIT_EMITTER_ASSERT(!m_config.is_with_comp(), "brgemm_ref_kernel doesn't currently support compensations"); + OV_CPU_JIT_EMITTER_ASSERT( + m_config.get_dt_in0() == m_config.get_dt_in1() && m_config.get_dt_in0() == dnnl_data_type_t::dnnl_f32, + "brgemm_ref_kernel currently supports only fp32 inputs"); } void brgemm_ref_kernel::operator()(dnnl::impl::cpu::x64::brgemm_kernel_params_t* args) const { @@ -357,5 +127,5 @@ void brgemm_ref_kernel::operator()(dnnl::impl::cpu::x64::brgemm_kernel_params_t* } #endif -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/emitters/snippets/x64/kernel_executors/brgemm.hpp b/src/plugins/intel_cpu/src/emitters/snippets/x64/kernel_executors/brgemm.hpp index 2549580c1a176c..9cc17049c4d3ae 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/x64/kernel_executors/brgemm.hpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/x64/kernel_executors/brgemm.hpp @@ -1,96 +1,72 @@ -// Copyright (C) 2020-2023 Intel Corporation +// Copyright (C) 2020-2024 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // #pragma once -#include "emitters/plugin/x64/jit_emitter.hpp" -#include "emitters/snippets/jit_snippets_call_args.hpp" -#include "emitters/snippets/cpu_kernel_executor_table.hpp" -#include - -#include "snippets/lowered/loop_manager.hpp" -#include "snippets/lowered/loop_info.hpp" +#include "brgemm_base.hpp" namespace ov { namespace intel_cpu { -struct BrgemmKernelConfig : public snippets::KernelExecutorBase::GenericConfig { + +struct BrgemmKernelConfig : public BrgemmBaseKernelConfig { public: - BrgemmKernelConfig(const element::Type& in0_dtype, const element::Type& in1_dtype, - bool is_with_amx, bool is_with_comp, dnnl::impl::cpu::x64::cpu_isa_t primitive_isa); + BrgemmKernelConfig(const element::Type& in0_dtype, + const element::Type& in1_dtype, + bool is_with_comp, + dnnl::impl::cpu::x64::cpu_isa_t primitive_isa); BrgemmKernelConfig() = delete; - bool is_completed() const override; - size_t hash() const override { return m_hash; } - bool operator==(const BrgemmKernelConfig& rhs) const; - bool operator!=(const BrgemmKernelConfig& rhs) const {return !(*this == rhs);} - std::unique_ptr get_clone_ptr() const override { - return std::unique_ptr( new BrgemmKernelConfig(*this)); - } - void update(dnnl_dim_t M, dnnl_dim_t N, dnnl_dim_t K, dnnl_dim_t LDA, dnnl_dim_t LDB, dnnl_dim_t LDC, float beta); - bool is_empty() const; - - dnnl_data_type_t get_dt_in0() const { return m_static_params->dt_in0; } - dnnl_data_type_t get_dt_in1() const { return m_static_params->dt_in1; } - - 
dnnl::impl::cpu::x64::cpu_isa_t get_isa() const { return m_static_params->isa; } - bool is_with_amx() const {return m_static_params->is_with_amx; } - bool is_with_comp() const { return m_static_params->is_with_comp; } - float get_beta() const { return m_beta; } - - dnnl_dim_t get_M() const { return m_M; } - dnnl_dim_t get_N() const { return m_N; } - dnnl_dim_t get_K() const { return m_K; } - dnnl_dim_t get_LDA() const { return m_LDA; } - dnnl_dim_t get_LDB() const { return m_LDB; } - dnnl_dim_t get_LDC() const { return m_LDC; } - - explicit operator amx_tile_config_t() const; - inline bool compatible(amx_tile_config_t* rhs) const { - return rhs && rhs->M == m_M && rhs->N == m_N && rhs->K == m_K; + std::unique_ptr get_clone_ptr() const override { + return std::unique_ptr(new BrgemmKernelConfig(*this)); } -#ifdef SNIPPETS_DEBUG_CAPS - std::string to_string() const override; -#endif + bool is_with_comp() const { + return m_static_params->is_with_comp; + } private: - struct StaticParams { - StaticParams(const element::Type& in0_dtype, const element::Type& in1_dtype, - bool is_with_amx, bool is_with_comp, dnnl::impl::cpu::x64::cpu_isa_t primitive_isa); - const dnnl_data_type_t dt_in0 {dnnl_f32}, dt_in1 {dnnl_f32}; - const bool is_with_amx {false}; - const bool is_with_comp {false}; - const dnnl::impl::cpu::x64::cpu_isa_t isa {dnnl::impl::cpu::x64::isa_undef}; - const size_t hash {0}; + struct StaticParams : StaticBaseParams { + StaticParams(const element::Type& in0_dtype, + const element::Type& in1_dtype, + bool is_with_comp, + dnnl::impl::cpu::x64::cpu_isa_t primitive_isa); + + const bool is_with_comp{false}; + bool operator==(const StaticParams& rhs) const; - bool operator!=(const StaticParams& rhs) const { return !(*this == rhs); } + bool operator!=(const StaticParams& rhs) const { + return !(*this == rhs); + } #ifdef SNIPPETS_DEBUG_CAPS std::string to_string() const; #endif + private: + static size_t compute_hash(bool is_with_comp); }; - size_t compute_hash() const; - std::shared_ptr m_static_params; - dnnl_dim_t m_M {0}, m_N {0}, m_K {0}, m_LDA {0}, m_LDB {0}, m_LDC {0}; - float m_beta {0}; - size_t m_hash {SIZE_MAX}; + + std::shared_ptr get_static_params() const override { + return m_static_params; + } + + std::shared_ptr m_static_params{nullptr}; }; +// The `update_kernel` method verifies that a compiled kernel is not nullptr. +// However, the compiled kernel might be empty in cases if nothing is to be compiled (`Config.is_empty() == true`). +// To cover this case, we wrap the `brgemm_kernel_t` in the separate structure which may contain empty `brgemm_kernel_t` struct BrgemmCompiledKernel { - std::unique_ptr compiled_kernel = nullptr; - // Note: Palette is treated as a part of a kernel because it is initialized during the kernel compilation stage. - // Each kernel need to store the pallet it was compiled with. 
- char palette[64] = {}; + std::shared_ptr brgemm_kernel = nullptr; }; -class BrgemmKernelExecutor : public CPUKernelExecutor { +class BrgemmKernelExecutor : public BrgemmBaseKernelExecutor, + public CPUKernelExecutor { public: struct call_args { const void* A = nullptr; const void* B = nullptr; void* C = nullptr; void* scratch = nullptr; - amx_tile_config_t* amx_tile_config = nullptr; }; BrgemmKernelExecutor(ov::intel_cpu::MultiCacheWeakPtr kernel_cache, BrgemmKernelConfig config); @@ -99,12 +75,10 @@ class BrgemmKernelExecutor : public CPUKernelExecutor compile_kernel(const BrgemmKernelConfig& c) const override; + void update_config(const ov::snippets::lowered::ExpressionPtr& expr, const ov::snippets::lowered::LinearIRCPtr& linear_ir, BrgemmKernelConfig& config) const override; - - static float get_beta(const ov::snippets::lowered::LoopManagerPtr& loop_manager, int loop_id, - const ov::snippets::lowered::ExpandedLoopInfoPtr& current_expanded_loop_info); }; #define GET_OFF_BRGEMM_ARGS(field) offsetof(BrgemmKernelExecutor::call_args, field) @@ -113,20 +87,25 @@ class BrgemmKernelReferenceExecutor : public BrgemmKernelExecutor { public: BrgemmKernelReferenceExecutor(ov::intel_cpu::MultiCacheWeakPtr kernel_cache, BrgemmKernelConfig config); using BrgemmKernelExecutor::execute; + protected: std::shared_ptr compile_kernel(const BrgemmKernelConfig& c) const override; }; + struct brgemm_ref_kernel : public dnnl::impl::cpu::x64::brgemm_kernel_t { brgemm_ref_kernel(BrgemmKernelConfig c); - void operator()(dnnl::impl::cpu::x64::brgemm_kernel_params_t *) const override; - dnnl_status_t create_kernel() override { return dnnl_status_t::dnnl_success; } - const dnnl::impl::cpu::x64::jit_generator *get_jit_generator() const override { + void operator()(dnnl::impl::cpu::x64::brgemm_kernel_params_t*) const override; + dnnl_status_t create_kernel() override { + return dnnl_status_t::dnnl_success; + } + const dnnl::impl::cpu::x64::jit_generator* get_jit_generator() const override { OV_CPU_JIT_EMITTER_THROW("get_jit_generator should not be called for reference kernel"); return nullptr; } + private: BrgemmKernelConfig m_config; }; #endif -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/emitters/snippets/x64/kernel_executors/brgemm_amx.cpp b/src/plugins/intel_cpu/src/emitters/snippets/x64/kernel_executors/brgemm_amx.cpp new file mode 100644 index 00000000000000..12c52d43b2c4b8 --- /dev/null +++ b/src/plugins/intel_cpu/src/emitters/snippets/x64/kernel_executors/brgemm_amx.cpp @@ -0,0 +1,317 @@ +// Copyright (C) 2020-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "brgemm_amx.hpp" + +#include + +#include "transformations/snippets/x64/op/brgemm_cpu.hpp" +#include "transformations/snippets/x64/op/brgemm_utils.hpp" + +#define INNER_K_BLK(dtype) static_cast((brgemm_utils::repacking::compute_inner_k_block(in0_dtype))) +#define VNNI_FACTOR(dtype) static_cast((brgemm_utils::compute_vnni_factor(in0_dtype))) +#define EQ(X) X == rhs.X +#define HASH(X) seed = hash_combine(seed, X) + +using namespace Xbyak; +using namespace dnnl::impl; +using namespace dnnl::impl::cpu::x64; + +namespace ov { +namespace intel_cpu { + +BrgemmAMXKernelConfig::BrgemmAMXKernelConfig(const element::Type& in0_dtype, + const element::Type& in1_dtype, + dnnl::impl::cpu::x64::cpu_isa_t primitive_isa) + : BrgemmBaseKernelConfig(), + m_static_params(std::make_shared(in0_dtype, in1_dtype, primitive_isa)) { + m_hash = compute_hash(); +} + 
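As a side note on the two static parameters cached by this config: the AMX executor splits the reduction dimension into a body that is a multiple of the inner K block plus a remainder tail, and the tail additionally forces repacking of A when it is not a multiple of the VNNI packing factor. The standalone sketch below (hypothetical helper names, not the brgemm_utils implementation) illustrates that decomposition, under the assumption that the VNNI factor is 2 for bf16 and 4 for int8 inputs:

#include <cstdint>

// Illustrative K decomposition mirroring the body/tail split used by the AMX kernel executor.
struct KSplitExample {
    int64_t K_body;    // processed by the main brgemm kernel
    int64_t K_tail;    // processed by a dedicated tail kernel
    bool need_copy_a;  // tail is not VNNI-aligned, so A must be repacked (zero-padded along K)
};

inline KSplitExample split_k_example(int64_t K, int64_t inner_k_blk, int64_t vnni_factor) {
    KSplitExample s{};
    s.K_tail = K % inner_k_blk;
    s.K_body = K - s.K_tail;
    s.need_copy_a = (s.K_tail % vnni_factor) > 0;  // e.g. K_tail = 5 with vnni_factor = 2 (bf16)
    return s;
}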
+BrgemmAMXKernelConfig::StaticParams::StaticParams(const element::Type& in0_dtype, + const element::Type& in1_dtype, + dnnl::impl::cpu::x64::cpu_isa_t primitive_isa) + : StaticBaseParams(in0_dtype, + in1_dtype, + primitive_isa, + compute_hash(INNER_K_BLK(in0_dtype), VNNI_FACTOR(in0_dtype))), + inner_k_blk(INNER_K_BLK(in0_dtype)), + vnni_factor(VNNI_FACTOR(in0_dtype)) {} + +bool BrgemmAMXKernelConfig::StaticParams::operator==(const StaticParams& rhs) const { + return StaticBaseParams::operator==(rhs) && EQ(inner_k_blk) && EQ(vnni_factor); +} + +size_t BrgemmAMXKernelConfig::StaticParams::compute_hash(dnnl_dim_t inner_k_blk, dnnl_dim_t vnni_factor) { + size_t seed = 0; + HASH(inner_k_blk); + HASH(vnni_factor); + return seed; +} + +bool BrgemmAMXKernelConfig::need_copy_a(dnnl_dim_t K) const { + return K % get_vnni_factor() > 0; +} + +#ifdef SNIPPETS_DEBUG_CAPS +std::string BrgemmAMXKernelConfig::StaticParams::to_string() const { + std::stringstream ss; + ss << StaticBaseParams::to_string(); + ss << "inner_k_blk = " << inner_k_blk << "\n"; + ss << "vnni_factor = " << vnni_factor << "\n"; + return ss.str(); +} +#endif + +BrgemmAMXKernelExecutor::BrgemmAMXKernelExecutor(ov::intel_cpu::MultiCacheWeakPtr kernel_cache, + BrgemmAMXKernelConfig config) + : CPUKernelExecutor(std::move(kernel_cache), std::move(config)) {} + +namespace { +struct BrgemmCopyAKey { + BrgemmCopyAKey(cpu_isa_t isa, + dnnl_data_type_t dt, + dnnl_dim_t K, + dnnl_dim_t K_blk, + dnnl_dim_t K_tail, + dnnl_dim_t src_stride, + dnnl_dim_t LDA) + : isa(isa), + dt(dt), + K{K}, + K_blk{K_blk}, + K_tail{K_tail}, + src_stride{src_stride}, + LDA{LDA} {} + + size_t hash() const { + size_t seed = 0; + HASH(isa); + HASH(dt); + HASH(K); + HASH(K_blk); + HASH(K_tail); + HASH(src_stride); + HASH(LDA); + return seed; + } + bool operator==(const BrgemmCopyAKey& rhs) const { + return EQ(isa) && EQ(dt) && EQ(K) && EQ(K_blk) && EQ(K_tail) && EQ(src_stride) && EQ(LDA); + } + + cpu_isa_t isa{cpu_isa_t::isa_undef}; + dnnl_data_type_t dt{dnnl_data_type_t::dnnl_data_type_undef}; + dnnl_dim_t K{0}, K_blk{0}, K_tail{0}, src_stride{0}, LDA{0}; +}; +} // namespace + +std::shared_ptr BrgemmAMXKernelExecutor::compile_kernel( + const BrgemmAMXKernelConfig& config) const { + std::shared_ptr compiled_kernel = std::make_shared(); + + // Brgemm is not executable - nothing to compile + if (config.is_empty()) + return compiled_kernel; + + const auto& cache = m_kernel_cache.lock(); + OPENVINO_ASSERT(cache, "Invalid kernel cache pointer in BrgemmAMXKernelExecutor::compile_kernel()"); + + auto brgemm_key = [&config](dnnl_dim_t K, dnnl_dim_t LDA, float beta) { + auto key = config; + key.update(config.get_M(), config.get_N(), K, LDA, config.get_LDB(), config.get_LDC(), beta); + return key; + }; + + auto brgemm_builder = [](const BrgemmAMXKernelConfig& k) { + std::shared_ptr ker = + std::make_shared(); + create_brgemm_kernel(ker->brgemm_kernel, + k.get_dt_in0(), + k.get_dt_in1(), + k.get_isa(), + k.get_M(), + k.get_N(), + k.get_K(), + k.get_LDA(), + k.get_LDB(), + k.get_LDC(), + k.get_beta(), + true, + ker->palette); + return ker; + }; + + auto brgemm_copy_a_builder = [](const BrgemmCopyAKey& k) { + std::shared_ptr ker{nullptr}; + create_brgemm_copy_a_kernel(ker, k.isa, k.dt, k.K, k.K_blk, k.K_tail, k.src_stride, k.LDA); + return ker; + }; + + auto K_tail = config.get_K() % config.get_inner_K_blk(); + auto K_body = config.get_K() - K_tail; + + float beta = config.get_beta(); + + // Brgemm Kernel for K_body + if (K_body != 0) { + const auto result = 
cache->getOrCreate(brgemm_key(K_body, config.get_LDA(), beta), brgemm_builder); + compiled_kernel->K_body_kernel = result.first; + beta = 1; + } + + // Brgemm Kernel for K_tail with BrgemmCopyA if needed + if (K_tail != 0) { + auto LDA = config.get_LDA(); + if (config.need_copy_a(K_tail)) { + const auto copy_A_src_stride = LDA * dnnl_data_type_size(config.get_dt_in0()); + K_tail = ov::snippets::utils::rnd_up(K_tail, config.get_vnni_factor()); + LDA = K_tail; + + const auto key = BrgemmCopyAKey(config.get_isa(), + config.get_dt_in0(), + config.get_K(), + config.get_inner_K_blk(), + K_tail, + copy_A_src_stride, + LDA); + const auto result = cache->getOrCreate(key, brgemm_copy_a_builder); + compiled_kernel->brgemm_copy_a_kernel = result.first; + } + + const auto result = cache->getOrCreate(brgemm_key(K_tail, LDA, beta), brgemm_builder); + compiled_kernel->K_tail_kernel = result.first; + } + + return compiled_kernel; +} + +void BrgemmAMXKernelExecutor::create_brgemm_copy_a_kernel( + std::shared_ptr& kernel, + dnnl::impl::cpu::x64::cpu_isa_t isa, + dnnl_data_type_t dt, + dnnl_dim_t K, + dnnl_dim_t K_blk, + dnnl_dim_t K_tail, + dnnl_dim_t src_stride, + dnnl_dim_t LDA) { + matmul::brgemm_matmul_conf_t conf_; + conf_.src_tag = dnnl_abcd; // unused + conf_.K = K; + conf_.K_tail = K_tail; + conf_.K_blk = K_blk; + conf_.use_buffer_a_tail_only = false; + conf_.LDA = LDA; + conf_.has_zero_point_b = false; + conf_.s8s8_compensation_required = false; + conf_.wei_zp_type = dnnl::impl::cpu::x64::none; + conf_.src_zp_type = dnnl::impl::cpu::x64::none; + conf_.src_dt = dt; + conf_.copy_A_src_stride = src_stride; + conf_.a_dt_sz = dnnl_data_type_size(conf_.src_dt); + // copied A has the same precision of original + conf_.tr_a_dt_sz = dnnl_data_type_size(conf_.src_dt); + conf_.transposed_A = false; + conf_.isa = isa; + + std::unique_ptr brgemm_matmul_copy_a = nullptr; + OV_CPU_JIT_EMITTER_ASSERT(create_brgemm_matmul_copy_a(brgemm_matmul_copy_a, &conf_) == dnnl_success, + "Cannot create brgemm copy a kernel due to invalid params"); + kernel = std::move(brgemm_matmul_copy_a); +} + +void BrgemmAMXKernelExecutor::update_config(const ov::snippets::lowered::ExpressionPtr& expr, + const ov::snippets::lowered::LinearIRCPtr& linear_ir, + BrgemmAMXKernelConfig& config) const { + return BrgemmBaseKernelExecutor::update_config(expr, linear_ir, config); +} + +void BrgemmAMXKernelExecutor::configure_tiles_if_needed(amx_tile_config_t* config, + const char* palette, + dnnl_dim_t M, + dnnl_dim_t N, + dnnl_dim_t K) { + auto compatible = [&](amx_tile_config_t* rhs) { + return rhs && rhs->M == M && rhs->N == N && rhs->K == K; + }; + if (config && !compatible(config)) { + config->M = M; + config->N = N; + config->K = K; + cpu::x64::amx_tile_configure(palette); + } +} + +void BrgemmAMXKernelExecutor::execute_brgemm_copy_a_kernel( + const std::shared_ptr& kernel, + const void* src, + const void* tr_src, + dnnl_dim_t M, + dnnl_dim_t K) { + auto ctx = matmul::jit_brgemm_matmul_copy_a_t::ctx_t(); + + ctx.current_M_blk = M; + ctx.zp_b_compensation_buffer_ptr = nullptr; + ctx.zp_a_compensation_result_ptr = nullptr; + ctx.zp_b_neg_value_ptr = nullptr; + ctx.zp_ab_comp_ptr = nullptr; + ctx.src = src; + ctx.tr_src = tr_src; + ctx.current_K_start = 0; + ctx.current_K_blk = K; + + OV_CPU_JIT_EMITTER_ASSERT(kernel, "has nullptr brgemm_copy_a_kernel"); + (*kernel)(&ctx); +} + +void BrgemmAMXKernelExecutor::execute(const BrgemmAMXKernelExecutor* executor, call_args* args) { + OV_CPU_JIT_EMITTER_ASSERT(executor, "has nullptr executor"); + auto 
kernel = executor->get_kernel(); + const auto& config = static_cast(executor->get_config()); + OV_CPU_JIT_EMITTER_ASSERT(kernel, "has nullptr compiler kernel or invalid config"); + + const auto* src_ptr = args->A; + const auto* wei_ptr = args->B; + auto* scratch = args->scratch; + + const auto K_tail = config.get_K() % config.get_inner_K_blk(); + const auto K_body = config.get_K() - K_tail; + + if (K_body != 0) { + const auto& K_body_kernel = kernel->K_body_kernel; + configure_tiles_if_needed(args->amx_tile_config, + K_body_kernel->palette, + config.get_M(), + config.get_N(), + K_body); + execute_brgemm_kernel(K_body_kernel->brgemm_kernel, src_ptr, wei_ptr, args->C, scratch, false); + + src_ptr = src_ptr + K_body * dnnl_data_type_size(config.get_dt_in0()); + wei_ptr = wei_ptr + (K_body * config.get_LDB()) * dnnl_data_type_size(config.get_dt_in1()); + } + + if (K_tail != 0) { + if (config.need_copy_a(K_tail)) { + auto* tr_src = scratch + BrgemmCPU::SCRATCH_BYTE_SIZE; + + execute_brgemm_copy_a_kernel(kernel->brgemm_copy_a_kernel, src_ptr, tr_src, config.get_M(), K_tail); + src_ptr = tr_src; + } + + const auto& K_tail_kernel = kernel->K_tail_kernel; + configure_tiles_if_needed(args->amx_tile_config, + K_tail_kernel->palette, + config.get_M(), + config.get_N(), + K_tail); + execute_brgemm_kernel(K_tail_kernel->brgemm_kernel, src_ptr, wei_ptr, args->C, scratch, false); + } +} + +#undef INNER_K_BLK +#undef VNNI_FACTOR +#undef EQ +#undef HASH + +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/emitters/snippets/x64/kernel_executors/brgemm_amx.hpp b/src/plugins/intel_cpu/src/emitters/snippets/x64/kernel_executors/brgemm_amx.hpp new file mode 100644 index 00000000000000..733295ec995583 --- /dev/null +++ b/src/plugins/intel_cpu/src/emitters/snippets/x64/kernel_executors/brgemm_amx.hpp @@ -0,0 +1,126 @@ +// Copyright (C) 2020-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include + +#include "brgemm_base.hpp" +#include "emitters/plugin/x64/jit_emitter.hpp" +#include "emitters/snippets/cpu_kernel_executor_table.hpp" +#include "emitters/snippets/jit_snippets_call_args.hpp" + +namespace ov { +namespace intel_cpu { + +struct BrgemmAMXKernelConfig : public BrgemmBaseKernelConfig { +public: + BrgemmAMXKernelConfig(const element::Type& in0_dtype, + const element::Type& in1_dtype, + dnnl::impl::cpu::x64::cpu_isa_t primitive_isa); + BrgemmAMXKernelConfig() = delete; + + std::unique_ptr get_clone_ptr() const override { + return std::unique_ptr(new BrgemmAMXKernelConfig(*this)); + } + + dnnl_dim_t get_inner_K_blk() const { + return m_static_params->inner_k_blk; + } + dnnl_dim_t get_vnni_factor() const { + return m_static_params->vnni_factor; + } + + bool need_copy_a(dnnl_dim_t K) const; + +private: + struct StaticParams : StaticBaseParams { + StaticParams(const element::Type& in0_dtype, + const element::Type& in1_dtype, + dnnl::impl::cpu::x64::cpu_isa_t primitive_isa); + + const dnnl_dim_t inner_k_blk{0}; + const dnnl_dim_t vnni_factor{0}; + + bool operator==(const StaticParams& rhs) const; + bool operator!=(const StaticParams& rhs) const { + return !(*this == rhs); + } +#ifdef SNIPPETS_DEBUG_CAPS + std::string to_string() const; +#endif + private: + static size_t compute_hash(dnnl_dim_t inner_k_blk, dnnl_dim_t vnni_factor); + }; + + std::shared_ptr get_static_params() const override { + return m_static_params; + } + + std::shared_ptr m_static_params{nullptr}; +}; + +struct BrgemmAMXCompiledKernel { + struct BrgemmKernel { + 
std::shared_ptr brgemm_kernel{nullptr}; + // Note: Palette is treated as a part of a kernel because it is initialized during the kernel compilation stage. + // Each kernel need to store the pallet it was compiled with. + char palette[64] = {}; + }; + + std::shared_ptr K_body_kernel{nullptr}; + std::shared_ptr K_tail_kernel{nullptr}; + std::shared_ptr brgemm_copy_a_kernel{nullptr}; +}; + +class BrgemmAMXKernelExecutor : public BrgemmBaseKernelExecutor, + public CPUKernelExecutor { +public: + struct call_args { + const uint8_t* A = nullptr; + const uint8_t* B = nullptr; + void* C = nullptr; + uint8_t* scratch = nullptr; + amx_tile_config_t* amx_tile_config = nullptr; + }; + BrgemmAMXKernelExecutor(ov::intel_cpu::MultiCacheWeakPtr kernel_cache, BrgemmAMXKernelConfig config); + + /** Function that will be called in runtime to execute the kernel */ + static void execute(const BrgemmAMXKernelExecutor* executor, call_args* args); + +protected: + std::shared_ptr compile_kernel(const BrgemmAMXKernelConfig& c) const override; + + void update_config(const ov::snippets::lowered::ExpressionPtr& expr, + const ov::snippets::lowered::LinearIRCPtr& linear_ir, + BrgemmAMXKernelConfig& config) const override; + + static void configure_tiles_if_needed(amx_tile_config_t* config, + const char* palette, + dnnl_dim_t M, + dnnl_dim_t N, + dnnl_dim_t K); + + static void create_brgemm_copy_a_kernel( + std::shared_ptr& kernel, + dnnl::impl::cpu::x64::cpu_isa_t isa, + dnnl_data_type_t dt, + dnnl_dim_t K, + dnnl_dim_t K_blk, + dnnl_dim_t K_tail, + dnnl_dim_t src_stride, + dnnl_dim_t LDA); + + static void execute_brgemm_copy_a_kernel( + const std::shared_ptr& kernel, + const void* src, + const void* tr_src, + dnnl_dim_t M, + dnnl_dim_t K); +}; +#define GET_OFF_BRGEMM_AMX_ARGS(field) offsetof(BrgemmAMXKernelExecutor::call_args, field) + +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/emitters/snippets/x64/kernel_executors/brgemm_base.cpp b/src/plugins/intel_cpu/src/emitters/snippets/x64/kernel_executors/brgemm_base.cpp new file mode 100644 index 00000000000000..008237780de3f6 --- /dev/null +++ b/src/plugins/intel_cpu/src/emitters/snippets/x64/kernel_executors/brgemm_base.cpp @@ -0,0 +1,342 @@ +// Copyright (C) 2020-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "brgemm_base.hpp" + +#include "common/utils.hpp" +#include "dnnl_extension_utils.h" +#include "transformations/snippets/x64/op/brgemm_cpu.hpp" +#include "transformations/snippets/x64/op/brgemm_utils.hpp" + +#define DIM_CAST(X) static_cast(X) +#define DTYPE_CAST(X) static_cast(DnnlExtensionUtils::ElementTypeToDataType(X)) +#define PRINT(X) ss << #X << " = " << X << "\n" +#define EQ(X) X == rhs.X +#define HASH(X) seed = hash_combine(seed, X) + +using namespace Xbyak; +using namespace dnnl::impl; +using namespace dnnl::impl::cpu::x64; + +namespace ov { +namespace intel_cpu { + +bool BrgemmBaseKernelConfig::is_completed() const { + return !utils::one_of(0, m_M, m_N, m_K, m_LDA, m_LDB, m_LDC) || is_empty(); +} + +bool BrgemmBaseKernelConfig::is_empty() const { + return everyone_is(0, m_M, m_N, m_K, m_LDA, m_LDB, m_LDC, m_beta); +} + +bool BrgemmBaseKernelConfig::operator==(const BrgemmBaseKernelConfig& rhs) const { + return EQ(m_hash) && EQ(m_beta) && EQ(m_M) && EQ(m_N) && EQ(m_K) && EQ(m_LDA) && EQ(m_LDB) && EQ(m_LDC) && + (EQ(get_static_params()) || *get_static_params() == *(rhs.get_static_params())); +} + +void BrgemmBaseKernelConfig::update(dnnl_dim_t M, + dnnl_dim_t N, + dnnl_dim_t K, + dnnl_dim_t 
LDA, + dnnl_dim_t LDB, + dnnl_dim_t LDC, + float beta) { + // If M is zero, it means that Brgemm won't be executed (in Loop with work_amount = 0, for example) + // To process this case, we have to make this Config as empty (nullify runtime parameters) + if (utils::one_of(0, M, N, K)) { + m_M = 0; + m_N = 0; + m_K = 0; + m_LDA = 0; + m_LDB = 0; + m_LDC = 0; + m_beta = 0; + } else { + m_M = M; + m_N = N; + m_K = K; + m_LDA = LDA; + m_LDB = LDB; + m_LDC = LDC; + m_beta = beta; + } + m_hash = compute_hash(); +} + +size_t BrgemmBaseKernelConfig::compute_hash() const { + size_t seed = get_static_params()->hash(); + HASH(m_M); + HASH(m_N); + HASH(m_K); + HASH(m_LDA); + HASH(m_LDB); + HASH(m_LDC); + HASH(m_beta); + return seed; +} + +BrgemmBaseKernelConfig::StaticBaseParams::StaticBaseParams(const element::Type& in0_dtype, + const element::Type& in1_dtype, + cpu_isa_t primitive_isa, + size_t hash_seed) + : dt_in0(DTYPE_CAST(in0_dtype)), + dt_in1(DTYPE_CAST(in1_dtype)), + isa(primitive_isa), + m_hash(compute_hash(hash_seed, dt_in0, dt_in1, isa)) {} + +bool BrgemmBaseKernelConfig::StaticBaseParams::operator==(const StaticBaseParams& rhs) const { + return EQ(hash()) && EQ(dt_in0) && EQ(dt_in1) && EQ(isa); +} + +size_t BrgemmBaseKernelConfig::StaticBaseParams::compute_hash(size_t hash_seed, + dnnl_data_type_t dt_in0, + dnnl_data_type_t dt_in1, + cpu_isa_t isa) { + size_t seed = hash_seed; + HASH(dt_in0); + HASH(dt_in1); + HASH(isa); + return seed; +} + +#ifdef SNIPPETS_DEBUG_CAPS +std::string BrgemmBaseKernelConfig::StaticBaseParams::to_string() const { + std::stringstream ss; + PRINT(dt_in0); + PRINT(dt_in1); + PRINT(isa); + return ss.str(); +} + +std::string BrgemmBaseKernelConfig::to_string() const { + std::stringstream ss; + ss << get_static_params()->to_string() << "\n"; + PRINT(m_M); + PRINT(m_N); + PRINT(m_K); + PRINT(m_LDA); + PRINT(m_LDB); + PRINT(m_LDC); + PRINT(m_beta); + return ss.str(); +} +#endif + +float BrgemmBaseKernelExecutor::get_beta(const ov::snippets::lowered::LoopManagerPtr& loop_manager, + int loop_id, + const ov::snippets::lowered::ExpandedLoopInfoPtr& current_expanded_loop_info) { + // Find all Expanded loops with the same Unified loop information -> they were decomposed from this Unified Loop. + // Note that LoopInfo are normalized and sorted (due to NormalizedLoopIDs pass). + // It means that previous executed Loops have Loop ID less the current Loop ID. + // - If there is executed Loop (work_amount > 0) and evaluated before the current -> the current Brgemm should have + // `beta = 1`. + // - If there is not this Loop -> the current executed Brgemm should have `beta = 0`. 
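(As an aside to the comment above: the beta convention can be checked against a plain reference GEMM. The sketch below is an illustration only, with hypothetical names and row-major layouts assumed, and is not the JIT kernel: the first K-block that actually executes must overwrite C with beta = 0, and every later block must accumulate with beta = 1.)

#include <algorithm>
#include <cstdint>

// Reference semantics of the beta flag for a K-blocked GEMM: C = beta * C + A_blk * B_blk.
void gemm_k_blocked_reference(const float* A, const float* B, float* C,
                              int64_t M, int64_t N, int64_t K, int64_t K_blk) {
    for (int64_t k0 = 0; k0 < K; k0 += K_blk) {
        const float beta = (k0 == 0) ? 0.f : 1.f;  // first executed K-block initializes C
        const int64_t k_len = std::min(K_blk, K - k0);
        for (int64_t m = 0; m < M; ++m) {
            for (int64_t n = 0; n < N; ++n) {
                float acc = (beta == 0.f) ? 0.f : C[m * N + n];  // beta = 0 means "do not read C"
                for (int64_t k = k0; k < k0 + k_len; ++k)
                    acc += A[m * K + k] * B[k * N + n];
                C[m * N + n] = acc;
            }
        }
    }
}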
+ if (loop_id > 0) { + const auto& current_unified_loop_info = current_expanded_loop_info->get_unified_loop_info(); + // Check the previous Loops + --loop_id; + while (loop_id >= 0) { + const auto& expanded_loop_info = + loop_manager->get_loop_info(loop_id); + if (expanded_loop_info->get_unified_loop_info() != current_unified_loop_info) + return 0; + if (expanded_loop_info->get_work_amount() > 0) { + // there is previous executed Brgemm with `beta = 0` -> the current Brgemm should have `beta = 1` + return 1; + } + --loop_id; + } + } + return 0; +} + +void BrgemmBaseKernelExecutor::update_config(const ov::snippets::lowered::ExpressionPtr& expr, + const ov::snippets::lowered::LinearIRCPtr& linear_ir, + BrgemmBaseKernelConfig& config) { + const auto& input_pds = expr->get_input_port_descriptors(); + const auto& output_pds = expr->get_output_port_descriptors(); + OV_CPU_JIT_EMITTER_ASSERT((input_pds.size() == 2 || input_pds.size() == 3) && output_pds.size() == 1, + "Invalid number of in/out port descriptors"); + + const auto in0_shape = snippets::utils::get_planar_vdims(input_pds[0]->get_shape(), input_pds[0]->get_layout()); + const auto in1_shape = snippets::utils::get_planar_vdims(input_pds[1]->get_shape(), input_pds[1]->get_layout()); + auto in0_subtensor = input_pds[0]->get_subtensor(); + auto in1_subtensor = input_pds[1]->get_subtensor(); + + // Need to update M, K, N + // 1. If the original value in subtensor is `FULL_DIM`, it means that + // Brgemm block should process full tensor by this dim -> take dimension from shape + // 2. Otherwise, Brgemm block processes part of the tensor by this dim + // (there is blocking by this dimension) -> take from Loop increment + + auto M = *++in0_subtensor.rbegin(); + auto K = *in0_subtensor.rbegin(); + auto N = *in1_subtensor.rbegin(); + + size_t loop_idx = 0; + const auto& loop_ids = expr->get_loop_ids(); + const auto& loop_manager = linear_ir->get_loop_manager(); + auto get_loop_info = [&]() { + OPENVINO_ASSERT(loop_idx < loop_ids.size(), "Loop is missed"); + return loop_manager->get_loop_info(loop_ids[loop_idx++]); + }; + + /* ------- Dimension M ----------*/ + if (ov::snippets::utils::is_full_dim_value(M)) { + M = *++in0_shape.rbegin(); + } else { + const auto& current_expanded_loop_info = get_loop_info(); + const auto& in_ports = current_expanded_loop_info->get_input_ports(); + const auto& out_ports = current_expanded_loop_info->get_output_ports(); + // Quick validation check: Should we check that port is really Brgemm port? + // If BrgemmCopyB in the Loop by M -> first input port will be BrgemmCopyB with `incremented=false` + // to avoid extra checks, we validate only first input port + // Note: We check `is_incremented` attribute only for not incremented ports because + // this `is_incremented = true` can be changed by `CleanRepeatedDataPointerShifts` optimization + auto check_port = [&](const ov::snippets::lowered::LoopPort& p) { + return p.dim_idx == 1; + }; + OPENVINO_ASSERT(in_ports.size() > 1 && std::all_of(in_ports.cbegin(), in_ports.cend(), check_port) && + out_ports.size() == 1 && check_port(out_ports.back()), + "Incorrect Loop by Brgemm dimension M"); + M = current_expanded_loop_info->get_work_amount() > 0 ? 
current_expanded_loop_info->get_increment() : 0; + input_pds[0]->set_subtensor_dim(1, M); + output_pds[0]->set_subtensor_dim(1, M); + } + + /* ------- Dimension N ----------*/ + if (ov::snippets::utils::is_full_dim_value(N)) { + N = *in1_shape.rbegin(); + } else { + const auto& current_expanded_loop_info = get_loop_info(); + const auto& in_ports = current_expanded_loop_info->get_input_ports(); + const auto& out_ports = current_expanded_loop_info->get_output_ports(); + // Quick validation check: Should we check that port is really Brgemm port? + // Note: We check `is_incremented` attribute only for not incremented ports because + // this `is_incremented = true` can be changed by `CleanRepeatedDataPointerShifts` optimization + auto check_port = [&](const ov::snippets::lowered::LoopPort& p) { + return p.dim_idx == 0; + }; + OPENVINO_ASSERT(in_ports.size() >= 2 && !in_ports.front().is_incremented && + std::all_of(in_ports.cbegin(), in_ports.cend(), check_port) && out_ports.size() == 1 && + check_port(out_ports.back()), + "Incorrect Loop by Brgemm dimension N"); + N = current_expanded_loop_info->get_work_amount() > 0 ? current_expanded_loop_info->get_increment() : 0; + input_pds[1]->set_subtensor_dim(0, N); + output_pds[0]->set_subtensor_dim(0, N); + } + + /* ------- Dimension K ----------*/ + // 1. If Brgemm block processes full dimension K -> `beta = 0` + // 2. If Brgemm block processes part of the dimension K (there is blocking), need to find + // the most first executed Brgemm Block in Loops which iterate through dimension K (work_amount > 0). + // First of them will have `beta = 0`, other - `beta = 1` + float beta = 0; + if (ov::snippets::utils::is_full_dim_value(K)) { + K = *in0_shape.rbegin(); + } else { + const auto& current_expanded_loop_info = get_loop_info(); + const auto& in_ports = current_expanded_loop_info->get_input_ports(); + const auto& out_ports = current_expanded_loop_info->get_output_ports(); + // Quick validation check: Should we check that port is really Brgemm port? + // Note: We check `is_incremented` attribute only for not incremented ports because + // this `is_incremented = true` can be changed by `CleanRepeatedDataPointerShifts` optimization + OPENVINO_ASSERT(in_ports.size() >= 2 && in_ports.front().dim_idx == 0 && in_ports.back().dim_idx == 1 && + out_ports.size() == 1 && !out_ports.front().is_incremented, + "Incorrect Loop by Brgemm dimension K"); + K = current_expanded_loop_info->get_work_amount() > 0 ? 
current_expanded_loop_info->get_increment() : 0; + input_pds[0]->set_subtensor_dim(0, K); + input_pds[1]->set_subtensor_dim(1, K); + if (K > 0) + beta = get_beta(loop_manager, static_cast(loop_ids.back()), current_expanded_loop_info); + } + + const auto LDA = DIM_CAST(snippets::utils::get_dim_stride(expr->get_input_port(0))); + const auto LDC = DIM_CAST(snippets::utils::get_dim_stride(expr->get_output_port(0))); + auto LDB = DIM_CAST(snippets::utils::get_dim_stride(expr->get_input_port(1))); + + const auto& brgemm_node = as_type_ptr(expr->get_node()); + OV_CPU_JIT_EMITTER_ASSERT(brgemm_node, "Got invalid node type in update_config"); + // In case of data repacking LDB is chosen in accordance with repacking buffer size + if (with_repacking(brgemm_node->get_type())) + LDB = DIM_CAST(brgemm_utils::repacking::compute_LDB(LDB, brgemm_node->get_input_element_type(1))); + + config.update(DIM_CAST(M), DIM_CAST(N), DIM_CAST(K), LDA, LDB, LDC, beta); +} + +void BrgemmBaseKernelExecutor::create_brgemm_kernel(std::shared_ptr& kernel, + dnnl_data_type_t dt0, + dnnl_data_type_t dt1, + cpu_isa_t isa, + dnnl_dim_t M, + dnnl_dim_t N, + dnnl_dim_t K, + dnnl_dim_t LDA, + dnnl_dim_t LDB, + dnnl_dim_t LDC, + float beta, + bool with_amx, + char* palette) { + cpu::x64::brgemm_desc_t desc; + OV_CPU_JIT_EMITTER_ASSERT(brgemm_desc_init(&desc, + isa, + cpu::x64::brgemm_strd, + dt0, + dt1, + false, + false, + cpu::x64::brgemm_row_major, + 1.f, + beta, + LDA, + LDB, + LDC, + M, + N, + K, + nullptr) == dnnl_success, + "Cannot initialize brgemm descriptor due to invalid params"); + + if (with_amx) { + OV_CPU_JIT_EMITTER_ASSERT(palette && brgemm_init_tiles(desc, palette) == dnnl_success, + "Cannot initialize brgemm tiles due to invalid params"); + } + + cpu::x64::brgemm_kernel_t* kernel_ = nullptr; + OV_CPU_JIT_EMITTER_ASSERT(brgemm_kernel_create(&kernel_, desc) == dnnl_success, + "Cannot create brgemm kernel due to invalid params"); + kernel = std::unique_ptr(kernel_); +} + +void BrgemmBaseKernelExecutor::execute_brgemm_kernel( + const std::shared_ptr& kernel, + const void* src, + const void* wei, + void* dst, + void* scratch, + bool with_comp) { + cpu::x64::brgemm_kernel_params_t brgemm_p; + brgemm_p.batch = nullptr; // default value + brgemm_p.ptr_A = src; + brgemm_p.ptr_B = wei; + brgemm_p.ptr_C = dst; + brgemm_p.ptr_D = dst; + brgemm_p.ptr_buf = scratch; + brgemm_p.ptr_bias = nullptr; + brgemm_p.do_post_ops = with_comp; + brgemm_p.do_apply_comp = with_comp; + brgemm_p.skip_accm = 0; + brgemm_p.BS = 1; // default value + OV_CPU_JIT_EMITTER_ASSERT(kernel, "has nullptr Brgemm kernel"); + (*kernel)(&brgemm_p); +} + +#undef DIM_CAST +#undef DTYPE_CAST +#undef PRINT +#undef EQ +#undef HASH + +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/emitters/snippets/x64/kernel_executors/brgemm_base.hpp b/src/plugins/intel_cpu/src/emitters/snippets/x64/kernel_executors/brgemm_base.hpp new file mode 100644 index 00000000000000..674ea42522230b --- /dev/null +++ b/src/plugins/intel_cpu/src/emitters/snippets/x64/kernel_executors/brgemm_base.hpp @@ -0,0 +1,150 @@ +// Copyright (C) 2020-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include + +#include "cpu/x64/cpu_isa_traits.hpp" +#include "emitters/plugin/x64/jit_emitter.hpp" +#include "emitters/snippets/cpu_kernel_executor_table.hpp" +#include "emitters/snippets/jit_snippets_call_args.hpp" +#include "openvino/core/type/element_type.hpp" +#include "snippets/lowered/loop_info.hpp" +#include 
"snippets/lowered/loop_manager.hpp" + +namespace ov { +namespace intel_cpu { + +struct BrgemmBaseKernelConfig : public snippets::KernelExecutorBase::GenericConfig { +public: + BrgemmBaseKernelConfig() = default; + + bool is_completed() const override; + size_t hash() const override { + return m_hash; + } + + bool is_empty() const; + void update(dnnl_dim_t M, dnnl_dim_t N, dnnl_dim_t K, dnnl_dim_t LDA, dnnl_dim_t LDB, dnnl_dim_t LDC, float beta); + + bool operator==(const BrgemmBaseKernelConfig& rhs) const; + bool operator!=(const BrgemmBaseKernelConfig& rhs) const { + return !(*this == rhs); + } + + dnnl_data_type_t get_dt_in0() const { + return get_static_params()->dt_in0; + } + dnnl_data_type_t get_dt_in1() const { + return get_static_params()->dt_in1; + } + + dnnl::impl::cpu::x64::cpu_isa_t get_isa() const { + return get_static_params()->isa; + } + float get_beta() const { + return m_beta; + } + + dnnl_dim_t get_M() const { + return m_M; + } + dnnl_dim_t get_N() const { + return m_N; + } + dnnl_dim_t get_K() const { + return m_K; + } + + dnnl_dim_t get_LDA() const { + return m_LDA; + } + dnnl_dim_t get_LDB() const { + return m_LDB; + } + dnnl_dim_t get_LDC() const { + return m_LDC; + } + +#ifdef SNIPPETS_DEBUG_CAPS + std::string to_string() const override; +#endif + +protected: + struct StaticBaseParams { + StaticBaseParams(const element::Type& in0_dtype, + const element::Type& in1_dtype, + dnnl::impl::cpu::x64::cpu_isa_t primitive_isa, + size_t hash_seed); + virtual ~StaticBaseParams() = default; + + const dnnl_data_type_t dt_in0{dnnl_f32}, dt_in1{dnnl_f32}; + const dnnl::impl::cpu::x64::cpu_isa_t isa{dnnl::impl::cpu::x64::isa_undef}; + + size_t hash() const { + return m_hash; + } + + bool operator==(const StaticBaseParams& rhs) const; + bool operator!=(const StaticBaseParams& rhs) const { + return !(*this == rhs); + } +#ifdef SNIPPETS_DEBUG_CAPS + std::string to_string() const; +#endif + protected: + static size_t compute_hash(size_t hash_seed, + dnnl_data_type_t dt_in0, + dnnl_data_type_t dt_in1, + dnnl::impl::cpu::x64::cpu_isa_t isa); + + const size_t m_hash{0}; + }; + + virtual std::shared_ptr get_static_params() const = 0; + size_t compute_hash() const; + + dnnl_dim_t m_M{0}, m_N{0}, m_K{0}, m_LDA{0}, m_LDB{0}, m_LDC{0}; + float m_beta{0}; + size_t m_hash{SIZE_MAX}; +}; + +class BrgemmBaseKernelExecutor { +public: + virtual ~BrgemmBaseKernelExecutor() = default; + +protected: + static float get_beta(const ov::snippets::lowered::LoopManagerPtr& loop_manager, + int loop_id, + const ov::snippets::lowered::ExpandedLoopInfoPtr& current_expanded_loop_info); + + static void update_config(const ov::snippets::lowered::ExpressionPtr& expr, + const ov::snippets::lowered::LinearIRCPtr& linear_ir, + BrgemmBaseKernelConfig& config); + + static void create_brgemm_kernel(std::shared_ptr& kernel, + dnnl_data_type_t dt0, + dnnl_data_type_t dt1, + dnnl::impl::cpu::x64::cpu_isa_t isa, + dnnl_dim_t M, + dnnl_dim_t N, + dnnl_dim_t K, + dnnl_dim_t LDA, + dnnl_dim_t LDB, + dnnl_dim_t LDC, + float beta, + bool with_amx = false, + char* palette = nullptr); + + static void execute_brgemm_kernel(const std::shared_ptr& kernel, + const void* src, + const void* wei, + void* dst, + void* scratch, + bool with_comp); +}; + +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/emitters/snippets/x64/kernel_executors/brgemm_copy_b.cpp b/src/plugins/intel_cpu/src/emitters/snippets/x64/kernel_executors/brgemm_copy_b.cpp index cc79458c7c4c64..dd216517ace12e 100644 --- 
a/src/plugins/intel_cpu/src/emitters/snippets/x64/kernel_executors/brgemm_copy_b.cpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/x64/kernel_executors/brgemm_copy_b.cpp @@ -4,8 +4,8 @@ #include "brgemm_copy_b.hpp" -#include "snippets/lowered/loop_manager.hpp" #include "emitters/plugin/x64/utils.hpp" +#include "snippets/lowered/loop_manager.hpp" #include "transformations/snippets/x64/op/brgemm_utils.hpp" #define DTYPE_CAST(X) static_cast(DnnlExtensionUtils::ElementTypeToDataType(X)) @@ -16,8 +16,12 @@ using namespace dnnl::impl::cpu::x64; namespace ov { namespace intel_cpu { -BrgemmCopyBKernelConfig::BrgemmCopyBKernelConfig(const element::Type& src_dt, const element::Type& wei_dt, cpu_isa_t isa, - bool is_with_comp, bool is_transposed_B, dnnl_dim_t wei_N_blk) +BrgemmCopyBKernelConfig::BrgemmCopyBKernelConfig(const element::Type& src_dt, + const element::Type& wei_dt, + cpu_isa_t isa, + bool is_with_comp, + bool is_transposed_B, + dnnl_dim_t wei_N_blk) : m_static_params(std::make_shared(src_dt, wei_dt, isa, is_with_comp, is_transposed_B, wei_N_blk)) { m_hash = compute_hash(); } @@ -37,17 +41,28 @@ bool BrgemmCopyBKernelConfig::operator==(const BrgemmCopyBKernelConfig& rhs) con #undef EQ } -void BrgemmCopyBKernelConfig::update(dnnl_dim_t N, dnnl_dim_t N_blk, dnnl_dim_t K, dnnl_dim_t K_blk, dnnl_dim_t copy_B_wei_stride, dnnl_dim_t LDB) { - // If one of the dims is zero, it means that BrgemmCopyB won't be executed (in Loop with work_amount = 0, for example) - // To process this case, we have to make this Config as empty (nullify runtime parameters) +void BrgemmCopyBKernelConfig::update(dnnl_dim_t N, + dnnl_dim_t N_blk, + dnnl_dim_t K, + dnnl_dim_t K_blk, + dnnl_dim_t copy_B_wei_stride, + dnnl_dim_t LDB) { + // If one of the dims is zero, it means that BrgemmCopyB won't be executed (in Loop with work_amount = 0, for + // example) To process this case, we have to make this Config as empty (nullify runtime parameters) if (utils::one_of(0, N, K)) { - m_N = 0; m_N_blk = 0; - m_K = 0; m_K_blk = 0; - m_copy_B_wei_stride = 0; m_LDB = 0; + m_N = 0; + m_N_blk = 0; + m_K = 0; + m_K_blk = 0; + m_copy_B_wei_stride = 0; + m_LDB = 0; } else { - m_N = N; m_N_blk = N_blk; - m_K = K; m_K_blk = K_blk; - m_copy_B_wei_stride = copy_B_wei_stride; m_LDB = LDB; + m_N = N; + m_N_blk = N_blk; + m_K = K; + m_K_blk = K_blk; + m_copy_B_wei_stride = copy_B_wei_stride; + m_LDB = LDB; } m_hash = compute_hash(); } @@ -55,59 +70,94 @@ void BrgemmCopyBKernelConfig::update(dnnl_dim_t N, dnnl_dim_t N_blk, dnnl_dim_t size_t BrgemmCopyBKernelConfig::compute_hash() const { size_t seed = m_static_params->hash; #define HASH(X) seed = hash_combine(seed, X) - HASH(m_N); HASH(m_N_blk); - HASH(m_K); HASH(m_K_blk); - HASH(m_copy_B_wei_stride); HASH(m_LDB); + HASH(m_N); + HASH(m_N_blk); + HASH(m_K); + HASH(m_K_blk); + HASH(m_copy_B_wei_stride); + HASH(m_LDB); #undef HASH return seed; } -BrgemmCopyBKernelConfig::StaticParams::StaticParams(const element::Type& src_type, const element::Type& wei_type, cpu_isa_t isa, - bool is_with_comp, bool is_transposed_B, dnnl_dim_t wei_n_blk) - : src_dt(DTYPE_CAST(src_type)), wei_dt(DTYPE_CAST(wei_type)), isa(isa), - is_with_comp(is_with_comp), is_transposed_B(is_transposed_B), wei_N_blk(wei_n_blk), +BrgemmCopyBKernelConfig::StaticParams::StaticParams(const element::Type& src_type, + const element::Type& wei_type, + cpu_isa_t isa, + bool is_with_comp, + bool is_transposed_B, + dnnl_dim_t wei_n_blk) + : src_dt(DTYPE_CAST(src_type)), + wei_dt(DTYPE_CAST(wei_type)), + isa(isa), + 
is_with_comp(is_with_comp), + is_transposed_B(is_transposed_B), + wei_N_blk(wei_n_blk), hash(init_hash(src_dt, wei_dt, isa, is_with_comp, is_transposed_B, wei_N_blk)) {} bool BrgemmCopyBKernelConfig::StaticParams::operator==(const StaticParams& rhs) const { #define EQ(X) X == rhs.X - return EQ(hash) && EQ(src_dt) && EQ(wei_dt)&& EQ(isa) && EQ(is_with_comp) && EQ(is_transposed_B) && EQ(wei_N_blk); + return EQ(hash) && EQ(src_dt) && EQ(wei_dt) && EQ(isa) && EQ(is_with_comp) && EQ(is_transposed_B) && EQ(wei_N_blk); #undef EQ } -size_t BrgemmCopyBKernelConfig::StaticParams::init_hash(const dnnl_data_type_t& src_dt, const dnnl_data_type_t& wei_dt, cpu_isa_t isa, - bool is_with_comp, bool is_transposed_B, dnnl_dim_t wei_N_blk) { +size_t BrgemmCopyBKernelConfig::StaticParams::init_hash(const dnnl_data_type_t& src_dt, + const dnnl_data_type_t& wei_dt, + cpu_isa_t isa, + bool is_with_comp, + bool is_transposed_B, + dnnl_dim_t wei_N_blk) { size_t seed = 0; #define HASH(X) seed = hash_combine(seed, X) - HASH(src_dt); HASH(wei_dt); HASH(isa); - HASH(is_with_comp); HASH(is_transposed_B); HASH(wei_N_blk); + HASH(src_dt); + HASH(wei_dt); + HASH(isa); + HASH(is_with_comp); + HASH(is_transposed_B); + HASH(wei_N_blk); #undef HASH return seed; } #ifdef SNIPPETS_DEBUG_CAPS -#define PRINT(X) ss << #X << " = " << X << "\n" +# define PRINT(X) ss << #X << " = " << X << "\n" std::string BrgemmCopyBKernelConfig::to_string() const { std::stringstream ss; ss << m_static_params->to_string() << "\n"; - PRINT(m_hash); PRINT(m_N); PRINT(m_N_blk); - PRINT(m_K); PRINT(m_K_blk); PRINT(m_LDB); PRINT(m_copy_B_wei_stride); + PRINT(m_hash); + PRINT(m_N); + PRINT(m_N_blk); + PRINT(m_K); + PRINT(m_K_blk); + PRINT(m_LDB); + PRINT(m_copy_B_wei_stride); return ss.str(); } std::string BrgemmCopyBKernelConfig::StaticParams::to_string() const { std::stringstream ss; - PRINT(src_dt); PRINT(wei_dt); PRINT(isa); - PRINT(is_with_comp); PRINT(is_transposed_B); PRINT(wei_N_blk); + PRINT(src_dt); + PRINT(wei_dt); + PRINT(isa); + PRINT(is_with_comp); + PRINT(is_transposed_B); + PRINT(wei_N_blk); return ss.str(); } -#undef PRINT +# undef PRINT #endif BrgemmCopyBKernel::BrgemmCopyBKernel() : jit_generator(jit_name()), ker_(nullptr) {} BrgemmCopyBKernel::BrgemmCopyBKernel(const BrgemmCopyBKernelConfig& conf) - : jit_generator(jit_name()), is_with_comp(conf.is_with_comp()), is_transpose(conf.is_transposed_B()), - wei_data_size(dnnl_data_type_size(conf.get_wei_dt())), vnni_factor(data_type_vnni_granularity(conf.get_wei_dt())), - K(conf.get_K()), N_blk(conf.get_N_blk()), wei_N_blk(conf.get_wei_N_blk()), wei_N_tail(conf.get_wei_N_tail()), ker_(nullptr) { + : jit_generator(jit_name()), + is_with_comp(conf.is_with_comp()), + is_transpose(conf.is_transposed_B()), + wei_data_size(dnnl_data_type_size(conf.get_wei_dt())), + vnni_factor(data_type_vnni_granularity(conf.get_wei_dt())), + K(conf.get_K()), + N_blk(conf.get_N_blk()), + wei_N_blk(conf.get_wei_N_blk()), + wei_N_tail(conf.get_wei_N_tail()), + ker_(nullptr) { init_brgemm_copy_b_kernel(dnnl_brgemm_copy_b_kernel, conf); OV_CPU_JIT_EMITTER_ASSERT(dnnl_brgemm_copy_b_kernel, "Kernel is missed!"); } @@ -124,8 +174,9 @@ void BrgemmCopyBKernel::operator()(const call_args* args) const { ker_(args); } -void BrgemmCopyBKernel::init_brgemm_copy_b_kernel(std::unique_ptr& kernel, - const BrgemmCopyBKernelConfig& conf) const { +void BrgemmCopyBKernel::init_brgemm_copy_b_kernel( + std::unique_ptr& kernel, + const BrgemmCopyBKernelConfig& conf) const { matmul::brgemm_matmul_conf_t brgCopyKernelConf; 
brgCopyKernelConf.src_dt = conf.get_src_dt(); brgCopyKernelConf.wei_dt = conf.get_wei_dt(); @@ -143,8 +194,10 @@ void BrgemmCopyBKernel::init_brgemm_copy_b_kernel(std::unique_ptr(brgCopyKernelConf.wei_dt)); - brgCopyKernelConf.tr_b_dt_sz = DnnlExtensionUtils::sizeOfDataType(static_cast(brgCopyKernelConf.wei_dt)); + brgCopyKernelConf.b_dt_sz = + DnnlExtensionUtils::sizeOfDataType(static_cast(brgCopyKernelConf.wei_dt)); + brgCopyKernelConf.tr_b_dt_sz = + DnnlExtensionUtils::sizeOfDataType(static_cast(brgCopyKernelConf.wei_dt)); brgCopyKernelConf.req_wei_vnni_downconvert = false; @@ -191,28 +244,35 @@ void BrgemmCopyBKernel::generate() { postamble(); } -void BrgemmCopyBKernel::emit_brgemm_copy_b_kernel_call(size_t N, size_t K, size_t offset_in, size_t offset_out, size_t offset_comp) { +void BrgemmCopyBKernel::emit_brgemm_copy_b_kernel_call(size_t N, + size_t K, + size_t offset_in, + size_t offset_out, + size_t offset_comp) { EmitABIRegSpills spill(this); spill.preamble(); const auto add_offset = [&](Xbyak::Reg64 reg, size_t bytes_offset) { - if (bytes_offset) add(reg, bytes_offset); + if (bytes_offset) + add(reg, bytes_offset); }; // save function address in gpr to pass in call instruction - const auto& kernel_overload = static_cast(execute); + const auto& kernel_overload = static_cast< + void (*)(matmul::jit_brgemm_matmul_copy_b_t*, const void*, const void*, const void*, size_t, size_t)>(execute); mov(rbp, reinterpret_cast(kernel_overload)); mov(abi_param1, reinterpret_cast(dnnl_brgemm_copy_b_kernel.get())); - add_offset(src_reg, offset_in); // abi_param2 - add_offset(tr_src_reg, offset_out); // abi_param3 - if (is_with_comp) // abi_param4 + add_offset(src_reg, offset_in); // abi_param2 + add_offset(tr_src_reg, offset_out); // abi_param3 + if (is_with_comp) // abi_param4 add_offset(comp_reg, offset_comp); else mov(comp_reg, reinterpret_cast(nullptr)); #ifdef _WIN32 - // Note: ABI requires that the remaining parameters (except the first for) are pushed to the stack in right-to-left order + // Note: ABI requires that the remaining parameters (except the first for) are pushed to the stack in right-to-left + // order // Shadow space will be allocated inside internal_call_rsp_align() push(K); push(N); @@ -233,7 +293,12 @@ void BrgemmCopyBKernel::emit_brgemm_copy_b_kernel_call(size_t N, size_t K, size_ spill.postamble(); } -void BrgemmCopyBKernel::execute(matmul::jit_brgemm_matmul_copy_b_t* kernel, const void* src, const void* dst, const void* comp, size_t N, size_t K) { +void BrgemmCopyBKernel::execute(matmul::jit_brgemm_matmul_copy_b_t* kernel, + const void* src, + const void* dst, + const void* comp, + size_t N, + size_t K) { auto ctx = matmul::jit_brgemm_matmul_copy_b_t::ctx_t(); ctx.current_N_blk = N; ctx.src = src; @@ -248,10 +313,12 @@ void BrgemmCopyBKernel::execute(matmul::jit_brgemm_matmul_copy_b_t* kernel, cons (*kernel)(&ctx); } -BrgemmCopyBKernelExecutor::BrgemmCopyBKernelExecutor(ov::intel_cpu::MultiCacheWeakPtr kernel_cache, BrgemmCopyBKernelConfig config) - : CPUKernelExecutor(std::move(kernel_cache), std::move(config)) { } +BrgemmCopyBKernelExecutor::BrgemmCopyBKernelExecutor(ov::intel_cpu::MultiCacheWeakPtr kernel_cache, + BrgemmCopyBKernelConfig config) + : CPUKernelExecutor(std::move(kernel_cache), std::move(config)) {} -std::shared_ptr BrgemmCopyBKernelExecutor::compile_kernel(const BrgemmCopyBKernelConfig& config) const { +std::shared_ptr BrgemmCopyBKernelExecutor::compile_kernel( + const BrgemmCopyBKernelConfig& config) const { std::shared_ptr compiled_kernel = 
std::make_shared(); // BrgemmCopyB is not executable - nothing to compile if (!config.is_empty()) { @@ -283,14 +350,16 @@ void BrgemmCopyBKernelExecutor::update_config(const ov::snippets::lowered::Expre const auto& loop_manager = linear_ir->get_loop_manager(); auto init = [&](size_t& dim, size_t& blk, size_t idx) { - OPENVINO_ASSERT(idx < planar_shape.size() && idx < in_subtensor.size(), "Index must be less than shape/subtensor rank!"); + OPENVINO_ASSERT(idx < planar_shape.size() && idx < in_subtensor.size(), + "Index must be less than shape/subtensor rank!"); dim = *(planar_shape.rbegin() + idx); blk = *(in_subtensor.rbegin() + idx); if (ov::snippets::utils::is_full_dim_value(blk)) { blk = dim; } else { OPENVINO_ASSERT(loop_idx < loop_ids.size(), "Loop is missed"); - const auto& current_expanded_loop_info = loop_manager->get_loop_info(loop_ids[loop_idx++]); + const auto& current_expanded_loop_info = + loop_manager->get_loop_info(loop_ids[loop_idx++]); blk = current_expanded_loop_info->get_increment(); input_desc->set_subtensor_dim(idx, blk); output_desc->set_subtensor_dim(idx, blk); @@ -306,7 +375,9 @@ void BrgemmCopyBKernelExecutor::update_config(const ov::snippets::lowered::Expre const auto& brg_weight_etype = expr->get_node()->get_input_element_type(0); const auto LDB = brgemm_utils::repacking::compute_LDB(N_dim, brg_weight_etype); - const auto copy_B_wei_stride = ov::snippets::utils::get_dim_stride(expr->get_input_port(0), config.is_transposed_B() ? 0 : 1) * brg_weight_etype.size(); + const auto copy_B_wei_stride = + ov::snippets::utils::get_dim_stride(expr->get_input_port(0), config.is_transposed_B() ? 0 : 1) * + brg_weight_etype.size(); config.update(N_dim, N_blk, K_dim, K_blk, copy_B_wei_stride, LDB); } @@ -318,5 +389,5 @@ void BrgemmCopyBKernelExecutor::execute(const BrgemmCopyBKernelExecutor* executo (*kernel)(args); } -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/emitters/snippets/x64/kernel_executors/brgemm_copy_b.hpp b/src/plugins/intel_cpu/src/emitters/snippets/x64/kernel_executors/brgemm_copy_b.hpp index c4e3f3622ad88f..b3b107cd676705 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/x64/kernel_executors/brgemm_copy_b.hpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/x64/kernel_executors/brgemm_copy_b.hpp @@ -4,13 +4,12 @@ #pragma once -#include "emitters/plugin/x64/jit_emitter.hpp" -#include "emitters/snippets/jit_snippets_call_args.hpp" -#include "emitters/snippets/cpu_kernel_executor_table.hpp" - #include #include +#include "emitters/plugin/x64/jit_emitter.hpp" +#include "emitters/snippets/cpu_kernel_executor_table.hpp" +#include "emitters/snippets/jit_snippets_call_args.hpp" namespace ov { namespace intel_cpu { @@ -18,11 +17,17 @@ namespace intel_cpu { struct BrgemmCopyBKernelConfig : public snippets::KernelExecutorBase::GenericConfig { public: BrgemmCopyBKernelConfig() = default; - BrgemmCopyBKernelConfig(const element::Type& src_dt, const element::Type& wei_dt, dnnl::impl::cpu::x64::cpu_isa_t isa, - bool is_with_comp, bool is_transposed_B, dnnl_dim_t wei_N_blk); + BrgemmCopyBKernelConfig(const element::Type& src_dt, + const element::Type& wei_dt, + dnnl::impl::cpu::x64::cpu_isa_t isa, + bool is_with_comp, + bool is_transposed_B, + dnnl_dim_t wei_N_blk); bool operator==(const BrgemmCopyBKernelConfig& rhs) const; - bool operator!=(const BrgemmCopyBKernelConfig& rhs) const {return !(*this == rhs);} + bool operator!=(const BrgemmCopyBKernelConfig& rhs) const { + return !(*this == 
rhs); + } std::unique_ptr get_clone_ptr() const override { return std::unique_ptr(new BrgemmCopyBKernelConfig(*this)); @@ -31,26 +36,61 @@ struct BrgemmCopyBKernelConfig : public snippets::KernelExecutorBase::GenericCon bool is_empty() const; bool is_completed() const override; - void update(dnnl_dim_t N, dnnl_dim_t N_blk, dnnl_dim_t K, dnnl_dim_t K_blk, dnnl_dim_t copy_B_wei_stride, dnnl_dim_t LDB); + void update(dnnl_dim_t N, + dnnl_dim_t N_blk, + dnnl_dim_t K, + dnnl_dim_t K_blk, + dnnl_dim_t copy_B_wei_stride, + dnnl_dim_t LDB); - size_t hash() const override { return m_hash; } + size_t hash() const override { + return m_hash; + } - dnnl_data_type_t get_src_dt() const { return m_static_params->src_dt; } - dnnl_data_type_t get_wei_dt() const { return m_static_params->wei_dt; } + dnnl_data_type_t get_src_dt() const { + return m_static_params->src_dt; + } + dnnl_data_type_t get_wei_dt() const { + return m_static_params->wei_dt; + } - dnnl::impl::cpu::x64::cpu_isa_t get_isa() const { return m_static_params->isa; } - bool is_with_comp() const { return m_static_params->is_with_comp; } - bool is_transposed_B() const { return m_static_params->is_transposed_B; } + dnnl::impl::cpu::x64::cpu_isa_t get_isa() const { + return m_static_params->isa; + } + bool is_with_comp() const { + return m_static_params->is_with_comp; + } + bool is_transposed_B() const { + return m_static_params->is_transposed_B; + } - dnnl_dim_t get_N() const { return m_N; } - dnnl_dim_t get_N_blk() const { return m_N_blk; } - dnnl_dim_t get_N_tail() const { return m_N % m_N_blk; } - dnnl_dim_t get_wei_N_blk() const { return m_static_params->wei_N_blk; } - dnnl_dim_t get_wei_N_tail() const { return m_N_blk % m_static_params->wei_N_blk; } - dnnl_dim_t get_K() const { return m_K; } - dnnl_dim_t get_K_blk() const { return m_K_blk; } - dnnl_dim_t get_copy_B_wei_stride() const { return m_copy_B_wei_stride; } - dnnl_dim_t get_LDB() const { return m_LDB; } + dnnl_dim_t get_N() const { + return m_N; + } + dnnl_dim_t get_N_blk() const { + return m_N_blk; + } + dnnl_dim_t get_N_tail() const { + return m_N % m_N_blk; + } + dnnl_dim_t get_wei_N_blk() const { + return m_static_params->wei_N_blk; + } + dnnl_dim_t get_wei_N_tail() const { + return m_N_blk % m_static_params->wei_N_blk; + } + dnnl_dim_t get_K() const { + return m_K; + } + dnnl_dim_t get_K_blk() const { + return m_K_blk; + } + dnnl_dim_t get_copy_B_wei_stride() const { + return m_copy_B_wei_stride; + } + dnnl_dim_t get_LDB() const { + return m_LDB; + } #ifdef SNIPPETS_DEBUG_CAPS std::string to_string() const override; @@ -58,35 +98,45 @@ struct BrgemmCopyBKernelConfig : public snippets::KernelExecutorBase::GenericCon private: struct StaticParams { - StaticParams(const element::Type& src_dt, const element::Type& wei_dt, dnnl::impl::cpu::x64::cpu_isa_t isa, - bool is_with_comp, bool is_transposed_B, dnnl_dim_t wei_N_blk); - - const dnnl_data_type_t src_dt {dnnl_data_type_undef}, wei_dt {dnnl_data_type_undef}; - const dnnl::impl::cpu::x64::cpu_isa_t isa {dnnl::impl::cpu::x64::isa_undef}; - const bool is_with_comp {false}; - const bool is_transposed_B {false}; - const dnnl_dim_t wei_N_blk {0}; - const size_t hash {0}; + StaticParams(const element::Type& src_dt, + const element::Type& wei_dt, + dnnl::impl::cpu::x64::cpu_isa_t isa, + bool is_with_comp, + bool is_transposed_B, + dnnl_dim_t wei_N_blk); + + const dnnl_data_type_t src_dt{dnnl_data_type_undef}, wei_dt{dnnl_data_type_undef}; + const dnnl::impl::cpu::x64::cpu_isa_t isa{dnnl::impl::cpu::x64::isa_undef}; + const bool 
is_with_comp{false}; + const bool is_transposed_B{false}; + const dnnl_dim_t wei_N_blk{0}; + const size_t hash{0}; bool operator==(const StaticParams& rhs) const; - bool operator!=(const StaticParams& rhs) const { return !(*this == rhs); } + bool operator!=(const StaticParams& rhs) const { + return !(*this == rhs); + } #ifdef SNIPPETS_DEBUG_CAPS std::string to_string() const; #endif private: - static size_t init_hash(const dnnl_data_type_t& src_dt, const dnnl_data_type_t& wei_dt, dnnl::impl::cpu::x64::cpu_isa_t primitive_isa, - bool is_with_comp, bool is_transposed_B, dnnl_dim_t wei_N_blk); + static size_t init_hash(const dnnl_data_type_t& src_dt, + const dnnl_data_type_t& wei_dt, + dnnl::impl::cpu::x64::cpu_isa_t primitive_isa, + bool is_with_comp, + bool is_transposed_B, + dnnl_dim_t wei_N_blk); }; size_t compute_hash() const; std::shared_ptr m_static_params; - dnnl_dim_t m_N {0}, m_N_blk {0}; - dnnl_dim_t m_K {0}, m_K_blk {0}; - dnnl_dim_t m_copy_B_wei_stride {0}, m_LDB {0}; - size_t m_hash {SIZE_MAX}; + dnnl_dim_t m_N{0}, m_N_blk{0}; + dnnl_dim_t m_K{0}, m_K_blk{0}; + dnnl_dim_t m_copy_B_wei_stride{0}, m_LDB{0}; + size_t m_hash{SIZE_MAX}; }; struct BrgemmCopyBKernel : public dnnl::impl::cpu::x64::jit_generator { @@ -109,8 +159,12 @@ struct BrgemmCopyBKernel : public dnnl::impl::cpu::x64::jit_generator { void emit_brgemm_copy_b_kernel_call(size_t N, size_t K, size_t offset_in, size_t offset_out, size_t offset_comp); - static void execute(dnnl::impl::cpu::x64::matmul::jit_brgemm_matmul_copy_b_t* kernel, const void* src, const void* dst, const void* comp, - size_t N, size_t K); + static void execute(dnnl::impl::cpu::x64::matmul::jit_brgemm_matmul_copy_b_t* kernel, + const void* src, + const void* dst, + const void* comp, + size_t N, + size_t K); void init_brgemm_copy_b_kernel(std::unique_ptr& kernel, const BrgemmCopyBKernelConfig& conf) const; @@ -151,5 +205,5 @@ class BrgemmCopyBKernelExecutor : public CPUKernelExecutor(); size_t id = SIZE_MAX; switch (port.get_type()) { - case ov::snippets::lowered::ExpressionPort::Type::Input: - offset = ma_op->get_input_offset(port.get_index()); - id = get_cluster_id(port.get_port_connector_ptr()->get_source()); - break; - case ov::snippets::lowered::ExpressionPort::Type::Output: - offset = ma_op->get_output_offset(port.get_index()); - for (const auto& child : port.get_connected_ports()) - if (!ov::is_type(child.get_expr()->get_node())) - id = get_cluster_id(child); - break; - default: - OV_CPU_JIT_EMITTER_THROW("Uknown type of expression port!"); + case ov::snippets::lowered::ExpressionPort::Type::Input: + offset = ma_op->get_input_offset(port.get_index()); + id = get_cluster_id(port.get_port_connector_ptr()->get_source()); + break; + case ov::snippets::lowered::ExpressionPort::Type::Output: + offset = ma_op->get_output_offset(port.get_index()); + for (const auto& child : port.get_connected_ports()) + if (!ov::is_type(child.get_expr()->get_node())) + id = get_cluster_id(child); + break; + default: + OV_CPU_JIT_EMITTER_THROW("Uknown type of expression port!"); } OV_CPU_JIT_EMITTER_ASSERT(IMPLICATION(ov::snippets::utils::is_dynamic_value(offset), id != SIZE_MAX), "In dynamic case Buffer Cluster ID must be known!"); @@ -46,31 +45,41 @@ size_t get_buffer_cluster_id(const ov::snippets::lowered::ExpressionPort& port) Xbyak::Reg64 get_aux_gpr(const std::vector& used_gpr_idxs) { // RSP, RBP - stack-related registers, abi_param1 - runtime parameter register in the kernel - static std::unordered_set blacklist_gpr_idxs = { Xbyak::Operand::RSP, 
Xbyak::Operand::RBP, static_cast(abi_param1.getIdx()) }; + static std::unordered_set blacklist_gpr_idxs = {Xbyak::Operand::RSP, + Xbyak::Operand::RBP, + static_cast(abi_param1.getIdx())}; for (size_t gpr_idx = 0; gpr_idx <= Xbyak::Operand::R15; ++gpr_idx) { - size_t _idx = Xbyak::Operand::R15 - gpr_idx; // we allocate from the end - if (std::find(used_gpr_idxs.cbegin(), used_gpr_idxs.cend(), _idx) != used_gpr_idxs.cend()) continue; - if (blacklist_gpr_idxs.count(_idx) > 0) continue; + size_t _idx = Xbyak::Operand::R15 - gpr_idx; // we allocate from the end + if (std::find(used_gpr_idxs.cbegin(), used_gpr_idxs.cend(), _idx) != used_gpr_idxs.cend()) + continue; + if (blacklist_gpr_idxs.count(_idx) > 0) + continue; return Xbyak::Reg64(_idx); } OV_CPU_JIT_EMITTER_THROW("Failed to allocate aux GPR"); } -void push_ptr_with_runtime_offset_on_stack(dnnl::impl::cpu::x64::jit_generator* h, size_t stack_offset, - Xbyak::Reg64 ptr_reg, Xbyak::Reg64 aux_reg, size_t runtime_offset) { +void push_ptr_with_runtime_offset_on_stack(dnnl::impl::cpu::x64::jit_generator* h, + size_t stack_offset, + Xbyak::Reg64 ptr_reg, + Xbyak::Reg64 aux_reg, + size_t runtime_offset) { const auto stack_frame = h->qword[h->rsp + stack_offset]; h->mov(aux_reg, ptr_reg); h->add(aux_reg, h->ptr[abi_param1 + runtime_offset]); h->mov(stack_frame, aux_reg); } -void push_ptr_with_static_offset_on_stack(dnnl::impl::cpu::x64::jit_generator* h, size_t stack_offset, - Xbyak::Reg64 ptr_reg, size_t ptr_offset) { +void push_ptr_with_static_offset_on_stack(dnnl::impl::cpu::x64::jit_generator* h, + size_t stack_offset, + Xbyak::Reg64 ptr_reg, + size_t ptr_offset) { const auto stack_frame = h->qword[h->rsp + stack_offset]; h->mov(stack_frame, ptr_reg); - if (ptr_offset != 0) h->add(stack_frame, ptr_offset); + if (ptr_offset != 0) + h->add(stack_frame, ptr_offset); } -} // namespace utils -} // namespace intel_cpu -} // namespace ov +} // namespace utils +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/emitters/snippets/x64/utils.hpp b/src/plugins/intel_cpu/src/emitters/snippets/x64/utils.hpp index 97ea86f404fd67..3d8026ea33c750 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/x64/utils.hpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/x64/utils.hpp @@ -13,13 +13,17 @@ namespace utils { inline static std::vector transform_idxs_to_regs(const std::vector& idxs) { std::vector regs(idxs.size()); - std::transform(idxs.begin(), idxs.end(), regs.begin(), [](size_t idx){return Xbyak::Reg64(static_cast(idx));}); + std::transform(idxs.begin(), idxs.end(), regs.begin(), [](size_t idx) { + return Xbyak::Reg64(static_cast(idx)); + }); return regs; } inline static std::vector transform_snippets_regs_to_idxs(const std::vector& regs) { std::vector idxs(regs.size()); - std::transform(regs.cbegin(), regs.cend(), idxs.begin(), [](const snippets::Reg& reg) { return reg.idx; }); + std::transform(regs.cbegin(), regs.cend(), idxs.begin(), [](const snippets::Reg& reg) { + return reg.idx; + }); return idxs; } @@ -46,8 +50,11 @@ Xbyak::Reg64 get_aux_gpr(const std::vector& used_gpr_idxs); * @param aux_reg aux register * @param runtime_offset offset in runtime params `abi_param1` */ -void push_ptr_with_runtime_offset_on_stack(dnnl::impl::cpu::x64::jit_generator* h, size_t stack_offset, - Xbyak::Reg64 ptr_reg, Xbyak::Reg64 aux_reg, size_t runtime_offset); +void push_ptr_with_runtime_offset_on_stack(dnnl::impl::cpu::x64::jit_generator* h, + size_t stack_offset, + Xbyak::Reg64 ptr_reg, + Xbyak::Reg64 aux_reg, + size_t 
runtime_offset); /** * @brief Push data pointer on stack adding static offset `ptr_offset` @@ -56,9 +63,11 @@ void push_ptr_with_runtime_offset_on_stack(dnnl::impl::cpu::x64::jit_generator* * @param ptr_reg register contains data pointer * @param ptr_offset offset which will be added to data pointer */ -void push_ptr_with_static_offset_on_stack(dnnl::impl::cpu::x64::jit_generator* h, size_t stack_offset, - Xbyak::Reg64 ptr_reg, size_t ptr_offset); +void push_ptr_with_static_offset_on_stack(dnnl::impl::cpu::x64::jit_generator* h, + size_t stack_offset, + Xbyak::Reg64 ptr_reg, + size_t ptr_offset); -} // namespace utils -} // namespace intel_cpu -} // namespace ov \ No newline at end of file +} // namespace utils +} // namespace intel_cpu +} // namespace ov \ No newline at end of file diff --git a/src/plugins/intel_cpu/src/emitters/snippets/x64/verbose.cpp b/src/plugins/intel_cpu/src/emitters/snippets/x64/verbose.cpp index 78563bc00aa228..9ac7f0d5cd0ffc 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/x64/verbose.cpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/x64/verbose.cpp @@ -4,17 +4,20 @@ #ifdef SNIPPETS_DEBUG_CAPS -#include "verbose.hpp" -#include "jit_segfault_detector_emitter.hpp" -#include "jit_memory_emitters.hpp" -#include "jit_brgemm_emitter.hpp" -#include "jit_brgemm_copy_b_emitter.hpp" -#include "jit_kernel_emitter.hpp" -#include "jit_snippets_emitters.hpp" - -#ifndef _WIN32 -#include -#endif +# include "verbose.hpp" + +# include "jit_brgemm_copy_b_emitter.hpp" +# include "jit_brgemm_emitter.hpp" +# include "jit_kernel_emitter.hpp" +# include "jit_memory_emitters.hpp" +# include "jit_segfault_detector_emitter.hpp" +# include "jit_snippets_emitters.hpp" +# include "kernel_executors/brgemm.hpp" +# include "kernel_executors/brgemm_amx.hpp" + +# ifndef _WIN32 +# include +# endif namespace ov { namespace intel_cpu { @@ -41,63 +44,59 @@ std::string vector_to_string(const T& v) { std::string get_emitter_type_name(const jit_emitter* emitter) { std::string name = typeid(*emitter).name(); -#ifndef _WIN32 +# ifndef _WIN32 int status; - std::unique_ptr demangled_name( - abi::__cxa_demangle(name.c_str(), nullptr, nullptr, &status), - std::free); + std::unique_ptr demangled_name(abi::__cxa_demangle(name.c_str(), nullptr, nullptr, &status), + std::free); name = demangled_name.get(); -#endif +# endif return name; } -std::string init_info_jit_memory_emitter(const jit_memory_emitter *emitter) { +std::string init_info_jit_memory_emitter(const jit_memory_emitter* emitter) { std::stringstream ss; - ss << " src_precision:" << emitter->src_prc - << " dst_precision:" << emitter->dst_prc - << " load/store_element_number:" << emitter->count - << " byte_offset:" << emitter->compiled_byte_offset; + ss << " src_precision:" << emitter->src_prc << " dst_precision:" << emitter->dst_prc + << " load/store_element_number:" << emitter->count << " byte_offset:" << emitter->compiled_byte_offset; return ss.str(); } -static std::string init_info_jit_load_memory_emitter(const jit_load_memory_emitter *emitter) { +static std::string init_info_jit_load_memory_emitter(const jit_load_memory_emitter* emitter) { std::stringstream ss; std::string memory_emitter_info = init_info_jit_memory_emitter(emitter); - ss << "Emitter_type_name:jit_load_memory_emitter" - << memory_emitter_info; + ss << "Emitter_type_name:jit_load_memory_emitter" << memory_emitter_info; return ss.str(); } -static std::string init_info_jit_load_broadcast_emitter(const jit_load_broadcast_emitter *emitter) { +static std::string 
init_info_jit_load_broadcast_emitter(const jit_load_broadcast_emitter* emitter) { std::stringstream ss; std::string memory_emitter_info = init_info_jit_memory_emitter(emitter); - ss << "Emitter_type_name:jit_load_broadcast_emitter" - << memory_emitter_info; + ss << "Emitter_type_name:jit_load_broadcast_emitter" << memory_emitter_info; return ss.str(); } -static std::string init_info_jit_store_memory_emitter(const jit_store_memory_emitter *emitter) { +static std::string init_info_jit_store_memory_emitter(const jit_store_memory_emitter* emitter) { std::stringstream ss; std::string memory_emitter_info = init_info_jit_memory_emitter(emitter); - ss << "Emitter_type_name:jit_store_memory_emitter" - << memory_emitter_info; + ss << "Emitter_type_name:jit_store_memory_emitter" << memory_emitter_info; return ss.str(); } -std::string init_info_jit_brgemm_emitter(const jit_brgemm_emitter *emitter) { +std::string init_info_jit_brgemm_emitter(const jit_brgemm_emitter* emitter) { std::stringstream ss; - ss << "Emitter_type_name:jit_brgemm_emitter" - << emitter->m_kernel_executor->to_string() - << " m_memory_offset:" << vector_to_string(emitter->m_memory_offsets) + ss << "Emitter_type_name:jit_brgemm_emitter"; + if (const auto& common = std::dynamic_pointer_cast(emitter->m_kernel_executor)) + ss << common->to_string(); + if (const auto& amx = std::dynamic_pointer_cast(emitter->m_kernel_executor)) + ss << amx->to_string(); + ss << " m_memory_offset:" << vector_to_string(emitter->m_memory_offsets) << " m_buffer_ids:" << vector_to_string(emitter->m_buffer_ids); return ss.str(); } -std::string init_info_jit_brgemm_copy_b_emitter(const jit_brgemm_copy_b_emitter *emitter) { +std::string init_info_jit_brgemm_copy_b_emitter(const jit_brgemm_copy_b_emitter* emitter) { std::stringstream ss; - ss << "Emitter_type_name:jit_brgemm_copy_b_emitter" - << emitter->m_kernel_executor->to_string() + ss << "Emitter_type_name:jit_brgemm_copy_b_emitter" << emitter->m_kernel_executor->to_string() << " m_memory_offset:" << vector_to_string(emitter->m_memory_offsets) << " m_buffer_ids:" << vector_to_string(emitter->m_buffer_ids); @@ -108,11 +107,9 @@ std::string init_info_jit_kernel_static_emitter(const jit_kernel_static_emitter* std::stringstream ss; ss << "Emitter_type_name:jit_kernel_static_emitter" << " jcp.exec_domain:" << vector_to_string(emitter->jcp.exec_domain) - << " gp_regs_pool:"<< vector_to_string(emitter->gp_regs_pool) - << " master_shape:" << vector_to_string(emitter->master_shape) - << " num_inputs:" << emitter->num_inputs - << " num_outputs:" << emitter->num_outputs - << " num_unique_buffers:" << emitter->num_unique_buffers + << " gp_regs_pool:" << vector_to_string(emitter->gp_regs_pool) + << " master_shape:" << vector_to_string(emitter->master_shape) << " num_inputs:" << emitter->num_inputs + << " num_outputs:" << emitter->num_outputs << " num_unique_buffers:" << emitter->num_unique_buffers << " data_ptr_regs_idx:" << vector_to_string(emitter->data_ptr_regs_idx) << " vec_regs_pool:" << vector_to_string(emitter->vec_regs_pool) << " reg_indexes_idx:" << emitter->reg_indexes_idx @@ -125,24 +122,20 @@ std::string init_info_jit_kernel_static_emitter(const jit_kernel_static_emitter* std::string init_info_jit_kernel_dynamic_emitter(const jit_kernel_dynamic_emitter* emitter) { std::stringstream ss; ss << "Emitter_type_name:jit_kernel_dynamic_emitter" - << " gp_regs_pool:"<< vector_to_string(emitter->gp_regs_pool) - << " num_inputs:" << emitter->num_inputs - << " num_outputs:" << emitter->num_outputs - << " 
num_unique_buffers:" << emitter->num_unique_buffers + << " gp_regs_pool:" << vector_to_string(emitter->gp_regs_pool) << " num_inputs:" << emitter->num_inputs + << " num_outputs:" << emitter->num_outputs << " num_unique_buffers:" << emitter->num_unique_buffers << " data_ptr_regs_idx:" << vector_to_string(emitter->data_ptr_regs_idx) << " vec_regs_pool:" << vector_to_string(emitter->vec_regs_pool) << " reg_runtime_params_idx:" << emitter->reg_runtime_params_idx; return ss.str(); } -std::string init_info_jit_uni_segfault_detector_emitter(const jit_uni_segfault_detector_emitter *emitter) { +std::string init_info_jit_uni_segfault_detector_emitter(const jit_uni_segfault_detector_emitter* emitter) { std::stringstream ss; - ss << "Node_name:" << emitter->m_target_node_name - << " use_load_emitter:"<< emitter->is_target_use_load_emitter - << " use_store_emitter:"<< emitter->is_target_use_store_emitter; + ss << "Node_name:" << emitter->m_target_node_name << " use_load_emitter:" << emitter->is_target_use_load_emitter + << " use_store_emitter:" << emitter->is_target_use_store_emitter; if (emitter->is_target_use_load_emitter || emitter->is_target_use_store_emitter) { - ss << " start_address:" << emitter->start_address - << " current_address:" << emitter->current_address + ss << " start_address:" << emitter->start_address << " current_address:" << emitter->current_address << " iteration:" << emitter->iteration << " "; } // traget emitter info @@ -152,14 +145,15 @@ std::string init_info_jit_uni_segfault_detector_emitter(const jit_uni_segfault_d return ss.str(); } -static std::string init_info_jit_emitter_general(const jit_emitter *emitter) { +static std::string init_info_jit_emitter_general(const jit_emitter* emitter) { std::stringstream ss; ss << "Emitter_type_name:" << get_emitter_type_name(emitter); return ss.str(); } -void jit_emitter_info_t::init(const jit_emitter *emitter) { - if (is_initialized_) return; +void jit_emitter_info_t::init(const jit_emitter* emitter) { + if (is_initialized_) + return; if (auto e_type = dynamic_cast(emitter)) { str_ = init_info_jit_load_memory_emitter(e_type); } else if (auto e_type = dynamic_cast(emitter)) { @@ -182,7 +176,7 @@ void jit_emitter_info_t::init(const jit_emitter *emitter) { is_initialized_ = true; } -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov #endif \ No newline at end of file diff --git a/src/plugins/intel_cpu/src/emitters/snippets/x64/verbose.hpp b/src/plugins/intel_cpu/src/emitters/snippets/x64/verbose.hpp index a81364039b98a7..ffbe210f75d2ff 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/x64/verbose.hpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/x64/verbose.hpp @@ -4,27 +4,30 @@ #ifdef SNIPPETS_DEBUG_CAPS -#pragma once +# pragma once -#include +# include namespace ov { namespace intel_cpu { class jit_emitter; struct jit_emitter_info_t { jit_emitter_info_t() = default; - jit_emitter_info_t(const jit_emitter_info_t &rhs) - : str_(rhs.str_), is_initialized_(rhs.is_initialized_) {} - jit_emitter_info_t &operator=(const jit_emitter_info_t &rhs) { + jit_emitter_info_t(const jit_emitter_info_t& rhs) : str_(rhs.str_), is_initialized_(rhs.is_initialized_) {} + jit_emitter_info_t& operator=(const jit_emitter_info_t& rhs) { is_initialized_ = rhs.is_initialized_; str_ = rhs.str_; return *this; } - const char *c_str() const { return str_.c_str(); } - bool is_initialized() const { return is_initialized_; } + const char* c_str() const { + return str_.c_str(); + } + bool is_initialized() const { + return 
is_initialized_; + } - void init(const jit_emitter *emitter); + void init(const jit_emitter* emitter); private: std::string str_; @@ -33,7 +36,7 @@ struct jit_emitter_info_t { std::string get_emitter_type_name(const jit_emitter* emitter); -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov #endif \ No newline at end of file diff --git a/src/plugins/intel_cpu/src/emitters/utils.cpp b/src/plugins/intel_cpu/src/emitters/utils.cpp index b92277ae643218..43172e1b600843 100644 --- a/src/plugins/intel_cpu/src/emitters/utils.cpp +++ b/src/plugins/intel_cpu/src/emitters/utils.cpp @@ -7,28 +7,29 @@ namespace ov { namespace intel_cpu { -std::string jit_emitter_pretty_name(const std::string &pretty_func) { -#define SAFE_SYMBOL_FINDING(idx, find) \ - auto idx = (find); \ +std::string jit_emitter_pretty_name(const std::string& pretty_func) { +#define SAFE_SYMBOL_FINDING(idx, find) \ + auto idx = (find); \ if (idx == std::string::npos || idx == 0) \ return pretty_func; // Example: - // pretty_func := void ov::intel_cpu::jit_load_memory_emitter::emit_impl(const std::vector& in, const std::vector& out) const - // begin := -----------| - // end := ---------------------------------------------------| - // result := ov::intel_cpu::jit_load_memory_emitter + // pretty_func := void ov::intel_cpu::jit_load_memory_emitter::emit_impl(const std::vector& in, const + // std::vector& out) const begin := -----------| end := + // ---------------------------------------------------| result := ov::intel_cpu::jit_load_memory_emitter // Signatures: // GCC: void foo() [with T = {type}] // clang: void foo() [T = {type}] // MSVC: void __cdecl foo<{type}>(void) SAFE_SYMBOL_FINDING(parenthesis, pretty_func.find("(")) - if (pretty_func[parenthesis - 1] == '>') { // To cover template on MSVC + if (pretty_func[parenthesis - 1] == '>') { // To cover template on MSVC parenthesis--; size_t counter = 1; while (counter != 0 && parenthesis > 0) { parenthesis--; - if (pretty_func[parenthesis] == '>') counter++; - if (pretty_func[parenthesis] == '<') counter--; + if (pretty_func[parenthesis] == '>') + counter++; + if (pretty_func[parenthesis] == '<') + counter--; } } SAFE_SYMBOL_FINDING(end, pretty_func.substr(0, parenthesis).rfind("::")) @@ -38,5 +39,5 @@ std::string jit_emitter_pretty_name(const std::string &pretty_func) { return end > begin ? pretty_func.substr(begin, end - begin) : pretty_func; } -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/emitters/utils.hpp b/src/plugins/intel_cpu/src/emitters/utils.hpp index 4c3210579d7fd2..7c89b720159dde 100644 --- a/src/plugins/intel_cpu/src/emitters/utils.hpp +++ b/src/plugins/intel_cpu/src/emitters/utils.hpp @@ -5,21 +5,22 @@ #pragma once #include + #include "openvino/core/except.hpp" namespace ov { namespace intel_cpu { -std::string jit_emitter_pretty_name(const std::string &pretty_func); +std::string jit_emitter_pretty_name(const std::string& pretty_func); #ifdef __GNUC__ -#define OV_CPU_JIT_EMITTER_NAME jit_emitter_pretty_name(__PRETTY_FUNCTION__) +# define OV_CPU_JIT_EMITTER_NAME jit_emitter_pretty_name(__PRETTY_FUNCTION__) #else /* __GNUC__ */ -#define OV_CPU_JIT_EMITTER_NAME jit_emitter_pretty_name(__FUNCSIG__) +# define OV_CPU_JIT_EMITTER_NAME jit_emitter_pretty_name(__FUNCSIG__) #endif /* __GNUC__ */ -#define OV_CPU_JIT_EMITTER_THROW(...) OPENVINO_THROW(OV_CPU_JIT_EMITTER_NAME, ": ", __VA_ARGS__) +#define OV_CPU_JIT_EMITTER_THROW(...) 
OPENVINO_THROW(OV_CPU_JIT_EMITTER_NAME, ": ", __VA_ARGS__) #define OV_CPU_JIT_EMITTER_ASSERT(cond, ...) OPENVINO_ASSERT((cond), OV_CPU_JIT_EMITTER_NAME, ": ", __VA_ARGS__) -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/extension.cpp b/src/plugins/intel_cpu/src/extension.cpp index a29282d4af3101..bdb5211009a22a 100644 --- a/src/plugins/intel_cpu/src/extension.cpp +++ b/src/plugins/intel_cpu/src/extension.cpp @@ -7,6 +7,10 @@ #include "openvino/core/op_extension.hpp" #include "ov_ops/augru_cell.hpp" #include "ov_ops/augru_sequence.hpp" +#include "ov_ops/fully_connected.hpp" +#include "ov_ops/fully_connected_compressed.hpp" +#include "ov_ops/fully_connected_quantized.hpp" +#include "ov_ops/fully_connected_quantized_legacy.hpp" #include "ov_ops/gather_compressed.hpp" #include "ov_ops/multiclass_nms_ie_internal.hpp" #include "ov_ops/nms_ie_internal.hpp" @@ -16,15 +20,14 @@ #include "ov_ops/type_relaxed.hpp" #include "snippets/op/subgraph.hpp" #include "transformations/cpu_opset/common/op/causal_mask_preprocess.hpp" -#include "transformations/cpu_opset/common/op/fully_connected.hpp" #include "transformations/cpu_opset/common/op/leaky_relu.hpp" #include "transformations/cpu_opset/common/op/ngram.hpp" #include "transformations/cpu_opset/common/op/power_static.hpp" #include "transformations/cpu_opset/common/op/sdpa.hpp" #include "transformations/cpu_opset/common/op/swish_cpu.hpp" #include "transformations/cpu_opset/x64/op/interaction.hpp" -#include "transformations/cpu_opset/x64/op/mha.hpp" #include "transformations/cpu_opset/x64/op/llm_mlp.hpp" +#include "transformations/cpu_opset/x64/op/mha.hpp" #include "transformations/cpu_opset/x64/op/qkv_proj.hpp" #include "transformations/snippets/x64/op/brgemm_copy_b.hpp" #include "transformations/snippets/x64/op/brgemm_cpu.hpp" @@ -37,8 +40,7 @@ namespace { template class TypeRelaxedExtension : public ov::OpExtension> { public: - TypeRelaxedExtension() - : m_ext_type(Op::get_type_info_static().name, "type_relaxed_opset") {} + TypeRelaxedExtension() : m_ext_type(Op::get_type_info_static().name, "type_relaxed_opset") {} ~TypeRelaxedExtension() override = default; const ov::DiscreteTypeInfo& get_type_info() const override { @@ -70,7 +72,6 @@ class TypeRelaxedExtension : public ov::OpExtension> { #endif #define CPU_EXTENSIONS \ - OP_EXTENSION(ov::intel_cpu::FullyConnectedNode) \ OP_EXTENSION(ov::intel_cpu::LeakyReluNode) \ OP_EXTENSION(ov::intel_cpu::PowerStaticNode) \ OP_EXTENSION(ov::intel_cpu::CausalMaskPreprocessNode) \ @@ -85,6 +86,10 @@ class TypeRelaxedExtension : public ov::OpExtension> { OP_EXTENSION(ov::op::internal::NmsStaticShapeIE) \ OP_EXTENSION(ov::op::internal::RMS) \ OP_EXTENSION(ov::op::internal::RoPE) \ + OP_EXTENSION(ov::op::internal::FullyConnected) \ + OP_EXTENSION(ov::op::internal::FullyConnectedCompressed) \ + OP_EXTENSION(ov::op::internal::FullyConnectedQuantizedLegacy) \ + OP_EXTENSION(ov::op::internal::FullyConnectedQuantized) \ OP_EXTENSION_X64(ov::intel_cpu::MHANode) \ OP_EXTENSION_X64(ov::intel_cpu::InteractionNode) \ OP_EXTENSION_X64(ov::intel_cpu::LLMMLPNode) \ @@ -153,31 +158,31 @@ class TypeRelaxedExtension : public ov::OpExtension> { # define SNIPPETS_DEBUG_CAPS_EXTENSIONS #endif -#define SNIPPETS_EXTENSIONS \ - OP_EXTENSION(ov::snippets::op::Brgemm) \ - OP_EXTENSION(ov::snippets::op::BroadcastLoad) \ - OP_EXTENSION(ov::snippets::op::BroadcastMove) \ - OP_EXTENSION(ov::snippets::op::ConvertSaturation) \ - 
OP_EXTENSION(ov::snippets::op::ConvertTruncation) \ - OP_EXTENSION(ov::snippets::op::Fill) \ - OP_EXTENSION(ov::snippets::op::HorizonMax) \ - OP_EXTENSION(ov::snippets::op::HorizonSum) \ - OP_EXTENSION(ov::snippets::op::KernelStatic) \ - OP_EXTENSION(ov::snippets::op::KernelDynamic) \ - OP_EXTENSION(ov::snippets::op::Load) \ - OP_EXTENSION(ov::snippets::op::LoadReshape) \ - OP_EXTENSION(ov::snippets::op::LoopBegin) \ - OP_EXTENSION(ov::snippets::op::LoopEnd) \ - OP_EXTENSION(ov::snippets::op::Buffer) \ - OP_EXTENSION(ov::snippets::op::Nop) \ - OP_EXTENSION(ov::snippets::op::PowerStatic) \ - OP_EXTENSION(ov::snippets::op::Scalar) \ - OP_EXTENSION(ov::snippets::op::Store) \ - OP_EXTENSION(ov::snippets::op::Subgraph) \ - OP_EXTENSION(ov::snippets::op::VectorBuffer) \ - OP_EXTENSION(ov::snippets::op::RankNormalization) \ - OP_EXTENSION(ov::snippets::op::ReduceMax) \ - OP_EXTENSION(ov::snippets::op::ReduceSum) \ +#define SNIPPETS_EXTENSIONS \ + OP_EXTENSION(ov::snippets::op::Brgemm) \ + OP_EXTENSION(ov::snippets::op::BroadcastLoad) \ + OP_EXTENSION(ov::snippets::op::BroadcastMove) \ + OP_EXTENSION(ov::snippets::op::ConvertSaturation) \ + OP_EXTENSION(ov::snippets::op::ConvertTruncation) \ + OP_EXTENSION(ov::snippets::op::Fill) \ + OP_EXTENSION(ov::snippets::op::HorizonMax) \ + OP_EXTENSION(ov::snippets::op::HorizonSum) \ + OP_EXTENSION(ov::snippets::op::KernelStatic) \ + OP_EXTENSION(ov::snippets::op::KernelDynamic) \ + OP_EXTENSION(ov::snippets::op::Load) \ + OP_EXTENSION(ov::snippets::op::LoadReshape) \ + OP_EXTENSION(ov::snippets::op::LoopBegin) \ + OP_EXTENSION(ov::snippets::op::LoopEnd) \ + OP_EXTENSION(ov::snippets::op::Buffer) \ + OP_EXTENSION(ov::snippets::op::Nop) \ + OP_EXTENSION(ov::snippets::op::PowerStatic) \ + OP_EXTENSION(ov::snippets::op::Scalar) \ + OP_EXTENSION(ov::snippets::op::Store) \ + OP_EXTENSION(ov::snippets::op::Subgraph) \ + OP_EXTENSION(ov::snippets::op::VectorBuffer) \ + OP_EXTENSION(ov::snippets::op::RankNormalization) \ + OP_EXTENSION(ov::snippets::op::ReduceMax) \ + OP_EXTENSION(ov::snippets::op::ReduceSum) \ OP_EXTENSION(ov::snippets::op::Reshape) OPENVINO_CREATE_EXTENSIONS(std::vector( diff --git a/src/plugins/intel_cpu/src/graph.cpp b/src/plugins/intel_cpu/src/graph.cpp index 6aa4644f902bc9..aab78a4d5f15bd 100644 --- a/src/plugins/intel_cpu/src/graph.cpp +++ b/src/plugins/intel_cpu/src/graph.cpp @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include @@ -16,6 +17,7 @@ #include #include +#include "common/primitive_desc_iface.hpp" #include "edge.h" #include "graph_dumper.h" #include "graph_optimizer.h" @@ -28,25 +30,21 @@ #include "nodes/common/cpu_memcpy.h" #include "nodes/convert.h" #include "nodes/input.h" -#include "nodes/reorder.h" #include "nodes/memory.hpp" +#include "nodes/reorder.h" #include "openvino/core/except.hpp" #include "openvino/core/model.hpp" #include "openvino/core/node.hpp" +#include "openvino/core/parallel.hpp" #include "openvino/core/type/element_type.hpp" +#include "openvino/runtime/exception.hpp" +#include "openvino/runtime/threading/cpu_streams_executor.hpp" #include "utils/debug_capabilities.h" #include "utils/general_utils.h" #include "utils/ngraph_utils.hpp" #include "utils/node_dumper.h" -#include "utils/verbose.h" #include "utils/precision_support.h" - -#include -#include "common/primitive_desc_iface.hpp" - -#include "openvino/runtime/exception.hpp" -#include "openvino/runtime/threading/cpu_streams_executor.hpp" -#include "openvino/core/parallel.hpp" +#include "utils/verbose.h" #if (OV_THREAD == 
OV_THREAD_TBB || OV_THREAD == OV_THREAD_TBB_AUTO) # include @@ -61,8 +59,8 @@ Graph::~Graph() { CPU_DEBUG_CAP_ENABLE(average_counters(*this)); } -template -void Graph::CreateGraph(NET &model, const GraphContext::CPtr context) { +template +void Graph::CreateGraph(NET& model, const GraphContext::CPtr context) { OV_ITT_SCOPE(FIRST_INFERENCE, itt::domains::intel_cpu_LT, "CreateGraph"); Init(model, context); @@ -104,7 +102,7 @@ void Graph::CreateGraph(const std::vector& graphNodes, template void Graph::CreateGraph(const std::shared_ptr&, const GraphContext::CPtr); -void Graph::Replicate(const std::shared_ptr &model, +void Graph::Replicate(const std::shared_ptr& model, const std::vector& inputConfigs, const std::vector& outputConfigs) { OV_ITT_SCOPE_CHAIN(FIRST_INFERENCE, taskChain, itt::domains::intel_cpu_LT, "Graph::Replicate", "ov::Model"); @@ -135,7 +133,9 @@ void Graph::Replicate(const std::shared_ptr &model, if (op->get_type_info() == op::v0::Parameter::get_type_info_static()) { auto input_index = model->get_parameter_index(std::dynamic_pointer_cast(op)); OPENVINO_ASSERT(input_index >= 0, - "CPU plugin cannot find op: ", op->get_friendly_name(), " in model parameter list!"); + "CPU plugin cannot find op: ", + op->get_friendly_name(), + " in model parameter list!"); const auto& config = static_cast(input_index) < inputConfigs.size() ? inputConfigs[input_index] : node::Input::InputConfig{}; @@ -152,7 +152,9 @@ void Graph::Replicate(const std::shared_ptr &model, if (op->get_type_info() == op::v0::Result::get_type_info_static()) { auto output_index = model->get_result_index(std::dynamic_pointer_cast(op)); OPENVINO_ASSERT(output_index >= 0, - "CPU plugin cannot find op: ", op->get_friendly_name(), " in model result list!"); + "CPU plugin cannot find op: ", + op->get_friendly_name(), + " in model result list!"); const auto& config = static_cast(output_index) < outputConfigs.size() ? 
outputConfigs[output_index] : node::Input::OutputConfig{}; @@ -179,9 +181,9 @@ void Graph::Replicate(const std::shared_ptr &model, } if (!one_of(op->get_type_info(), - op::v0::Result::get_type_info_static(), - op::v3::Assign::get_type_info_static(), - op::v6::Assign::get_type_info_static())) { + op::v0::Result::get_type_info_static(), + op::v3::Assign::get_type_info_static(), + op::v6::Assign::get_type_info_static())) { for (size_t oi = 0; oi < op->get_output_size(); oi++) { if (op->get_output_target_inputs(oi).empty()) { unusedOutputs.push_back(op->output(oi)); @@ -194,10 +196,13 @@ void Graph::Replicate(const std::shared_ptr &model, for (auto unusedOutput : unusedOutputs) { auto parentNode = op2node[unusedOutput.get_node_shared_ptr()]; const auto port = unusedOutput.get_index(); - const auto nodeName = std::string("stub_") + std::to_string(unusedOutput.get_index()) + "_" + parentNode->getName(); + const auto nodeName = + std::string("stub_") + std::to_string(unusedOutput.get_index()) + "_" + parentNode->getName(); const NodePtr outNode = std::make_shared(parentNode->outputShapes[port], parentNode->getOriginalOutputPrecisionAtPort(port), - nodeName, "Result", m_context); + nodeName, + "Result", + m_context); CreateEdge(parentNode, outNode, port, 0); AddNode(outNode); } @@ -216,7 +221,7 @@ void Graph::Replicate(const std::shared_ptr &model, EnforceInferencePrecision(); // update input precisions of consumers to avoid extra reorders - for (auto &input : inputNodesMap) { + for (auto& input : inputNodesMap) { const auto& inputNode = input.second; const auto precToSet = inputNode->getOriginalOutputPrecisionAtPort(0); const auto childEdges = inputNode->getChildEdgesAtPort(0); @@ -233,7 +238,7 @@ void Graph::Replicate(const std::shared_ptr &model, // update output precisions of producers to avoid extra reorders // do this only in case output configration is not provided explicitly if (outputConfigs.empty()) { - for (auto &output : outputNodesMap) { + for (auto& output : outputNodesMap) { const auto& outputNode = output.second; const auto precToSet = outputNode->getOriginalInputPrecisionAtPort(0); const auto parentEdge = outputNode->getParentEdgeAt(0); @@ -254,11 +259,12 @@ static std::vector IdentifySyncPoints(const std::vector& graphN continue; if (node->outputShapeDataDependency() || - // WA: for convolution plus sum(broadcast). Due to the fact that a convolution with sum use the same memory for second sum term and the output - // tensors (inPlace) resizing the output tensor, may lead to reallocation of this second term memory and possible data lost. The reallocation - // may happen when the second term shape is broadcasted to the output tensor shape. To avoid the data loss, we have a special processing for - // such cases inside the convolution node, but it works properly only when dynamic shapes inference, preparation and execution a called - // for this node sequentially. + // WA: for convolution plus sum(broadcast). Due to the fact that a convolution with sum use the same memory + // for second sum term and the output tensors (inPlace) resizing the output tensor, may lead to reallocation + // of this second term memory and possible data lost. The reallocation may happen when the second term shape + // is broadcasted to the output tensor shape. To avoid the data loss, we have a special processing for such + // cases inside the convolution node, but it works properly only when dynamic shapes inference, preparation + // and execution a called for this node sequentially. 
(node->getType() == Type::Convolution && node->isInPlace()) || // Due to the special handling of the internal states and initialization subgraphs, MemoryInput nodes must // be processed as a internal dynamism node, allowing to hide the aforementioned complexity inside the @@ -271,15 +277,17 @@ static std::vector IdentifySyncPoints(const std::vector& graphN return syncNodesInds; } -static std::tuple, std::vector> ExtractExecutableNodesAndSyncPoints(const std::vector& syncNodesInds, - const std::vector& graphNodes) { +static std::tuple, std::vector> ExtractExecutableNodesAndSyncPoints( + const std::vector& syncNodesInds, + const std::vector& graphNodes) { OV_ITT_SCOPE(FIRST_INFERENCE, itt::domains::intel_cpu_LT, "Graph::ExtractExecutableNodesAndSyncPoints"); std::unordered_map graphIdToExecutableId; std::vector executableGraphNodes; for (size_t i = 0; i < graphNodes.size(); i++) { const auto& graphNode = graphNodes[i]; - if ((!graphNode->isConstant() && graphNode->isExecutable()) || // non-constant executable or - (graphNode->isDynamicNode() && !one_of(graphNode->getType(), Type::Input, Type::Output))) { // dynamic, except inputs / outputs + if ((!graphNode->isConstant() && graphNode->isExecutable()) || // non-constant executable or + (graphNode->isDynamicNode() && + !one_of(graphNode->getType(), Type::Input, Type::Output))) { // dynamic, except inputs / outputs graphIdToExecutableId[i] = executableGraphNodes.size(); executableGraphNodes.emplace_back(graphNode); } @@ -291,17 +299,17 @@ static std::tuple, std::vector> ExtractExecutableNo auto it = graphIdToExecutableId.find(syncNodesInd); if (it != graphIdToExecutableId.end()) { uniqueExecutableSyncNodesInds.insert(it->second); - // since sometimes we need to run the synchronization node alone (for example in the case of internal dynamism) - // let's add another sync index after the sync point node + // since sometimes we need to run the synchronization node alone (for example in the case of internal + // dynamism) let's add another sync index after the sync point node uniqueExecutableSyncNodesInds.insert(it->second + 1); } } uniqueExecutableSyncNodesInds.insert(executableGraphNodes.size()); // convert to a vector to reduce runtime overhead - std::vector executableSyncNodesInds(uniqueExecutableSyncNodesInds.begin(), uniqueExecutableSyncNodesInds.end()); + std::vector executableSyncNodesInds(uniqueExecutableSyncNodesInds.begin(), + uniqueExecutableSyncNodesInds.end()); - return std::make_tuple(std::move(executableGraphNodes), - std::move(executableSyncNodesInds)); + return std::make_tuple(std::move(executableGraphNodes), std::move(executableSyncNodesInds)); } void Graph::Init(const std::shared_ptr& model, @@ -346,7 +354,7 @@ static void UseExternalOutputMemory(const std::map& output } void Graph::Activate(const std::vector& externalInputMemory, - const std::vector& externalOutputMemory) { + const std::vector& externalOutputMemory) { OPENVINO_ASSERT(status == Status::Initialized, "Invalid graph status"); const bool hasDynNodes = ProcessDynNodes(); @@ -360,12 +368,13 @@ void Graph::Activate(const std::vector& externalInputMemory, CreatePrimitivesAndExecConstants(); #ifndef CPU_DEBUG_CAPS - for (auto &graphNode : graphNodes) { + for (auto& graphNode : graphNodes) { graphNode->cleanup(); } #endif - std::tie(m_executableGraphNodes, m_executableSyncNodesInds) = ExtractExecutableNodesAndSyncPoints(syncNodesInds, graphNodes); + std::tie(m_executableGraphNodes, m_executableSyncNodesInds) = + ExtractExecutableNodesAndSyncPoints(syncNodesInds, 
graphNodes); if (hasDynNodes) { status = Status::ReadyDynamic; @@ -424,7 +433,7 @@ void Graph::Configure(bool optimize) { void Graph::InitNodes() { OV_ITT_SCOPE(FIRST_INFERENCE, itt::domains::intel_cpu_LT, "Graph::InitNodes"); - for (auto &node : graphNodes) { + for (auto& node : graphNodes) { node->init(); } } @@ -432,7 +441,7 @@ void Graph::InitNodes() { void Graph::InitDescriptors() { OV_ITT_SCOPE_CHAIN(FIRST_INFERENCE, taskChain, itt::domains::intel_cpu_LT, "InitDescriptors", "Prepare"); - for (auto &node : graphNodes) { + for (auto& node : graphNodes) { OV_ITT_SCOPE_NEXT(FIRST_INFERENCE, taskChain, node->profiling.getSupportedDescriptors); DEBUG_LOG("Get supported primitive descriptors for node: ", node->getName()); node->getSupportedDescriptors(); @@ -445,15 +454,15 @@ void Graph::InitDescriptors() { const auto& SPDs = node->getSupportedPrimitiveDescriptors(); for (size_t i = 0; i < SPDs.size(); i++) { DEBUG_LOG("#", - node->getExecIndex(), - " ", - node->getName(), - " Before filter, SupportedPrimitiveDescriptors [", - i, - "/", - SPDs.size(), - "]: \n", - SPDs[i]); + node->getExecIndex(), + " ", + node->getName(), + " Before filter, SupportedPrimitiveDescriptors [", + i, + "/", + SPDs.size(), + "]: \n", + SPDs[i]); } } #endif @@ -478,7 +487,7 @@ void Graph::InitDescriptors() { #endif } - for (auto &node : graphNodes) { + for (auto& node : graphNodes) { OV_ITT_SCOPE_NEXT(FIRST_INFERENCE, taskChain, node->profiling.selectOptimalPrimitiveDescriptor); DEBUG_LOG("Select optimal primitive descriptors for node: ", node->getName()); node->selectOptimalPrimitiveDescriptor(); @@ -495,12 +504,18 @@ void Graph::ResolveInplaceDirections() { void Graph::InitOptimalPrimitiveDescriptors() { OV_ITT_SCOPED_TASK(itt::domains::intel_cpu, "Graph::InitOptimalPrimitiveDescriptors"); - for (auto &node : graphNodes) { + for (auto& node : graphNodes) { OV_ITT_SCOPE(FIRST_INFERENCE, itt::domains::intel_cpu_LT, node->profiling.initOptimalPrimitiveDescriptor); DEBUG_LOG("Init optimal primitive descriptors for node: ", node->getName()); node->initOptimalPrimitiveDescriptor(); - DEBUG_LOG("#", node->getExecIndex(), " ", node->getName(), "\n", - *node->getSelectedPrimitiveDescriptor(), "selectedPrimitiveDescriptorIdx = ", node->selectedPrimitiveDescriptorIndex); + DEBUG_LOG("#", + node->getExecIndex(), + " ", + node->getName(), + "\n", + *node->getSelectedPrimitiveDescriptor(), + "selectedPrimitiveDescriptorIdx = ", + node->selectedPrimitiveDescriptorIndex); } } @@ -508,7 +523,7 @@ void Graph::CreatePrimitivesAndExecConstants() const { OV_ITT_SCOPE(FIRST_INFERENCE, itt::domains::intel_cpu_LT, "Graph::CreatePrimitivesAndExecConstants"); using shared_memory_ptr = WeightsSharing::SharedMemory::Ptr; - auto acquireSharedOutputs = [this](const NodePtr & node) { + auto acquireSharedOutputs = [this](const NodePtr& node) { std::vector outputs; bool hasLocalAllocatedEdges = false; bool hasExternalInvalidEdges = false; @@ -530,14 +545,14 @@ void Graph::CreatePrimitivesAndExecConstants() const { return std::make_tuple(hasExternalInvalidEdges, hasLocalAllocatedEdges, outputs); }; - for (const auto &node : graphNodes) { + for (const auto& node : graphNodes) { { OV_ITT_SCOPE(FIRST_INFERENCE, itt::domains::intel_cpu_LT, node->profiling.createPrimitive); DEBUG_LOG(*node); node->createPrimitive(); } - if (!node->isConstant()) { + if (!node->isConstant() || !node->isExecutable()) { continue; } @@ -547,7 +562,7 @@ void Graph::CreatePrimitivesAndExecConstants() const { if (std::get<0>(sharedOutputs) || std::get<1>(sharedOutputs)) { 
ExecuteNodeWithCatch(node); - for (auto & output : std::get<2>(sharedOutputs)) + for (auto& output : std::get<2>(sharedOutputs)) output->valid(true); } } else { @@ -556,7 +571,9 @@ void Graph::CreatePrimitivesAndExecConstants() const { } } -static bool isReorderAvailable(const MemoryDescPtr& parentDesc, const MemoryDescPtr& childDesc, const dnnl::engine& eng) { +static bool isReorderAvailable(const MemoryDescPtr& parentDesc, + const MemoryDescPtr& childDesc, + const dnnl::engine& eng) { auto definedParentDesc = parentDesc->isDefined() ? parentDesc : MemoryDescUtils::makeDummyDesc(*parentDesc); memory::desc srcMemDesc = MemoryDescUtils::convertToDnnlMemoryDesc(definedParentDesc)->getDnnlDesc(); @@ -566,14 +583,16 @@ static bool isReorderAvailable(const MemoryDescPtr& parentDesc, const MemoryDesc dnnl::primitive_attr attr; dnnl_primitive_desc_t result = nullptr; - auto status = dnnl_reorder_primitive_desc_create(&result, srcMemDesc.get(), eng.get(), dstMemDesc.get(), eng.get(), + auto status = dnnl_reorder_primitive_desc_create(&result, + srcMemDesc.get(), + eng.get(), + dstMemDesc.get(), + eng.get(), attr.get()); #if defined(OPENVINO_ARCH_ARM) || defined(OPENVINO_ARCH_ARM64) // temporary WA for slow FP32->FP16 conversion reorder in oneDNN on ARM // pretend the reorder is not available to use Convert node instead - if (hasHardwareSupport(ov::element::f16) && - result && - parse_impl_name(result->impl()->name()) == ref_any) { + if (hasHardwareSupport(ov::element::f16) && result && parse_impl_name(result->impl()->name()) == ref_any) { dnnl_primitive_desc_destroy(result); return false; } @@ -587,8 +606,8 @@ static bool isReorderAvailable(const MemoryDescPtr& parentDesc, const MemoryDesc void Graph::insertReorder(EdgePtr& edge, bool isOptimized, std::unordered_set& uniqueLayerNames) { std::string basicLayerName = edge->getParent()->getName() + "_" + - node::Reorder::getReorderArgs(edge->getInputDesc(), edge->getOutputDesc()) + "_" + - edge->getChild()->getName(); + node::Reorder::getReorderArgs(edge->getInputDesc(), edge->getOutputDesc()) + "_" + + edge->getChild()->getName(); std::string layerName = basicLayerName; int idx = 0; while (uniqueLayerNames.find(layerName) != uniqueLayerNames.end()) { @@ -605,11 +624,14 @@ void Graph::insertConvert(EdgePtr& edge) { const auto& inDesc = edge->getInputDesc(); const auto& outDesc = edge->getOutputDesc(); - std::string convertName = edge->getParent()->getName() + "_" + - inDesc.getPrecision().get_type_name() + "_" + outDesc.getPrecision().get_type_name(); + std::string convertName = edge->getParent()->getName() + "_" + inDesc.getPrecision().get_type_name() + "_" + + outDesc.getPrecision().get_type_name(); - auto convertNode = std::make_shared(inDesc.getShape(), inDesc.getPrecision(), outDesc.getPrecision(), - convertName, m_context); + auto convertNode = std::make_shared(inDesc.getShape(), + inDesc.getPrecision(), + outDesc.getPrecision(), + convertName, + m_context); convertNode->setDescs(inDesc, outDesc); InsertNode(edge, convertNode, true); } @@ -720,9 +742,9 @@ void Graph::AllocateWithReuse(const std::vector& syncNodesInds) { // Resolve special cases: for (size_t i = 0; i < remaining_edge_clusters_count;) { - auto &cluster = edge_clusters[i]; + auto& cluster = edge_clusters[i]; bool erase = false; - for (auto &edge : cluster) { + for (auto& edge : cluster) { // Remove already allocated edges from the mem reuse algo if (edge->getStatus() == Edge::Status::Allocated) { erase = true; @@ -730,18 +752,23 @@ void Graph::AllocateWithReuse(const std::vector& 
syncNodesInds) { } // Special allocation for string tensors - if (edge->getDesc().getPrecision() == element::string && edge->getStatus() == Edge::Status::NeedAllocation) { + if (edge->getDesc().getPrecision() == element::string && + edge->getStatus() == Edge::Status::NeedAllocation) { StringMemory::StringMemoryBlockPtr memBlcok; if (edge->getParent()->isConstant()) { if (edge->getParent()->getType() == Type::Input) { - auto constNode = static_cast(edge->getParent().get()); + auto constNode = static_cast(edge->getParent().get()); edge->reuse(std::const_pointer_cast(constNode->getMemoryPtr())); } else { edge->externalAllocate(m_context->getWeightsCache()); } - auto stringMemory = dynamic_cast(edge->getMemoryPtr().get()); - OPENVINO_ASSERT(stringMemory, "[CPU] Edge between nodes '", - edge->getParent()->getName(), "' and '", edge->getChild()->getName(), "' must have StringMemory."); + auto stringMemory = dynamic_cast(edge->getMemoryPtr().get()); + OPENVINO_ASSERT(stringMemory, + "[CPU] Edge between nodes '", + edge->getParent()->getName(), + "' and '", + edge->getChild()->getName(), + "' must have StringMemory."); memBlcok = stringMemory->getStringMemoryBlockPtr(); } else { auto memory = std::make_shared(getEngine(), edge->getDesc()); @@ -752,13 +779,18 @@ void Graph::AllocateWithReuse(const std::vector& syncNodesInds) { if (edge_c == edge) { continue; } - OPENVINO_ASSERT(edge_c->getDesc().getPrecision() == element::string, "All edges in the cluster must be string."); + OPENVINO_ASSERT(edge_c->getDesc().getPrecision() == element::string, + "All edges in the cluster must be string."); if (edge_c->getStatus() == Edge::Status::NotAllocated) { auto memory = std::make_shared(getEngine(), edge_c->getDesc(), memBlcok); edge_c->reuse(memory); } else { - OPENVINO_THROW("[CPU] String tensors allocation in the cluster. Edge between nodes '", edge_c->getParent()->getName(), "' and '", - edge_c->getChild()->getName(), "' has an unexpected status: ", static_cast(edge_c->getStatus())); + OPENVINO_THROW("[CPU] String tensors allocation in the cluster. Edge between nodes '", + edge_c->getParent()->getName(), + "' and '", + edge_c->getChild()->getName(), + "' has an unexpected status: ", + static_cast(edge_c->getStatus())); } } erase = true; @@ -800,14 +832,15 @@ void Graph::AllocateWithReuse(const std::vector& syncNodesInds) { int64_t boxSize = 0; bool isConst = false, isOutput = false, isInput = false; - for (auto &edge : edge_clusters[i]) { + for (auto& edge : edge_clusters[i]) { int e_start = edge->getParent()->getExecIndex(); int e_finish = edge->getChild()->getExecIndex(); auto&& desc = edge->getDesc(); if (boxSize != -1 && desc.isDefined()) { - int64_t e_size = desc.getCurrentMemSize(); // size in bytes (from the beginning of data to the last element) + int64_t e_size = + desc.getCurrentMemSize(); // size in bytes (from the beginning of data to the last element) boxSize = std::max(e_size, boxSize); } else { boxSize = -1; @@ -824,9 +857,9 @@ void Graph::AllocateWithReuse(const std::vector& syncNodesInds) { } reg.alloc_type = allocType; - isConst |= isConstOutput(edge); + isConst |= isConstOutput(edge); isOutput |= edge->getChild()->getType() == Type::Output; - isInput |= edge->getParent()->getType() == Type::Input; + isInput |= edge->getParent()->getType() == Type::Input; } reg.size = boxSize; @@ -878,7 +911,7 @@ void Graph::AllocateWithReuse(const std::vector& syncNodesInds) { memoryRegions.erase(it, memoryRegions.end()); - //Set up the memory control subsystem. + // Set up the memory control subsystem. 
this->m_pMemoryControl = &(getGraphContext()->getNetworkMemoryControl()->createMemoryControlUnit(syncNodesInds)); auto memoryBlocks = m_pMemoryControl->insert(memoryRegions); @@ -911,9 +944,8 @@ void Graph::AllocateWithReuse(const std::vector& syncNodesInds) { } std::vector edges_to_process; edges_to_process.push_back(edge); - for (auto next_edge = edge->getSharedEdge(std::nothrow); - next_edge; - next_edge = next_edge->getSharedEdge(std::nothrow)) { + for (auto next_edge = edge->getSharedEdge(std::nothrow); next_edge; + next_edge = next_edge->getSharedEdge(std::nothrow)) { edges_to_process.push_back(next_edge); } std::for_each(edges_to_process.rbegin(), edges_to_process.rend(), [](const EdgePtr& edge) { @@ -937,16 +969,15 @@ void Graph::AllocateWithReuse(const std::vector& syncNodesInds) { void Graph::Allocate(const std::vector& syncNodesInds) { OV_ITT_SCOPE(FIRST_INFERENCE, itt::domains::intel_cpu_LT, "Graph::Allocate"); - //resolve inplace dead end nodes + // resolve inplace dead end nodes for (const auto& edge : graphEdges) { if (edge->getStatus() == Edge::Status::Uninitialized) { if (edge->getParent()->getParentEdges().empty() && - one_of(edge->getParent()->getType(), Type::Input, Type::MemoryInput) && - edge->inPlace(Edge::LOOK_UP)) { + one_of(edge->getParent()->getType(), Type::Input, Type::MemoryInput) && edge->inPlace(Edge::LOOK_UP)) { edge->getParent()->resolveInPlaceEdges(Edge::LOOK_UP); } else if (edge->getChild()->getChildEdges().empty() && - one_of(edge->getChild()->getType(), Type::Output, Type::MemoryOutput) && - edge->inPlace(Edge::LOOK_DOWN)) { + one_of(edge->getChild()->getType(), Type::Output, Type::MemoryOutput) && + edge->inPlace(Edge::LOOK_DOWN)) { edge->getChild()->resolveInPlaceEdges(Edge::LOOK_DOWN); } } @@ -955,13 +986,15 @@ void Graph::Allocate(const std::vector& syncNodesInds) { // resolve edges. Define which will be a view on others // NeedAllocation - real blob // NotAllocated - view on other blob, peer or in-place - for (auto& edge : graphEdges) edge->init(); + for (auto& edge : graphEdges) + edge->init(); // Allocate memory space for all edges marked with NeedAllocation AllocateWithReuse(syncNodesInds); // Check all getters. Should work. - for (auto& edge : graphEdges) edge->validate(); + for (auto& edge : graphEdges) + edge->validate(); } bool Graph::ProcessDynNodes() { @@ -975,7 +1008,8 @@ bool Graph::ProcessDynNodes() { } void Graph::PushInputData(const std::size_t& index, const ov::SoPtr& input) { - if (!IsReady()) OPENVINO_THROW("Wrong state. Topology not ready."); + if (!IsReady()) + OPENVINO_THROW("Wrong state. Topology not ready."); auto input_itr = inputNodesMap.find(index); if (input_itr != inputNodesMap.end()) { auto node = input_itr->second; @@ -1010,7 +1044,7 @@ void Graph::PullOutputData(std::unordered_map>& if (!IsReady()) OPENVINO_THROW("Wrong state. Topology not ready."); - for (auto &outputMap : outputNodesMap) { + for (auto& outputMap : outputNodesMap) { auto output_index = outputMap.first; auto node = outputMap.second; auto parentEdge = node->getParentEdgeAt(0); @@ -1040,17 +1074,32 @@ void Graph::PullOutputData(std::unordered_map>& if (ext_blob->get_shape() != outDims && !isScalarOutput) { // WA: because input/output info initially contains non empty dims, order etc. 
// and setDims (called inside setShape) can't correct modify blocked desc for desc with blocked layout - DEBUG_LOG(output_index, ", tensor data addr ", static_cast(output[output_index]->data()), - " dims ", PartialShape(output[output_index]->get_shape()), " -> ", PartialShape(outDims), - ", intr ptr ", intr_blob.getData(), " , parentedge's memory object ", parentEdge->getMemoryPtr().get()); + DEBUG_LOG(output_index, + ", tensor data addr ", + static_cast(output[output_index]->data()), + " dims ", + PartialShape(output[output_index]->get_shape()), + " -> ", + PartialShape(outDims), + ", intr ptr ", + intr_blob.getData(), + " , parentedge's memory object ", + parentEdge->getMemoryPtr().get()); ext_blob->set_shape(outDims); - DEBUG_LOG(output_index, ", tensor data addr ", static_cast(output[output_index]->data()), - " dims ", PartialShape(output[output_index]->get_shape()), ", intr ptr ", intr_blob.getData()); + DEBUG_LOG(output_index, + ", tensor data addr ", + static_cast(output[output_index]->data()), + " dims ", + PartialShape(output[output_index]->get_shape()), + ", intr ptr ", + intr_blob.getData()); expected_desc_ptr = MemoryDescUtils::generateCpuBlockedMemoryDesc(ext_blob); } // check for empty output blob - if (std::any_of(outDims.begin(), outDims.end(), [](const Dim dim) {return dim == 0;})) { + if (std::any_of(outDims.begin(), outDims.end(), [](const Dim dim) { + return dim == 0; + })) { continue; } @@ -1063,12 +1112,22 @@ void Graph::PullOutputData(std::unordered_map>& intr_blob.getSize(), ")."); - void *ext_blob_ptr = ext_blob->data(); - void *intr_blob_ptr = intr_blob.getData(); - DEBUG_LOG(output_index, " @ ", intr_blob_ptr, " -> ", ext_blob_ptr, " zero-copy: ", intr_blob_ptr == ext_blob_ptr, " graph ", this, "\r\n"); + void* ext_blob_ptr = ext_blob->data(); + void* intr_blob_ptr = intr_blob.getData(); + DEBUG_LOG(output_index, + " @ ", + intr_blob_ptr, + " -> ", + ext_blob_ptr, + " zero-copy: ", + intr_blob_ptr == ext_blob_ptr, + " graph ", + this, + "\r\n"); // That is the same memory. 
No need to copy - if (ext_blob_ptr == intr_blob_ptr) continue; + if (ext_blob_ptr == intr_blob_ptr) + continue; if (actualDesc->getPrecision() == element::string) { StringMemory outBloMem(getEngine(), expected_desc_ptr, ext_blob_ptr); @@ -1077,7 +1136,10 @@ void Graph::PullOutputData(std::unordered_map>& Memory outBloMem(getEngine(), expected_desc_ptr, ext_blob_ptr, false); outBloMem.load(intr_blob, false); } else { - OPENVINO_ASSERT(srcPrec == dstPrec, "The precision of the CPU output tensor index", output_index, " is different from the external one"); + OPENVINO_ASSERT(srcPrec == dstPrec, + "The precision of the CPU output tensor index", + output_index, + " is different from the external one"); size_t size_to_copy = intr_blob.getSize(); cpu_parallel_memcpy(ext_blob_ptr, intr_blob_ptr, size_to_copy); } @@ -1108,7 +1170,8 @@ namespace { class UpdateNodesSeq { public: - explicit UpdateNodesSeq(std::vector& executableGraphNodes) : m_executableGraphNodes(executableGraphNodes) {} + explicit UpdateNodesSeq(std::vector& executableGraphNodes) + : m_executableGraphNodes(executableGraphNodes) {} void operator()(size_t stopIndx) { for (; prepareCounter < stopIndx; ++prepareCounter) { @@ -1126,7 +1189,7 @@ class UpdateNodesSeq { }; #if (OV_THREAD == OV_THREAD_SEQ) - using UpdateNodes = UpdateNodesSeq; +using UpdateNodes = UpdateNodesSeq; #endif #if (OV_THREAD == OV_THREAD_TBB || OV_THREAD == OV_THREAD_TBB_AUTO || OV_THREAD == OV_THREAD_OMP) @@ -1143,7 +1206,8 @@ class UpdateNodesSeq { class UpdateNodesBase { public: - explicit UpdateNodesBase(std::vector& executableGraphNodes) : m_executableGraphNodes(executableGraphNodes) {} + explicit UpdateNodesBase(std::vector& executableGraphNodes) + : m_executableGraphNodes(executableGraphNodes) {} void updateShapes(size_t node_indx, size_t stop_indx) { try { for (size_t i = node_indx; i < stop_indx; i++) { @@ -1153,8 +1217,7 @@ class UpdateNodesBase { } m_prepareCounter.store(i, ov_memory_order_release); } - } - catch(...) { + } catch (...) 
{ m_completion.store(true, ov_memory_order_relaxed); throw; } @@ -1185,13 +1248,16 @@ class UpdateNodesBase { std::vector& m_executableGraphNodes; }; -#if (OV_THREAD == OV_THREAD_TBB || OV_THREAD == OV_THREAD_TBB_AUTO) -#if (TBB_VERSION_MAJOR > 2020) +# if (OV_THREAD == OV_THREAD_TBB || OV_THREAD == OV_THREAD_TBB_AUTO) +# if (TBB_VERSION_MAJOR > 2020) template class AsyncTask : public tbb::detail::d1::task { public: - AsyncTask(Body& body, tbb::detail::d1::wait_context& wait, size_t node_indx, size_t stop_indx) : - m_body(body), m_wait(wait), m_node_indx(node_indx), m_stop_indx(stop_indx) {} + AsyncTask(Body& body, tbb::detail::d1::wait_context& wait, size_t node_indx, size_t stop_indx) + : m_body(body), + m_wait(wait), + m_node_indx(node_indx), + m_stop_indx(stop_indx) {} task* execute(tbb::detail::d1::execution_data&) override { m_body(m_node_indx, m_stop_indx); m_wait.release(); @@ -1235,11 +1301,14 @@ class UpdateNodes : public UpdateNodesBase { private: tbb::task_group_context ctx; }; -#else +# else template class AsyncTask : public tbb::task { public: - AsyncTask(Body& body, size_t node_indx, size_t stop_indx) : m_body(body), m_node_indx(node_indx), m_stop_indx(stop_indx) {} + AsyncTask(Body& body, size_t node_indx, size_t stop_indx) + : m_body(body), + m_node_indx(node_indx), + m_stop_indx(stop_indx) {} task* execute() override { m_body(m_node_indx, m_stop_indx); return nullptr; @@ -1257,28 +1326,30 @@ class UpdateNodes : public UpdateNodesBase { void operator()(size_t stopIndx) { m_completion.store(false); auto startCounter = m_prepareCounter.load(); - tbb::task& root = *new(tbb::task::allocate_root()) tbb::empty_task; - root.set_ref_count(3); // two for children and one preserved + tbb::task& root = *new (tbb::task::allocate_root()) tbb::empty_task; + root.set_ref_count(3); // two for children and one preserved auto task1 = [this](size_t start, size_t stop) { this->updateShapes(start, stop); }; - AsyncTask& a = *new (root.allocate_child()) AsyncTask(task1, startCounter, stopIndx); + AsyncTask& a = + *new (root.allocate_child()) AsyncTask(task1, startCounter, stopIndx); auto task2 = [this](size_t start, size_t stop) { this->updateDynParams(start, stop); }; - AsyncTask& b = *new (root.allocate_child()) AsyncTask(task2, startCounter, stopIndx); + AsyncTask& b = + *new (root.allocate_child()) AsyncTask(task2, startCounter, stopIndx); - b.set_affinity(2); // slot 1 plus 1 + b.set_affinity(2); // slot 1 plus 1 tbb::task::spawn(b); root.spawn_and_wait_for_all(a); } }; -#endif -#endif +# endif +# endif -#if (OV_THREAD == OV_THREAD_OMP) +# if (OV_THREAD == OV_THREAD_OMP) class UpdateNodes : public UpdateNodesBase { public: using UpdateNodesBase::UpdateNodesBase; @@ -1293,14 +1364,15 @@ class UpdateNodes : public UpdateNodesBase { if (origin_nested_levels < 2) { set_max_nested_levels(2); } - // In OpenMP, an exception that is thrown in a parallel region must be caught and handled in the same region by the same thread. - // Therefore, need to pass the error message and throw a new exception outside the parallel region. + // In OpenMP, an exception that is thrown in a parallel region must be caught and handled in the same region by + // the same thread. Therefore, need to pass the error message and throw a new exception outside the parallel + // region. 
const char* what = nullptr; - #pragma omp parallel - #pragma omp sections +# pragma omp parallel +# pragma omp sections { - #pragma omp section +# pragma omp section { try { updateDynParams(startCounter, stopIndx); @@ -1310,7 +1382,7 @@ class UpdateNodes : public UpdateNodesBase { what = "[ CPU ] Could not update dynamic parameters."; } } - #pragma omp section +# pragma omp section { try { updateShapes(startCounter, stopIndx); @@ -1329,18 +1401,18 @@ class UpdateNodes : public UpdateNodesBase { OPENVINO_ASSERT(what == nullptr, what); } }; -#endif +# endif #endif -} // namespace +} // namespace /* group all the profiling macros into a single one * to avoid cluttering a core logic */ #define VERBOSE_PERF_DUMP_ITT_DEBUG_LOG(ittScope, node, config) \ - VERBOSE(node, config.debugCaps.verbose); \ - PERF(node, config.collectPerfCounters); \ - DUMP(node, config.debugCaps, infer_count); \ - OV_ITT_SCOPED_TASK(ittScope, node->profiling.execute); \ + VERBOSE(node, config.debugCaps.verbose); \ + PERF(node, config.collectPerfCounters); \ + DUMP(node, config.debugCaps, infer_count); \ + OV_ITT_SCOPED_TASK(ittScope, node->profiling.execute); \ DEBUG_LOG(*node); inline void Graph::ExecuteNode(const NodePtr& node, SyncInferRequest* request, int numaId) const { @@ -1362,7 +1434,7 @@ inline void Graph::ExecuteNodeWithCatch(const NodePtr& node, SyncInferRequest* r } } -template +template void Graph::InferDynamic(SyncInferRequest* request, int numaId, UpdateStrategy&& update) { size_t inferCounter = 0; for (auto stopIndx : m_executableSyncNodesInds) { @@ -1410,17 +1482,20 @@ void Graph::Infer(SyncInferRequest* request) { InferStatic(request, numaId); break; default: - OPENVINO_ASSERT(IsReady(), "Wrong state of the ov::intel_cpu::Graph. Topology is not ready: ", static_cast(status)); + OPENVINO_ASSERT(IsReady(), + "Wrong state of the ov::intel_cpu::Graph. 
Topology is not ready: ", + static_cast(status)); } - if (infer_count != -1) infer_count++; + if (infer_count != -1) + infer_count++; } void Graph::SortTopologically() { OV_ITT_SCOPE(FIRST_INFERENCE, itt::domains::intel_cpu_LT, "Graph::SortTopologically"); // Set execIndex of all nodes to default invaild value - for (auto &node : graphNodes) { + for (auto& node : graphNodes) { node->execIndex = -1; } @@ -1433,7 +1508,7 @@ void Graph::SortTopologically() { std::function visit; visit = [&execIndexCnt, &sorted, &visit](const NodePtr node) { if (node->execIndex >= 0) - return; // already visited + return; // already visited for (size_t i = 0; i < node->getParentEdges().size(); i++) { visit(node->getParentEdgeAt(i)->getParent()); @@ -1467,7 +1542,7 @@ void Graph::SortTopologically() { // Sort in / out child edges by port index // Make first N (N == port_num) edge indexes match with port index - for (auto &node : graphNodes) { + for (auto& node : graphNodes) { int port_num = node->outputShapes.size(); std::vector res(port_num); @@ -1512,10 +1587,7 @@ void Graph::GetPerfData(std::vector& perfMap) const { } } -void Graph::CreateEdge(const NodePtr& parent, - const NodePtr& child, - int parentPort, - int childPort) { +void Graph::CreateEdge(const NodePtr& parent, const NodePtr& child, int parentPort, int childPort) { assert(parentPort >= 0 && childPort >= 0); auto edge = std::make_shared(parent, child, parentPort, childPort); @@ -1539,24 +1611,28 @@ void Graph::AddNode(NodePtr node) { graphNodes.push_back(node); } -void Graph::DropNode(const NodePtr &node) { +void Graph::DropNode(const NodePtr& node) { auto children = node->childEdges; auto parents = node->parentEdges; for (size_t i = 0; i < parents.size(); i++) { auto p_edge = parents[i].lock(); - if (!p_edge) continue; + if (!p_edge) + continue; auto parent = p_edge->getParent(); - if (!parent) continue; + if (!parent) + continue; const int inNum = p_edge->getInputNum(); RemoveEdge(p_edge); for (size_t j = 0; j < children.size(); j++) { auto c_edge = children[j].lock(); - if (!c_edge) continue; + if (!c_edge) + continue; auto child = c_edge->getChild(); - if (!child) continue; + if (!child) + continue; const int outNum = c_edge->getOutputNum(); RemoveEdge(c_edge); @@ -1565,31 +1641,37 @@ void Graph::DropNode(const NodePtr &node) { } } -void Graph::DropDWConvNode(const NodePtr &node) { +void Graph::DropDWConvNode(const NodePtr& node) { auto children = node->childEdges; auto parents = node->parentEdges; auto parentConvEdge = parents[0].lock(); - if (!parentConvEdge) return; + if (!parentConvEdge) + return; auto parentConv = parentConvEdge->getParent(); - if (!parentConv) return; + if (!parentConv) + return; parentConv->outputShapes[0] = node->outputShapes[0]; for (size_t i = 0; i < 1; i++) { auto p_edge = parents[i].lock(); - if (!p_edge) continue; + if (!p_edge) + continue; auto parent = p_edge->getParent(); - if (!parent) continue; + if (!parent) + continue; const int inNum = p_edge->getInputNum(); RemoveEdge(p_edge); for (size_t j = 0; j < children.size(); j++) { auto c_edge = children[j].lock(); - if (!c_edge) continue; + if (!c_edge) + continue; auto child = c_edge->getChild(); - if (!child) continue; + if (!child) + continue; const int outNum = c_edge->getOutputNum(); RemoveEdge(c_edge); @@ -1599,9 +1681,11 @@ void Graph::DropDWConvNode(const NodePtr &node) { for (size_t i = 1; i < parents.size(); i++) { auto p_edge = parents[i].lock(); - if (!p_edge) continue; + if (!p_edge) + continue; auto parent = p_edge->getParent(); - if (!parent) 
continue; + if (!parent) + continue; const int inNum = p_edge->getInputNum(); const int portCandidate = p_edge->getOutputNum(); @@ -1615,14 +1699,20 @@ void Graph::DropDWConvNode(const NodePtr &node) { } void Graph::RemoveDroppedNodes() { - graphNodes.erase(std::remove_if(graphNodes.begin(), graphNodes.end(), - [](const NodePtr& node){ return node->isDropped(); }), + graphNodes.erase(std::remove_if(graphNodes.begin(), + graphNodes.end(), + [](const NodePtr& node) { + return node->isDropped(); + }), graphNodes.end()); } void Graph::RemoveDroppedEdges() { - graphEdges.erase(std::remove_if(graphEdges.begin(), graphEdges.end(), - [](const EdgePtr& node){ return node->isDropped(); }), + graphEdges.erase(std::remove_if(graphEdges.begin(), + graphEdges.end(), + [](const EdgePtr& node) { + return node->isDropped(); + }), graphEdges.end()); } @@ -1631,19 +1721,28 @@ NodePtr Graph::InsertReorder(EdgePtr edge, const MemoryDesc& inDesc, const MemoryDesc& outDesc, bool isOptimized, - const std::vector & src_perm) { + const std::vector& src_perm) { auto reorder = std::make_shared(inDesc, outDesc, layerName, m_context); reorder->setOptimized(isOptimized); reorder->setSrcPermutation(src_perm); DEBUG_LOG(reorder->getName(), " edge=", *edge, " isOptimized=", isOptimized); - DEBUG_LOG(" inDesc: ", inDesc.getShape().toString(), inDesc.getPrecision().get_type_name(), " ", inDesc.serializeFormat()); - DEBUG_LOG(" outDesc: ", outDesc.getShape().toString(), outDesc.getPrecision().get_type_name(), " ", outDesc.serializeFormat()); + DEBUG_LOG(" inDesc: ", + inDesc.getShape().toString(), + inDesc.getPrecision().get_type_name(), + " ", + inDesc.serializeFormat()); + DEBUG_LOG(" outDesc: ", + outDesc.getShape().toString(), + outDesc.getPrecision().get_type_name(), + " ", + outDesc.serializeFormat()); InsertNode(edge, reorder, true); // Using the method Edge::getDesc() we can check that input and output tensor descriptors are equal. - // Due to the specificity of GraphOptimizer::MergeTransposeAndReorder() that isOptimized flag uses, we shouldn't do these checks. + // Due to the specificity of GraphOptimizer::MergeTransposeAndReorder() that isOptimized flag uses, we shouldn't do + // these checks. 
if (!isOptimized) { reorder->getParentEdgeAt(0)->getDesc(); reorder->getChildEdgeAt(0)->getDesc(); @@ -1692,10 +1791,10 @@ void Graph::EnforceInferencePrecision() { const auto inferPrec = getConfig().inferencePrecision; if (one_of(inferPrec, element::f32, element::undefined, ov::element::f16)) - return; // nothing to do, only precision reduction is currently allowed + return; // nothing to do, only precision reduction is currently allowed #if defined(OPENVINO_ARCH_ARM) || defined(OPENVINO_ARCH_ARM64) if (inferPrec == ov::element::f16) - return; // precision of configured by ov::pass::ConvertPrecision + return; // precision of configured by ov::pass::ConvertPrecision #endif std::function& skipNodes)> searchForNodesToSkip; searchForNodesToSkip = [&](const NodePtr& node, std::unordered_set& skipNodes) -> void { @@ -1703,35 +1802,35 @@ void Graph::EnforceInferencePrecision() { const auto& parent = node->getParentEdgeAt(i)->getParent(); if (inferPrec == ov::element::bf16) { /* list of node types that must be forced to be executed in BF16 precision - * because of performance gains */ + * because of performance gains */ if (one_of(parent->getType(), - Type::Convolution, // conv nets - Type::FullyConnected, // conv / bert nets - Type::RNNCell, // recurent nets - Type::RNNSeq, // recurent nets - Type::MatMul, // bert nets - Type::ROIPooling, // object detection nets - Type::Interpolate, // super resolution nets - Type::PagedAttention, // page attention - Type::QKVProjection, - Type::LLMMLP)) - continue; // stop at significant nodes + Type::Convolution, // conv nets + Type::FullyConnected, // conv / bert nets + Type::RNNCell, // recurent nets + Type::RNNSeq, // recurent nets + Type::MatMul, // bert nets + Type::ROIPooling, // object detection nets + Type::Interpolate, // super resolution nets + Type::PagedAttention, // page attention + Type::QKVProjection, + Type::LLMMLP)) + continue; // stop at significant nodes } else if (inferPrec == ov::element::f16) { /* list of node types that must be forced to be executed in FP16 precision - * because of performance gains */ + * because of performance gains */ if (one_of(parent->getType(), - Type::Convolution, // conv nets - Type::Deconvolution, // deconv - Type::FullyConnected, // conv / bert nets - Type::MatMul, // bert nets - Type::Pooling, - Type::MVN)) - continue; // stop at significant nodes + Type::Convolution, // conv nets + Type::Deconvolution, // deconv + Type::FullyConnected, // conv / bert nets + Type::MatMul, // bert nets + Type::Pooling, + Type::MVN)) + continue; // stop at significant nodes } const auto res = skipNodes.insert(parent); - if (res.second) // node not visited yet + if (res.second) // node not visited yet searchForNodesToSkip(parent, skipNodes); } }; @@ -1772,10 +1871,10 @@ void Graph::EnforceInferencePrecision() { // kvcache of PagedAttention should be written directly if (node->getType() == Type::PagedAttention && (inPort == 3 || inPort == 4)) return true; - const auto &parent = node->getParentEdgeAt(inPort)->getParent(); + const auto& parent = node->getParentEdgeAt(inPort)->getParent(); /* Skip BF16 enforcement for nodes after Constant Inputs for maintaining precision for fusing. 
- * Element type conversion to bf16 is done automatically, if convolution follows up after Constant Inputs - * and activation is bf16 */ + * Element type conversion to bf16 is done automatically, if convolution follows up after Constant + * Inputs and activation is bf16 */ if (parent->getType() == Type::Input && parent->isConstant() && // Concatenation node is exception because it doesn't change an accuracy for BF16 activation node->getType() != Type::Concatenation) @@ -1815,7 +1914,7 @@ void Graph::EnforceInferencePrecision() { // exclude Convert before Range since it may cause precision loss when integter type to LP. // TODO: Incorrect subgraph is generated by ONNX FE + ticket 117861. - const auto &child = node->getChildEdgeAt(i)->getChild(); + const auto& child = node->getChildEdgeAt(i)->getChild(); if (child->getType() == Type::Range && node->getType() == Type::Convert) continue; // skip second output of PagedAttention @@ -1841,9 +1940,24 @@ std::shared_ptr Graph::dump() const { return dump_graph_as_ie_ngraph_net(*this); } -const std::unordered_map& Graph::getInternalStateNodes() const { - return m_context->getMemoryStatesRegister()->getMemoryStates(); +std::vector Graph::memoryStates() const { + std::vector resultVector; + + for (auto&& item : m_context->getMemoryStatesRegister()->getMemoryStates()) { + resultVector.emplace_back(item.second->makeState()); + } + return resultVector; +} + +void Graph::assignStates(const std::vector& states) { + auto&& inputStateNodes = m_context->getMemoryStatesRegister()->getMemoryStates(); + for (const auto& state : states) { + auto itr = inputStateNodes.find(state->get_name()); + if (itr != inputStateNodes.end()) { + itr->second->assignState(state); + } + } } -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/graph.h b/src/plugins/intel_cpu/src/graph.h index d50ccc152c9186..5d5d5b335a36f2 100644 --- a/src/plugins/intel_cpu/src/graph.h +++ b/src/plugins/intel_cpu/src/graph.h @@ -4,22 +4,21 @@ #pragma once +#include +#include +#include +#include + #include "config.h" #include "cpu_memory.h" -#include "nodes/input.h" -#include "openvino/core/node_vector.hpp" -#include "openvino/runtime/profiling_info.hpp" -#include "node.h" #include "edge.h" #include "graph_context.h" #include "memory_control.hpp" +#include "memory_state.h" +#include "node.h" +#include "nodes/input.h" +#include "openvino/core/node_vector.hpp" #include "openvino/runtime/profiling_info.hpp" - -#include -#include -#include -#include - #include "openvino/runtime/so_ptr.hpp" #include "proxy_mem_blk.h" @@ -29,7 +28,7 @@ namespace intel_cpu { class SyncInferRequest; namespace node { class MemoryStateNode; -} // namespace node +} // namespace node class Graph { public: @@ -61,15 +60,15 @@ class Graph { return IsStatic() || IsDynamic(); } - const Config & getConfig() const { + const Config& getConfig() const { return m_context->getConfig(); } - template - void CreateGraph(NET &model, const GraphContext::CPtr context); + template + void CreateGraph(NET& model, const GraphContext::CPtr context); - void CreateGraph(const std::vector &graphNodes, - const std::vector &graphEdges, + void CreateGraph(const std::vector& graphNodes, + const std::vector& graphEdges, const GraphContext::CPtr context, std::string name); @@ -89,28 +88,42 @@ class Graph { return _name; } - std::map& GetInputNodesMap() { - return inputNodesMap; + NodePtr getInputNodeByIndex(std::size_t index) { + auto input = inputNodesMap.find(index); + if (input == 
inputNodesMap.end()) + return nullptr; + return input->second; } - std::map& GetOutputNodesMap() { - return outputNodesMap; + NodePtr getOutputNodeByIndex(std::size_t index) { + auto output = outputNodesMap.find(index); + if (output == outputNodesMap.end()) + return nullptr; + return output->second; } - NodePtr getInputNodeByIndex(const std::size_t &index) { + NodeConstPtr getInputNodeByIndex(std::size_t index) const { auto input = inputNodesMap.find(index); if (input == inputNodesMap.end()) - OPENVINO_THROW("CPU execution graph doesn't contain input node with index: ", index); + return nullptr; return input->second; } - NodePtr getOutputNodeByIndex(const std::size_t &index) { + NodeConstPtr getOutputNodeByIndex(std::size_t index) const { auto output = outputNodesMap.find(index); if (output == outputNodesMap.end()) - OPENVINO_THROW("CPU execution graph doesn't contain output node with index: ", index); + return nullptr; return output->second; } + size_t inputsNumber() const { + return inputNodesMap.size(); + } + + size_t outputsNumber() const { + return outputNodesMap.size(); + } + dnnl::engine getEngine() const { return m_context->getEngine(); } @@ -119,12 +132,12 @@ class Graph { return m_context; } - void GetPerfData(std::vector &perfMap) const; + std::vector memoryStates() const; + void assignStates(const std::vector& state); - void CreateEdge(const NodePtr& parent, - const NodePtr& child, - int parentPort = 0, - int childPort = 0); + void GetPerfData(std::vector& perfMap) const; + + void CreateEdge(const NodePtr& parent, const NodePtr& child, int parentPort = 0, int childPort = 0); void RemoveEdge(const EdgePtr& edge); void RemoveDroppedNodes(); void RemoveDroppedEdges(); @@ -134,9 +147,9 @@ class Graph { /** * @brief Insert Reorder node at the edge-specified location. - * The Reorder node must be inserted in case when there are inplace conflicts or the input and output tensor descriptors do not match. - * The Reorder node rearranges the elements in memory according to inDesc and outDesc, or reinterprets memory descriptor without - * rearrangement of elements if isOptimized is true. + * The Reorder node must be inserted in case when there are inplace conflicts or the input and output tensor + * descriptors do not match. The Reorder node rearranges the elements in memory according to inDesc and outDesc, or + * reinterprets memory descriptor without rearrangement of elements if isOptimized is true. * @param edge * pointer to the edge in the graph where Reorder node will be inserted * @param layerName @@ -153,14 +166,18 @@ class Graph { * pointer to the blob containing scales * @return pointer to the new Reorder node. */ - NodePtr InsertReorder(EdgePtr edge, std::string layerName, const MemoryDesc& inDesc, - const MemoryDesc& outDesc, bool isOptimized = false, const std::vector & src_perm = {}); + NodePtr InsertReorder(EdgePtr edge, + std::string layerName, + const MemoryDesc& inDesc, + const MemoryDesc& outDesc, + bool isOptimized = false, + const std::vector& src_perm = {}); /** * @brief Insert Node at the edge-specified location. - * This method supports two regimes. First, the node is inserted without initialization (i.e. supported descriptors initialization, - * supported primitive descriptors selection, etc.), which can be useful after the ResolveEdgeConflicts() completes. The second is just inserting the - * node without initialization. + * This method supports two regimes. First, the node is inserted without initialization (i.e. 
supported descriptors + * initialization, supported primitive descriptors selection, etc.), which can be useful after the + * ResolveEdgeConflicts() completes. The second is just inserting the node without initialization. * @param edge * pointer to the edge in the graph where the node will be inserted * @param node @@ -173,10 +190,10 @@ class Graph { /** * @brief Insert Node between two specified nodes. - * This procedure creates two edges that link the parent and child nodes to the inserted one and adds all created objects to the graph. - * This method supports two regimes. First, the node is inserted without initialization (i.e. supported descriptors initialization, - * supported primitive descriptors selection, etc.), which can be useful after the ResolveEdgeConflicts() completes. The second is just inserting the - * node without initialization. + * This procedure creates two edges that link the parent and child nodes to the inserted one and adds all created + * objects to the graph. This method supports two regimes. First, the node is inserted without initialization (i.e. + * supported descriptors initialization, supported primitive descriptors selection, etc.), which can be useful after + * the ResolveEdgeConflicts() completes. The second is just inserting the node without initialization. * @param parent * pointer to the parent node * @param child @@ -193,7 +210,9 @@ class Graph { std::shared_ptr dump() const; - void ResetInferCount() { infer_count = 0; } + void ResetInferCount() { + infer_count = 0; + } void SortTopologically(); @@ -201,8 +220,6 @@ class Graph { return graphHasDynamicInput; } - const std::unordered_map& getInternalStateNodes() const; - /** * Init graph using \p model, \p context, \p inputConfigs and \p outputConfigs */ @@ -215,9 +232,9 @@ class Graph { * Activate execution graph using \p externalInputMemory and \p externalOutputMemory */ void Activate(const std::vector& externalInputMemory = {}, - const std::vector& externalOutputMemory = {}); + const std::vector& externalOutputMemory = {}); - const std::unordered_map& getOutputNodesMemBlocksMap() const { + const std::unordered_map& getOutputNodesMemBlocksMap() { return outputNodesMemBlocksMap; } @@ -231,7 +248,7 @@ class Graph { graphEdges.clear(); m_executableSyncNodesInds.clear(); } - Status status { Status::NotReady }; + Status status{Status::NotReady}; // For dumping purposes. 
-1 - no counting, all other positive // values mean increment it within each Infer() call @@ -244,7 +261,7 @@ class Graph { bool graphHasDynamicInput = false; - void Replicate(const std::shared_ptr &subgraph, + void Replicate(const std::shared_ptr& subgraph, const std::vector& inputConfigs = {}, const std::vector& outputConfigs = {}); @@ -281,10 +298,10 @@ class Graph { void ExecuteNode(const NodePtr& node, SyncInferRequest* request = nullptr, int numaId = -1) const; void InferStatic(SyncInferRequest* request, int numaId); - template + template void InferDynamic(SyncInferRequest* request, int numaId, UpdateStrategy&& update); - friend std::shared_ptr dump_graph_as_ie_ngraph_net(const Graph &graph); + friend std::shared_ptr dump_graph_as_ie_ngraph_net(const Graph& graph); private: using event_t = void (Graph::*)(void); diff --git a/src/plugins/intel_cpu/src/graph_context.cpp b/src/plugins/intel_cpu/src/graph_context.cpp index e200766fa4791c..462cdab2a9b5c0 100644 --- a/src/plugins/intel_cpu/src/graph_context.cpp +++ b/src/plugins/intel_cpu/src/graph_context.cpp @@ -1,10 +1,11 @@ // Copyright (C) 2018-2024 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // -#include "dnnl_types.h" #include "graph_context.h" -#include "nodes/memory.hpp" + +#include "dnnl_types.h" #include "memory_control.hpp" +#include "nodes/memory.hpp" namespace ov { namespace intel_cpu { @@ -27,6 +28,7 @@ GraphContext::GraphContext(const Config& config, numNumaNodes = 1; if (streamExecutor) { cpuStreamExecutor = std::dynamic_pointer_cast(streamExecutor); + numaNodeId = cpuStreamExecutor ? cpuStreamExecutor->get_numa_node_id() : 0; auto nNumaNodes = get_num_numa_nodes(); if (numNumaNodes < nNumaNodes) numNumaNodes = nNumaNodes; @@ -41,5 +43,5 @@ const dnnl::engine& GraphContext::getEngine() { return eng; } -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/graph_context.h b/src/plugins/intel_cpu/src/graph_context.h index db2b126213978c..d13872129325b4 100644 --- a/src/plugins/intel_cpu/src/graph_context.h +++ b/src/plugins/intel_cpu/src/graph_context.h @@ -4,11 +4,11 @@ #pragma once -#include "openvino/runtime/threading/cpu_streams_executor.hpp" -#include "sub_memory_manager.hpp" #include "cache/multi_cache.h" #include "config.h" #include "dnnl_scratch_pad.h" +#include "openvino/runtime/threading/cpu_streams_executor.hpp" +#include "sub_memory_manager.hpp" #include "weights_cache.hpp" namespace ov { @@ -16,7 +16,7 @@ namespace intel_cpu { namespace node { class MemoryStatesRegister; -} // namespace node +} // namespace node class NetworkMemoryControl; @@ -39,17 +39,12 @@ class GraphContext { return weightsCache; } - MultiCachePtr getParamsCache() const { return rtParamsCache; } - DnnlScratchPadPtr getScratchPad(int subStreamID = 0) const { - if (subStreamID < 0) - subStreamID = 0; - if (subStreamID >= numNumaNodes - 1) - subStreamID = numNumaNodes - 1; - return rtScratchPads[subStreamID]; + DnnlScratchPadPtr getScratchPad() const { + return rtScratchPads[numaNodeId]; } const std::vector& getScratchPads() const { @@ -85,7 +80,7 @@ class GraphContext { private: Config config; // network-level config - WeightsSharing::Ptr weightsCache; // per NUMA node caches for sharing weights data + WeightsSharing::Ptr weightsCache; // per NUMA node caches for sharing weights data MultiCachePtr rtParamsCache; // primitive cache DnnlScratchPadPtr rtScratchPad; // scratch pad @@ -94,13 +89,14 @@ class GraphContext { std::vector rtScratchPads; // scratch pad 
(each sub-stream has its own copy) - ov::threading::IStreamsExecutor::Ptr streamExecutor; // stream executor for current graph + ov::threading::IStreamsExecutor::Ptr streamExecutor; // stream executor for current graph - ov::threading::CPUStreamsExecutor::Ptr cpuStreamExecutor; // cpu stream executor for current graph + ov::threading::CPUStreamsExecutor::Ptr cpuStreamExecutor; // cpu stream executor for current graph std::shared_ptr subMemoryManager; int numNumaNodes = 1; + int numaNodeId = 0; std::shared_ptr memoryStatesRegister; std::shared_ptr networkMemoryControl; diff --git a/src/plugins/intel_cpu/src/graph_dumper.cpp b/src/plugins/intel_cpu/src/graph_dumper.cpp index 04c15408743c71..ffd58fdb162899 100644 --- a/src/plugins/intel_cpu/src/graph_dumper.cpp +++ b/src/plugins/intel_cpu/src/graph_dumper.cpp @@ -4,28 +4,28 @@ #include "graph_dumper.h" -#include "dnnl_debug.h" -#include "openvino/pass/manager.hpp" -#include "openvino/pass/serialize.hpp" -#include "openvino/runtime/exec_model_info.hpp" -#include "utils/debug_capabilities.h" - #include +#include #include #include #include #include -#include + +#include "dnnl_debug.h" +#include "openvino/pass/manager.hpp" +#include "openvino/pass/serialize.hpp" +#include "openvino/runtime/exec_model_info.hpp" +#include "utils/debug_capabilities.h" namespace ov { namespace intel_cpu { -void serializeToCout(const Graph &graph); -void serializeToXML(const Graph &graph, const std::string& path); +void serializeToCout(const Graph& graph); +void serializeToXML(const Graph& graph, const std::string& path); namespace { -std::map extract_node_metadata(const NodePtr &node) { +std::map extract_node_metadata(const NodePtr& node) { std::map serialization_info; if (node->getType() == Type::Input && node->isConstant()) { @@ -47,7 +47,8 @@ std::map extract_node_metadata(const NodePtr &node) { bool isAllEqual = true; for (size_t i = 1; i < node->getChildEdges().size(); i++) { - if (node->getChildEdgeAt(i - 1)->getMemory().getDesc().getPrecision() != node->getChildEdgeAt(i)->getMemory().getDesc().getPrecision()) { + if (node->getChildEdgeAt(i - 1)->getMemory().getDesc().getPrecision() != + node->getChildEdgeAt(i)->getMemory().getDesc().getPrecision()) { isAllEqual = false; break; } @@ -56,7 +57,8 @@ std::map extract_node_metadata(const NodePtr &node) { // If all output precisions are the same, we store the name only once if (!isAllEqual) { for (size_t i = 1; i < node->getChildEdges().size(); i++) - outputPrecisionsStr += "," + std::string(node->getChildEdgeAt(i)->getMemory().getDesc().getPrecision().get_type_name()); + outputPrecisionsStr += + "," + std::string(node->getChildEdgeAt(i)->getMemory().getDesc().getPrecision().get_type_name()); } } else { // Branch to correctly handle output nodes @@ -107,8 +109,8 @@ std::map extract_node_metadata(const NodePtr &node) { } // namespace -std::shared_ptr dump_graph_as_ie_ngraph_net(const Graph &graph) { - std::map > node2layer; +std::shared_ptr dump_graph_as_ie_ngraph_net(const Graph& graph) { + std::map> node2layer; ov::ResultVector results; ov::ParameterVector params; @@ -117,7 +119,7 @@ std::shared_ptr dump_graph_as_ie_ngraph_net(const Graph &graph) { std::map> paramsMap; std::map> resultsMap; - auto get_inputs = [&] (const NodePtr & node) { + auto get_inputs = [&](const NodePtr& node) { auto pr_edges = node->getParentEdges(); ov::OutputVector inputs(pr_edges.size()); @@ -136,10 +138,10 @@ std::shared_ptr dump_graph_as_ie_ngraph_net(const Graph &graph) { return inputs; }; - auto create_ngraph_node = [&](const NodePtr 
&node) { + auto create_ngraph_node = [&](const NodePtr& node) { bool is_input = false, is_output = false, should_be_hold = false; size_t input_index = -1, output_index = -1; - for (auto && kvp : graph.inputNodesMap) { + for (auto&& kvp : graph.inputNodesMap) { if (kvp.second == node) { is_input = true; input_index = kvp.first; @@ -147,7 +149,7 @@ std::shared_ptr dump_graph_as_ie_ngraph_net(const Graph &graph) { } } - for (auto && kvp : graph.outputNodesMap) { + for (auto&& kvp : graph.outputNodesMap) { if (kvp.second == node) { is_output = true; output_index = kvp.first; @@ -174,7 +176,8 @@ std::shared_ptr dump_graph_as_ie_ngraph_net(const Graph &graph) { return_node = result; } else { return_node = std::make_shared( - get_inputs(node), node->getSelectedPrimitiveDescriptor()->getConfig().outConfs.size()); + get_inputs(node), + node->getSelectedPrimitiveDescriptor()->getConfig().outConfs.size()); for (size_t port = 0; port < return_node->get_output_size(); ++port) { auto& desc = node->getChildEdgeAt(port)->getMemory().getDesc(); @@ -186,7 +189,7 @@ std::shared_ptr dump_graph_as_ie_ngraph_net(const Graph &graph) { to_hold.push_back(return_node); } - for (auto && kvp : meta_data) + for (auto&& kvp : meta_data) return_node->get_rt_info()[kvp.first] = kvp.second; return_node->set_friendly_name(node->getName()); @@ -195,18 +198,18 @@ std::shared_ptr dump_graph_as_ie_ngraph_net(const Graph &graph) { ov::NodeVector nodes; nodes.reserve(graph.graphNodes.size()); - for (auto &node : graph.graphNodes) { // important: graph.graphNodes are in topological order + for (auto& node : graph.graphNodes) { // important: graph.graphNodes are in topological order nodes.emplace_back(create_ngraph_node(node)); node2layer[node] = nodes.back(); } - for (auto && kvp : paramsMap) + for (auto&& kvp : paramsMap) params.push_back(kvp.second); - for (auto && kvp : resultsMap) + for (auto&& kvp : resultsMap) results.push_back(kvp.second); auto holder = !results.empty() ? 
results[0] : std::make_shared(); - for (auto &node : to_hold) { + for (auto& node : to_hold) { holder->add_control_dependency(node); } @@ -214,7 +217,7 @@ std::shared_ptr dump_graph_as_ie_ngraph_net(const Graph &graph) { } #ifdef CPU_DEBUG_CAPS -void serialize(const Graph &graph) { +void serialize(const Graph& graph) { const std::string& path = graph.getConfig().debugCaps.execGraphPath; if (path.empty()) @@ -231,19 +234,17 @@ void serialize(const Graph &graph) { } } -void serializeToXML(const Graph &graph, const std::string& path) { +void serializeToXML(const Graph& graph, const std::string& path) { if (path.empty()) return; std::string binPath; ov::pass::Manager manager; - manager.register_pass(path, - binPath, - ov::pass::Serialize::Version::IR_V10); + manager.register_pass(path, binPath, ov::pass::Serialize::Version::IR_V10); manager.run_passes(graph.dump()); } -void serializeToCout(const Graph &graph) { +void serializeToCout(const Graph& graph) { for (const auto& node : graph.GetNodes()) { std::cout << "name: " << node->getName() << " [ "; auto nodeDesc = node->getSelectedPrimitiveDescriptor(); @@ -251,8 +252,7 @@ void serializeToCout(const Graph &graph) { auto& inConfs = nodeDesc->getConfig().inConfs; if (!inConfs.empty()) { std::cout << "in: " << inConfs.front().getMemDesc()->getPrecision().get_type_name() - << "/l=" << inConfs.front().getMemDesc()->serializeFormat() - << "; "; + << "/l=" << inConfs.front().getMemDesc()->serializeFormat() << "; "; } auto& outConfs = nodeDesc->getConfig().outConfs; if (!outConfs.empty()) { @@ -260,11 +260,11 @@ void serializeToCout(const Graph &graph) { << "/l=" << outConfs.front().getMemDesc()->serializeFormat(); } } - std::cout << " ]" << std::endl; + std::cout << " ]" << std::endl; } } -void summary_perf(const Graph &graph) { +void summary_perf(const Graph& graph) { if (!graph.getGraphContext()) { return; } @@ -277,7 +277,7 @@ void summary_perf(const Graph &graph) { std::map perf_by_node; double total_avg = 0; uint64_t total = 0; - for (auto &node : graph.GetNodes()) { // important: graph.graphNodes are in topological order + for (auto& node : graph.GetNodes()) { // important: graph.graphNodes are in topological order double avg = node->PerfCounter().avg(); auto type = node->getTypeStr() + "_" + node->getPrimitiveDescriptorType(); auto name = node->getName(); @@ -296,59 +296,60 @@ void summary_perf(const Graph &graph) { perf_by_node[node] = avg; } - if (total_avg < 1) return; + if (total_avg < 1) + return; std::cout << "======= ENABLE_DEBUG_CAPS:OV_CPU_SUMMARY_PERF ======" << std::endl; - std::cout << "Summary of " << graph.GetName() << " @" << std::hash{}(reinterpret_cast(&graph)) << std::endl; + std::cout << "Summary of " << graph.GetName() << " @" << std::hash{}(reinterpret_cast(&graph)) + << std::endl; std::cout << " Total(us): " << (uint64_t)(total) << std::endl; std::cout << " Total_avg(us): " << (uint64_t)(total_avg) << std::endl; { std::cout << " perf_by_type:" << std::endl; - std::vector > A; + std::vector> A; for (auto& it : perf_by_type) A.push_back(it); - sort(A.begin(), A.end(), - [](std::pair& a, - std::pair& b){ - return a.second > b.second; - }); + sort(A.begin(), A.end(), [](std::pair& a, std::pair& b) { + return a.second > b.second; + }); for (auto& it : A) { std::stringstream ss; - int percentage = static_cast(it.second*100/total_avg); - if (percentage == 0) break; - ss << std::setw(10) << std::right << percentage << " % : " << std::setw(8) << std::right << it.second << "(us) " << it.first << std::endl; + int percentage = 
static_cast(it.second * 100 / total_avg); + if (percentage == 0) + break; + ss << std::setw(10) << std::right << percentage << " % : " << std::setw(8) << std::right << it.second + << "(us) " << it.first << std::endl; std::cout << ss.str(); } } { std::cout << " perf_by_node:" << std::endl; - std::vector > A; + std::vector> A; for (auto& it : perf_by_node) A.push_back(it); - sort(A.begin(), A.end(), - [](std::pair& a, - std::pair& b){ + sort(A.begin(), A.end(), [](std::pair& a, std::pair& b) { return a.second > b.second; }); for (auto& it : A) { std::stringstream ss; - auto percentage = it.second*100/total_avg; + auto percentage = it.second * 100 / total_avg; auto node = it.first; - if (node->PerfCounter().count() == 0) continue; - if (node->PerfCounter().avg() < 1) continue; + if (node->PerfCounter().count() == 0) + continue; + if (node->PerfCounter().avg() < 1) + continue; ss << std::setw(10) << std::right << std::fixed << std::setprecision(2) << percentage << " % " - << std::setw(8) << std::right << node->PerfCounter().avg() << "(us)x" << node->PerfCounter().count() - << " #" << node->getExecIndex() - << " " << node->getName() - << " " << node->getTypeStr() + "_" + node->getPrimitiveDescriptorType() << std::endl; + << std::setw(8) << std::right << node->PerfCounter().avg() << "(us)x" << node->PerfCounter().count() + << " #" << node->getExecIndex() << " " << node->getName() << " " + << node->getTypeStr() + "_" + node->getPrimitiveDescriptorType() << std::endl; std::cout << ss.str(); } } } -void average_counters(const Graph &graph) { +void average_counters(const Graph& graph) { /** * @todo improve logic for a graph with inner graphs: * - collect counters only for the outer graph if full path is specified @@ -356,11 +357,16 @@ void average_counters(const Graph &graph) { * - _.csv * For example: 0_MyModel.csv */ + + const std::string& path = graph.getConfig().debugCaps.averageCountersPath; + + if (path.empty()) + return; + static int graphIndex = 0; + std::string fileName = path + "_" + std::to_string(graphIndex++) + ".csv"; std::ofstream file; - std::string fileName = graph.getConfig().debugCaps.averageCountersPath + "_" + std::to_string(graphIndex++) + ".csv"; - file.open(fileName); // table structure is identical to the benchmark_app average_counters report @@ -379,18 +385,14 @@ void average_counters(const Graph &graph) { const auto cpuTime = toMs(avg); const auto realTime = cpuTime; - file << node->getName() << ";" - << status << ";" - << node->getTypeStr() << ";" - << node->getPrimitiveDescriptorType() << ";" - << realTime << ";" - << cpuTime << ";" - << "\n"; + file << node->getName() << ";" << status << ";" << node->getTypeStr() << ";" + << node->getPrimitiveDescriptorType() << ";" << realTime << ";" << cpuTime << ";" + << "\n"; return avg; }; - for (auto &node : graph.GetNodes()) { + for (auto& node : graph.GetNodes()) { if (node->isConstant()) continue; @@ -399,11 +401,12 @@ void average_counters(const Graph &graph) { const auto totalMs = toMs(total); - file << "Total;;;;" << totalMs << ";" << totalMs << ";" << "\n"; + file << "Total;;;;" << totalMs << ";" << totalMs << ";" + << "\n"; file.close(); } #endif -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/graph_dumper.h b/src/plugins/intel_cpu/src/graph_dumper.h index 417db7e4c3cdc5..40af2fd44c61e6 100644 --- a/src/plugins/intel_cpu/src/graph_dumper.h +++ b/src/plugins/intel_cpu/src/graph_dumper.h @@ -4,19 +4,19 @@ #pragma once -#include "graph.h" - 
#include +#include "graph.h" + namespace ov { namespace intel_cpu { -std::shared_ptr dump_graph_as_ie_ngraph_net(const Graph &graph); +std::shared_ptr dump_graph_as_ie_ngraph_net(const Graph& graph); #ifdef CPU_DEBUG_CAPS -void serialize(const Graph &graph); -void summary_perf(const Graph &graph); -void average_counters(const Graph &graph); -#endif // CPU_DEBUG_CAPS +void serialize(const Graph& graph); +void summary_perf(const Graph& graph); +void average_counters(const Graph& graph); +#endif // CPU_DEBUG_CAPS -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/graph_optimizer.cpp b/src/plugins/intel_cpu/src/graph_optimizer.cpp index 61590b8691f4b2..fe0df309dc32f1 100644 --- a/src/plugins/intel_cpu/src/graph_optimizer.cpp +++ b/src/plugins/intel_cpu/src/graph_optimizer.cpp @@ -4,6 +4,7 @@ #include "graph_optimizer.h" +#include "cpu_types.h" #include "dnnl_extension_utils.h" #include "nodes/bin_conv.h" #include "nodes/common/cpu_convert.h" @@ -22,28 +23,26 @@ #include "nodes/transpose.h" #include "onednn/dnnl.h" #include "openvino/opsets/opset1.hpp" -#include "cpu_types.h" #include "utils/cpu_utils.hpp" #include "utils/debug_capabilities.h" #include "utils/general_utils.h" // WA for xbyak.h #ifdef _WIN32 -# ifndef _WINSOCKAPI_ -# define _WINSOCKAPI_ -# endif -# ifndef _WINSOCK2API_ -# define _WINSOCK2API_ -#endif +# ifndef _WINSOCKAPI_ +# define _WINSOCKAPI_ +# endif +# ifndef _WINSOCK2API_ +# define _WINSOCK2API_ +# endif #endif -#include "cpu/x64/cpu_isa_traits.hpp" - -#include +#include #include #include #include -#include +#include +#include "cpu/x64/cpu_isa_traits.hpp" #include "itt.h" #include "memory_desc/cpu_memory_desc_utils.h" @@ -55,11 +54,15 @@ namespace intel_cpu { GraphOptimizer::GraphOptimizer() {} -void GraphOptimizer::ApplyCommonGraphOptimizations(Graph &graph) { +void GraphOptimizer::ApplyCommonGraphOptimizations(Graph& graph) { // For conv with input zp, canBeExecutedInInt8() check has dependency on input zero point check. - // Also zero point node is the input of computing-intensive nodes. Most others fusing are the output of computing-intensive nodes. - // So Locate the FuseConvolutionAndZeroPoints() as the first optimization. - OV_ITT_SCOPE_CHAIN(FIRST_INFERENCE, taskChain, itt::domains::intel_cpu_LT, "ApplyCommonGraphOptimizations", "FuseConvolutionAndZeroPoints"); + // Also zero point node is the input of computing-intensive nodes. Most others fusing are the output of + // computing-intensive nodes. So Locate the FuseConvolutionAndZeroPoints() as the first optimization. 
+ OV_ITT_SCOPE_CHAIN(FIRST_INFERENCE, + taskChain, + itt::domains::intel_cpu_LT, + "ApplyCommonGraphOptimizations", + "FuseConvolutionAndZeroPoints"); FuseConvolutionAndZeroPoints(graph); graph.RemoveDroppedNodes(); @@ -67,10 +70,6 @@ void GraphOptimizer::ApplyCommonGraphOptimizations(Graph &graph) { FuseConvMatmulFCDeconvAndDQScales(graph); graph.RemoveDroppedNodes(); - OV_ITT_SCOPE_NEXT(FIRST_INFERENCE, taskChain, "FuseFCAndWeightsDecompression"); - FuseFCAndWeightsDecompression(graph); - graph.RemoveDroppedNodes(); - OV_ITT_SCOPE_NEXT(FIRST_INFERENCE, taskChain, "FuseConvolutionAndBias"); FuseConvolutionMatMulDeconvAndBias(graph); graph.RemoveDroppedNodes(); @@ -191,7 +190,7 @@ void GraphOptimizer::ApplyCommonGraphOptimizations(Graph &graph) { graph.RemoveDroppedEdges(); } -void GraphOptimizer::ApplyImplSpecificGraphOptimizations(Graph &graph) { +void GraphOptimizer::ApplyImplSpecificGraphOptimizations(Graph& graph) { OV_ITT_SCOPE(FIRST_INFERENCE, itt::domains::intel_cpu_LT, "GraphOptimizer::ApplyImplSpecificGraphOptimizations"); DropDoubleReorders(graph); @@ -206,7 +205,7 @@ void GraphOptimizer::ApplyImplSpecificGraphOptimizations(Graph &graph) { graph.RemoveDroppedEdges(); } -void GraphOptimizer::FuseConvMatmulFCDeconvAndDQScales(Graph &graph) { +void GraphOptimizer::FuseConvMatmulFCDeconvAndDQScales(Graph& graph) { auto& graphNodes = graph.GetNodes(); auto isDQScaleGraphPattern = [](NodePtr node) { @@ -215,14 +214,12 @@ void GraphOptimizer::FuseConvMatmulFCDeconvAndDQScales(Graph &graph) { } auto parentNode = node->getParentEdgeAt(0)->getParent(); auto scaleNode = node->getParentEdgeAt(1)->getParent(); - if (!(parentNode->getType() == Type::Convolution - || parentNode->getType() == Type::MatMul - || parentNode->getType() == Type::Deconvolution - || parentNode->getType() == Type::FullyConnected)) + if (!(parentNode->getType() == Type::Convolution || parentNode->getType() == Type::MatMul || + parentNode->getType() == Type::Deconvolution)) return false; if (!scaleNode->isConstant()) return false; - //Only Fusing scales for INT8 precision. + // Only Fusing scales for INT8 precision. 
if (!parentNode->canBeExecutedInInt8()) return false; return (parentNode->getParentEdges().size() == 2); @@ -238,8 +235,7 @@ void GraphOptimizer::FuseConvMatmulFCDeconvAndDQScales(Graph &graph) { if (!node->getFusedWith().empty() || !scales->getFusedWith().empty()) return false; - const auto scalesDims = getNormalizedDimsBySize(scales->getOutputShapeAtPort(0).getDims(), - nodeOutDims.size()); + const auto scalesDims = getNormalizedDimsBySize(scales->getOutputShapeAtPort(0).getDims(), nodeOutDims.size()); if (nodeOutDims.size() != scalesDims.size() || scalesDims.size() < 2) return false; @@ -266,7 +262,7 @@ void GraphOptimizer::FuseConvMatmulFCDeconvAndDQScales(Graph &graph) { if (scalesData == nullptr) OPENVINO_THROW("scalesBlob has not allocated buffer"); auto scalesDims = getNormalizedDimsBySize(scales->getOutputShapeAtPort(0).getDims(), - node->getOutputShapeAtPort(0).getDims().size()); + node->getOutputShapeAtPort(0).getDims().size()); auto scaleSize = std::accumulate(scalesDims.begin(), scalesDims.end(), 1, std::multiplies()); node->fuseDQScales(scalesData, scaleSize); return true; @@ -274,16 +270,21 @@ void GraphOptimizer::FuseConvMatmulFCDeconvAndDQScales(Graph &graph) { for (size_t i = 0; i < graphNodes.size(); i++) { auto mul = graphNodes[i]; - if (!isDQScaleGraphPattern(mul)) continue; + if (!isDQScaleGraphPattern(mul)) + continue; CPU_GRAPH_OPTIMIZER_SCOPE(FuseConvMatmulFCDeconvAndDQScales); auto node = mul->getParentEdgeAt(0)->getParent(); auto scales = mul->getParentEdgeAt(1)->getParent(); - if (!scaleDimsCheck(node, scales)) continue; + if (!scaleDimsCheck(node, scales)) + continue; if (initializeDeQuantizedScales(node, scales)) { - DEBUG_LOG("GraphOptimizer##FusingDQ: Node ##", mul->getName(), " optimized as DQ scales of Node ##", node->getName()); + DEBUG_LOG("GraphOptimizer##FusingDQ: Node ##", + mul->getName(), + " optimized as DQ scales of Node ##", + node->getName()); node->addOriginalLayer(mul->getOriginalLayers()); auto p_edge = mul->getParentEdgeAt(1); graph.RemoveEdge(p_edge); @@ -292,258 +293,7 @@ void GraphOptimizer::FuseConvMatmulFCDeconvAndDQScales(Graph &graph) { } } -void GraphOptimizer::FuseFCAndWeightsDecompression(Graph &graph) { - std::set supportedWeightsPrecisions{ - ov::element::u8, ov::element::i8, ov::element::nf4, ov::element::u4, ov::element::i4, ov::element::f4e2m1}; - const std::set supportedDataPrecisions{ov::element::f32, ov::element::bf16}; - auto expectedNode = [](NodePtr node, Type expectedType) { - return node->getType() == expectedType && node->getChildEdges().size() == 1; - }; - -#define SKIP_FUSION_FOR_NODE(node) \ - DEBUG_LOG("FuseFCAndWeightsDecompression can't be applied for node ", node->getName()); \ - continue - - if (!impl::cpu::x64::mayiuse(impl::cpu::x64::avx2)) - return; - - auto& graphNodes = graph.GetNodes(); - for (size_t i = 0; i < graphNodes.size(); i++) { - const auto fcNode = std::dynamic_pointer_cast(graphNodes[i]); - if (fcNode == nullptr) - continue; - - auto parent = fcNode->getParentEdgeAt(1)->getParent(); - const bool withTranspose = parent->getType() == Type::Transpose; - const NodePtr transposeNode = withTranspose ? parent : nullptr; - if (transposeNode) - parent = transposeNode->getParentEdgeAt(0)->getParent(); - // Compressed weights can be shared between several FC layers - const bool is_shared_decompression = parent->getChildEdges().size() > 1; - - const bool withReshape = parent->getType() == Type::Reshape; - const auto reshapeNode = withReshape ? 
parent : nullptr; - if (reshapeNode) { - parent = reshapeNode->getParentEdgeAt(0)->getParent(); - } - - const auto multiplyNode = parent; - if (multiplyNode->getType() != Type::Eltwise || multiplyNode->getAlgorithm() != Algorithm::EltwiseMultiply || - !multiplyNode->isConstant()) { - SKIP_FUSION_FOR_NODE(fcNode); - } - - CPU_GRAPH_OPTIMIZER_SCOPE(FuseFCAndWeightsDecompression); - const auto mulParent1 = multiplyNode->getParentEdgeAt(1)->getParent(); - NodePtr multiplyParent, multiplyConvertNode, multiplyConstNode; - multiplyParent = mulParent1; - if (multiplyParent->getType() == Type::Convert) { - multiplyConvertNode = multiplyParent; - multiplyParent = multiplyConvertNode->getParentEdgeAt(0)->getParent(); - } - multiplyConstNode = multiplyParent; - if (multiplyConstNode->getType() != Type::Input) { - SKIP_FUSION_FOR_NODE(fcNode); - } - const bool withMultiplyConvert = multiplyConvertNode != nullptr; - - const auto mulParent0 = multiplyNode->getParentEdgeAt(0)->getParent(); - const bool withSubtract = mulParent0->getAlgorithm() == Algorithm::EltwiseSubtract; - NodePtr subtractNode, subtractConvertNode, subtractConstNode; - if (withSubtract) { - subtractNode = mulParent0; - if (!expectedNode(subtractNode, Type::Eltwise)) { - SKIP_FUSION_FOR_NODE(fcNode); - } - auto subtractParent = subtractNode->getParentEdgeAt(1)->getParent(); - if (subtractParent->getType() == Type::Convert) { - subtractConvertNode = subtractParent; - subtractParent = subtractConvertNode->getParentEdgeAt(0)->getParent(); - } - subtractConstNode = subtractParent; - if (subtractConstNode->getType() != Type::Input) { - SKIP_FUSION_FOR_NODE(fcNode); - } - } - - const bool withSubtractConvert = subtractConvertNode != nullptr; - const auto convertNode = withSubtract ? subtractNode->getParentEdgeAt(0)->getParent() : mulParent0; - if (!expectedNode(convertNode, Type::Convert)) { - SKIP_FUSION_FOR_NODE(fcNode); - } - const auto weightsNode = convertNode->getParentEdgeAt(0)->getParent(); - if (weightsNode->getType() != Type::Input) { - SKIP_FUSION_FOR_NODE(fcNode); - } - - // Precision limitations - if (supportedDataPrecisions.find(fcNode->getOriginalInputPrecisionAtPort(0)) == supportedDataPrecisions.end()) { - SKIP_FUSION_FOR_NODE(fcNode); - } - if (supportedWeightsPrecisions.find(weightsNode->getOriginalOutputPrecisionAtPort(0)) == supportedWeightsPrecisions.end()) { - SKIP_FUSION_FOR_NODE(fcNode); - } - if (withSubtract && - !one_of(subtractConstNode->getOriginalOutputPrecisionAtPort(0), weightsNode->getOriginalOutputPrecisionAtPort(0), ov::element::f32)) { - SKIP_FUSION_FOR_NODE(fcNode); - } - - // Shape limitations - const auto weightsShape = weightsNode->getOutputShapeAtPort(0); - if (weightsShape != multiplyNode->getOutputShapeAtPort(0)) { - SKIP_FUSION_FOR_NODE(fcNode); - } - if (reshapeNode && (reshapeNode->getInputShapeAtPort(0).getRank() != 3 || reshapeNode->getOutputShapeAtPort(0).getRank() != 2)) { - SKIP_FUSION_FOR_NODE(fcNode); - } - - VectorDims decompressionConstShape; - const auto fcInputWeightsShape = fcNode->getInputShapeAtPort(1); - int groupNum = 1; - // Ordinary case: one decompression group - if (fcInputWeightsShape.getRank() == weightsShape.getRank()) { - const auto& out_channels = fcInputWeightsShape.getDims()[0]; - decompressionConstShape = withTranspose ? 
VectorDims{1, out_channels} : VectorDims{out_channels, 1}; - } else { - // Group decompression case: last 3 dimension (there could be also prepending '1's in the beginning) of weights shape must be: - // [N, G, O], if transpose = true - // [O, N, G], otherwise. - // O - output channels - // N - number of groups - // G - group size - const auto& weights_dims = weightsShape.getStaticDims(); - const auto& N = withTranspose ? *(weights_dims.rbegin() + 2) : *(weights_dims.rbegin() + 1); - const auto& O = withTranspose ? *weights_dims.rbegin() : *(weights_dims.rbegin() + 2); - // Group decompression is applied by O and N dims - decompressionConstShape = withTranspose ? VectorDims{N, 1, O} : VectorDims{O, N, 1}; - groupNum = N; - } - - auto check_decompression_shape = [&decompressionConstShape](const VectorDims& shape_to_check) { - if (shape_to_check.size() > decompressionConstShape.size()) - return false; - if (std::all_of(shape_to_check.begin(), shape_to_check.end(), [](Dim x) { return x == 1; })) - return true; - const auto comparison_start_pos = decompressionConstShape.size() - shape_to_check.size(); - // in case of different ranks shapes are compared taking into account ranks numpy broadcasting - return std::equal(shape_to_check.begin(), shape_to_check.end(), decompressionConstShape.begin() + comparison_start_pos); - }; - if (!check_decompression_shape(multiplyConstNode->getOutputShapeAtPort(0).getDims())) { - SKIP_FUSION_FOR_NODE(fcNode); - } - if (withSubtract && !check_decompression_shape(subtractConstNode->getOutputShapeAtPort(0).getDims())) { - SKIP_FUSION_FOR_NODE(fcNode); - } - - const size_t OC = fcInputWeightsShape.getDims()[0]; - const size_t IC = fcInputWeightsShape.getDims()[1]; - // HW specific shape limitations - if (impl::cpu::x64::mayiuse(impl::cpu::x64::avx512_core_amx) && - fcNode->getOriginalInputPrecisionAtPort(0) == ov::element::bf16) { - // OneDNN AMX IP implementation has limited shapes support due to performance considerations. As a current solution conditions below are copied - // from OneDNN to make sure correct IP impl will be used since fallback one doesn't support weights decompression feature. 
- size_t simdWidth = 16; - size_t vnniFactor = 2; - size_t maxSize = 512; - auto amxRow = vnniFactor * simdWidth; - - if ((IC <= amxRow && OC <= amxRow) || (IC <= maxSize && OC <= maxSize && IC % amxRow != 0)) { - SKIP_FUSION_FOR_NODE(fcNode); - } - } - - // OneDNN IP primitive provides limited decompression params support - if (IC % groupNum != 0 || IC / groupNum < 4 || OC == 1) { - SKIP_FUSION_FOR_NODE(fcNode); - } - - // Fusion processing - auto *multiplyInputNode = dynamic_cast(multiplyConstNode.get()); - OPENVINO_ASSERT(multiplyInputNode, "Cannot cast ", multiplyConstNode->getName(), " to Input node."); - fcNode->fuseDecompressionMultiply(multiplyInputNode->getMemoryPtr()); - - if (withSubtract) { - auto *subtractInputNode = dynamic_cast(subtractConstNode.get()); - OPENVINO_ASSERT(multiplyInputNode, "Cannot cast ", subtractConstNode->getName(), " to Input node."); - fcNode->fuseDecompressionSubtract(subtractInputNode->getMemoryPtr()); - } - - fcNode->addOriginalLayer(multiplyNode->getOriginalLayers()); - fcNode->addOriginalLayer(convertNode->getOriginalLayers()); - if (withSubtract) - fcNode->addOriginalLayer(subtractNode->getOriginalLayers()); - if (withSubtractConvert) - fcNode->addOriginalLayer(subtractConvertNode->getOriginalLayers()); - if (withMultiplyConvert) - fcNode->addOriginalLayer(multiplyConvertNode->getOriginalLayers()); - - const auto& weightsPrecision = weightsNode->getOriginalOutputPrecisionAtPort(0); - if (withTranspose) { - transposeNode->setOriginalInputPrecisionAtPort(0, weightsPrecision); - transposeNode->setOriginalOutputPrecisionAtPort(0, weightsPrecision); - } - if (withReshape) { - reshapeNode->setOriginalInputPrecisionAtPort(0, weightsPrecision); - reshapeNode->setOriginalOutputPrecisionAtPort(0, weightsPrecision); - } - fcNode->setOriginalInputPrecisionAtPort(1, weightsPrecision); - - // If decompression subgraph is shared with other nodes, it mustn't be removed. - // In this case, the current FC is reconnected to the weights - if (is_shared_decompression) { - const auto weights_out_edge = weightsNode->getChildEdges()[0].lock(); - const auto fc_weights_path_edge = withTranspose ? transposeNode->getParentEdgeAt(0) - : fcNode->getParentEdgeAt(1); - const auto inNum = weights_out_edge->getInputNum(); - const auto outNum = fc_weights_path_edge->getOutputNum(); - graph.RemoveEdge(fc_weights_path_edge); - // In case of shared group decompression, Reshape node has to be copied for the current FC - if (withReshape) { - const auto& reshapeOutShape = reshapeNode->getOutputShapeAtPort(0).getStaticDims(); - auto reshapeConst = std::make_shared(ov::element::i32, - ov::Shape{reshapeOutShape.size()}, - reshapeOutShape); - auto reshapeDummyInput = std::make_shared(reshapeNode->getOriginalInputPrecisionAtPort(0), - reshapeNode->getInputShapeAtPort(0).toPartialShape()); - const auto reshape = std::make_shared(reshapeDummyInput, reshapeConst, false); - reshape->set_friendly_name(reshapeNode->getName() + "_copy"); - const auto cpuReshape = std::make_shared(reshape, graph.getGraphContext()); - graph.InsertNode(weightsNode, withTranspose ? transposeNode : fcNode, cpuReshape, inNum, outNum, false); - const auto cpuReshapeConst = std::make_shared(reshapeConst, graph.getGraphContext()); - graph.AddNode(cpuReshapeConst); - graph.CreateEdge(cpuReshapeConst, cpuReshape, 0, 1); - } else { - graph.CreateEdge(weightsNode, withTranspose ? 
transposeNode : fcNode, inNum, outNum); - } - } else { - // If decompression subgraph is not shared with other nodes, it can be removed - if (withSubtract) - graph.RemoveEdge(subtractNode->getParentEdgeAt(1)); - if (withSubtractConvert) { - // SubtractConvert is removed only if there are no other consumers (e.g. CompressedGather) - const auto& restChilds = subtractConvertNode->getChildEdges(); - if (restChilds.empty()) - graph.RemoveEdge(subtractConvertNode->getParentEdgeAt(0)); - } - graph.RemoveEdge(multiplyNode->getParentEdgeAt(1)); - if (withMultiplyConvert) { - // MultiplyConvert is removed only if there are no other consumers (e.g. CompressedGather) - const auto& restChilds = multiplyConvertNode->getChildEdges(); - if (restChilds.empty()) - graph.RemoveEdge(multiplyConvertNode->getParentEdgeAt(0)); - } - - graph.DropNode(convertNode); - if (withSubtract) - graph.DropNode(subtractNode); - graph.DropNode(multiplyNode); - } - DEBUG_LOG("FuseFCAndWeightsDecompression finished for node ", fcNode->getName()); - } -#undef SKIP_FUSION_FOR_NODE -} - -void GraphOptimizer::FuseConvolutionMatMulDeconvAndBias(Graph &graph) { +void GraphOptimizer::FuseConvolutionMatMulDeconvAndBias(Graph& graph) { auto& graphNodes = graph.GetNodes(); auto isSuitableParentNode = [](const NodePtr& node) { @@ -556,16 +306,14 @@ void GraphOptimizer::FuseConvolutionMatMulDeconvAndBias(Graph &graph) { return false; if (!deconv) - return (one_of(node->getType(), Type::Convolution, Type::MatMul, Type::FullyConnected) && - node->getParentEdges().size() == 2); + return (one_of(node->getType(), Type::Convolution, Type::MatMul) && node->getParentEdges().size() == 2); else return deconv->canFuseBias(); }; auto isSuitableChildNode = [&](const NodePtr& parentNode, const NodePtr& childNode) { - if (childNode->getAlgorithm() != Algorithm::EltwiseAdd - || !childNode->getFusedWith().empty() - || childNode->getParentEdges().size() != 2) + if (childNode->getAlgorithm() != Algorithm::EltwiseAdd || !childNode->getFusedWith().empty() || + childNode->getParentEdges().size() != 2) return false; auto biasPort = childNode->getParentEdgeAt(0)->getParent() == parentNode ? 1 : 0; @@ -574,10 +322,11 @@ void GraphOptimizer::FuseConvolutionMatMulDeconvAndBias(Graph &graph) { return false; const auto parentOutDims = parentNode->getOutputShapeAtPort(0).getDims(); - const auto biasDims = getNormalizedDimsBySize(biasNode->getOutputShapeAtPort(0).getDims(), - parentOutDims.size()); - // TODO [NM]: Legacy ConvBias fusion transformation supports both per-tensor (via explicit broadcasing) and per-channel cases. - // Most of the real models contain per-channel bias, so we need to reavaluate the need to support per-tensor variant. + const auto biasDims = + getNormalizedDimsBySize(biasNode->getOutputShapeAtPort(0).getDims(), parentOutDims.size()); + // TODO [NM]: Legacy ConvBias fusion transformation supports both per-tensor (via explicit broadcasing) and + // per-channel cases. Most of the real models contain per-channel bias, so we need to reavaluate the need to + // support per-tensor variant. 
if (parentOutDims.size() != biasDims.size() || biasDims.size() < 2) return false; @@ -613,9 +362,11 @@ void GraphOptimizer::FuseConvolutionMatMulDeconvAndBias(Graph &graph) { for (size_t i = 0; i < parents.size(); i++) { auto p_edge = parents[i].lock(); - if (!p_edge) continue; + if (!p_edge) + continue; auto parent = p_edge->getParent(); - if (!parent) continue; + if (!parent) + continue; if (parent == parentNode) { for (size_t j = 0; j < childs.size(); j++) { @@ -625,7 +376,7 @@ void GraphOptimizer::FuseConvolutionMatMulDeconvAndBias(Graph &graph) { if (!child) continue; - EdgePtr &remEdge = p_edge; + EdgePtr& remEdge = p_edge; int inNum = 0; if (remEdge) { inNum = remEdge->getInputNum(); @@ -640,7 +391,7 @@ void GraphOptimizer::FuseConvolutionMatMulDeconvAndBias(Graph &graph) { graph.CreateEdge(parent, child, inNum, outNum); } } else { - EdgePtr &remEdge = p_edge; + EdgePtr& remEdge = p_edge; int inNum = 0; if (remEdge) { inNum = remEdge->getInputNum(); @@ -654,48 +405,57 @@ void GraphOptimizer::FuseConvolutionMatMulDeconvAndBias(Graph &graph) { // ONEDNN Conv, Deconv, FC would need the bias to be flatten into 1D tensor. // Usually the bias output shape would be normalized to align rank with Conv/Deconv/FC output. // To avoid duplicate reshape WA code in nodes, here we flatten the shape. - // Most bias nodes are const Input and bias memory primitive has been initialized as const memory when constructing CPU Input node. - // Const memory is not allowed to be modified after initialized. It means we can't redefine const bias memory primitive. - // So let's insert a reshape node to flatten the bias shape into 1D and const folding node will be executed during the compiling stage. - const bool needReshape = (targetNode->getType() != Type::MatMul && - biasOutputShape.getRank() != 1); + // Most bias nodes are const Input and bias memory primitive has been initialized as const memory when + // constructing CPU Input node. Const memory is not allowed to be modified after initialized. It means + // we can't redefine const bias memory primitive. So let's insert a reshape node to flatten the bias + // shape into 1D and const folding node will be executed during the compiling stage. + const bool needReshape = (targetNode->getType() != Type::MatMul && biasOutputShape.getRank() != 1); if (needReshape) { // Bias -> Reshape -> Conv/Deconv/FC const VectorDims flattenShape = {biasOutputShape.getElementsCount()}; // Construct Ngraph Reshape node and CPU Reshape node. - auto reshapeConstInput = std::make_shared(ov::element::i32, ov::Shape{1}, flattenShape); - auto reshapeDummyInput = std::make_shared( - biasNode->getOriginalOutputPrecisionAtPort(0), - biasOutputShape.toPartialShape()); - const auto reshape = std::make_shared(reshapeDummyInput, reshapeConstInput, false); + auto reshapeConstInput = + std::make_shared(ov::element::i32, ov::Shape{1}, flattenShape); + auto reshapeDummyInput = + std::make_shared(biasNode->getOriginalOutputPrecisionAtPort(0), + biasOutputShape.toPartialShape()); + const auto reshape = + std::make_shared(reshapeDummyInput, reshapeConstInput, false); reshape->set_friendly_name(biasNode->getName() + "_flatten_reshape"); - const auto cpuReshapeNode = std::make_shared(reshape, graph.getGraphContext()); + const auto cpuReshapeNode = + std::make_shared(reshape, graph.getGraphContext()); // Insert Reshape between bias node and Conv/Deconv/FC graph.InsertNode(biasNode, targetNode, cpuReshapeNode, inNum, outNum, false); // Insert the Reshape const input node and edge into CPU graph. 
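// [Editor's note, illustrative only — not part of this diff] Example of the flattening
// performed by the Reshape inserted above, assuming a 4D convolution output {N, C, H, W}
// whose bias has already been normalized to the output rank:
//   bias (normalized):  {1, C, 1, 1}
//   flattenShape:       {biasOutputShape.getElementsCount()} == {C}
// i.e. the bias becomes the 1D tensor that the oneDNN Conv/Deconv/FC primitives expect,
// while the original const bias memory itself is left untouched.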
- const auto cpuReshapeConstInput = std::make_shared(reshapeConstInput, graph.getGraphContext()); + const auto cpuReshapeConstInput = + std::make_shared(reshapeConstInput, graph.getGraphContext()); graph.AddNode(cpuReshapeConstInput); graph.CreateEdge(cpuReshapeConstInput, cpuReshapeNode, 0, 1); - DEBUG_LOG("GraphOptimizer##FusingBias:Flatten Bias node from shape ", PartialShape{biasOutputShape.getDims()}, - " to ", PartialShape{flattenShape}); + DEBUG_LOG("GraphOptimizer##FusingBias:Flatten Bias node from shape ", + PartialShape{biasOutputShape.getDims()}, + " to ", + PartialShape{flattenShape}); // Update bias output shape to be flatten shape. biasOutputShape = Shape{flattenShape}; } else { // Bias is connected as input edge. graph.CreateEdge(biasNode, targetNode, inNum, outNum); } - //Add the Bias inputshape into conv/FC/Deconv/Matmul. + // Add the Bias inputshape into conv/FC/Deconv/Matmul. targetNode->inputShapes.push_back(biasOutputShape); } } - DEBUG_LOG("GraphOptimizer##FusingBias:Node ##: ", childNode->getName(), " initialize as Bias of Node ##", parentNode->getName()); + DEBUG_LOG("GraphOptimizer##FusingBias:Node ##: ", + childNode->getName(), + " initialize as Bias of Node ##", + parentNode->getName()); parentNode->addOriginalLayer(childNode->getOriginalLayers()); parentNode->addOriginalInputPrecision(childNode->getOriginalInputPrecisionAtPort(biasPort)); graph.DropNode(childNode); } } -void GraphOptimizer::FuseDeconvolutionAndSimpleOperation(Graph &graph) { +void GraphOptimizer::FuseDeconvolutionAndSimpleOperation(Graph& graph) { auto& graphNodes = graph.GetNodes(); auto isSuitableParentNode = [](NodePtr node) { @@ -741,7 +501,7 @@ void GraphOptimizer::FuseDeconvolutionAndSimpleOperation(Graph &graph) { childNode->fuseInto(parentNode); auto parentEdges = childNode->parentEdges; - for (auto &parentEdge : parentEdges) { + for (auto& parentEdge : parentEdges) { auto p_edge = parentEdge.lock(); if (p_edge->getParent()->getType() == Type::Deconvolution) continue; @@ -753,7 +513,7 @@ void GraphOptimizer::FuseDeconvolutionAndSimpleOperation(Graph &graph) { } } -void GraphOptimizer::FuseMultiplyAndAdd(Graph &graph) { +void GraphOptimizer::FuseMultiplyAndAdd(Graph& graph) { auto& graphNodes = graph.GetNodes(); auto isSuitableSecondInput = [](const NodePtr& node, VectorDims dataDims) { @@ -765,9 +525,9 @@ void GraphOptimizer::FuseMultiplyAndAdd(Graph &graph) { auto getChannelAxis = [](const VectorDims& dims) { auto channelAxis = -1; - for (size_t i = 0; i < dims.size(); i ++) { + for (size_t i = 0; i < dims.size(); i++) { if (dims[i] != 1) { - if (channelAxis != -1) // more than one axis is != 1 + if (channelAxis != -1) // more than one axis is != 1 return -1; else channelAxis = i; @@ -795,11 +555,13 @@ void GraphOptimizer::FuseMultiplyAndAdd(Graph &graph) { }; auto isSuitableChildNode = [&](const NodePtr& parentNode, const NodePtr& childNode) { - if (childNode->getAlgorithm() != Algorithm::EltwiseAdd || !childNode->getFusedWith().empty() || childNode->getParentEdges().size() != 2) + if (childNode->getAlgorithm() != Algorithm::EltwiseAdd || !childNode->getFusedWith().empty() || + childNode->getParentEdges().size() != 2) return false; - return isSuitableSecondInput(childNode->getParentEdgeAt(1)->getParent(), childNode->getInputShapeAtPort(0).getDims()) && - parentNode->canFuse(childNode); + return isSuitableSecondInput(childNode->getParentEdgeAt(1)->getParent(), + childNode->getInputShapeAtPort(0).getDims()) && + parentNode->canFuse(childNode); }; auto parent = graphNodes.begin(); @@ 
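The getChannelAxis lambda reformatted above only treats the second input as per-channel when at most one of its dimensions differs from 1. The same test as a free function; the name is illustrative, and how an all-ones shape is ultimately handled lies outside this hunk.

#include <cstddef>
#include <cstdint>
#include <vector>

// Index of the single dimension that differs from 1, or -1 when two or more
// dimensions are non-unit (the constant is then not per-channel).
int64_t single_nontrivial_axis(const std::vector<size_t>& dims) {
    int64_t axis = -1;
    for (size_t i = 0; i < dims.size(); ++i) {
        if (dims[i] != 1) {
            if (axis != -1)
                return -1;  // a second non-unit dimension disqualifies the shape
            axis = static_cast<int64_t>(i);
        }
    }
    return axis;
}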
-825,9 +587,11 @@ void GraphOptimizer::FuseMultiplyAndAdd(Graph &graph) { for (size_t i = 0; i < parents.size(); i++) { auto p_edge = parents[i].lock(); - if (!p_edge) continue; + if (!p_edge) + continue; auto parent = p_edge->getParent(); - if (!parent) continue; + if (!parent) + continue; if (parent == parentNode) { for (size_t j = 0; j < childs.size(); j++) { @@ -837,7 +601,7 @@ void GraphOptimizer::FuseMultiplyAndAdd(Graph &graph) { if (!child) continue; - EdgePtr &remEdge = p_edge; + EdgePtr& remEdge = p_edge; int inNum = 0; if (remEdge) { inNum = remEdge->getInputNum(); @@ -852,7 +616,7 @@ void GraphOptimizer::FuseMultiplyAndAdd(Graph &graph) { graph.CreateEdge(parent, child, inNum, outNum); } } else { - EdgePtr &remEdge = p_edge; + EdgePtr& remEdge = p_edge; int inNum = 0; if (remEdge) { inNum = remEdge->getInputNum(); @@ -908,9 +672,11 @@ void GraphOptimizer::MergeConvertAndScaleShift(Graph& graph) { const auto parents = parentNode->parentEdges; for (size_t i = 0; i < parents.size(); i++) { auto p_edge = parents[i].lock(); - if (!p_edge) continue; + if (!p_edge) + continue; auto parent = p_edge->getParent(); - if (!parent) continue; + if (!parent) + continue; if (!parentNode->childEdges[0].lock()) continue; @@ -944,8 +710,8 @@ void GraphOptimizer::FuseFCAndConvertOnWeights(Graph& graph) { return; #endif - // This optimization fuses Convert (fp16 -> bf16/fp32) on weights directly to FC input to allow precision conversion handling based on internal logic - // (e.g. fuse conversion with weights reordering) + // This optimization fuses Convert (fp16 -> bf16/fp32) on weights directly to FC input to allow precision conversion + // handling based on internal logic (e.g. fuse conversion with weights reordering) auto& graphNodes = graph.GetNodes(); for (const auto& fullyConnected : graphNodes) { if (fullyConnected->getType() != Type::FullyConnected) { @@ -978,16 +744,13 @@ void GraphOptimizer::FuseFCAndTransposeOnWeights(Graph& graph) { return; #endif - // This optimization allows us to avoid transposing the weights in Transpose node and do it directly along with reordering in FC node + // This optimization allows us to avoid transposing the weights in Transpose node and do it directly along with + // reordering in FC node auto& graphNodes = graph.GetNodes(); auto isSuitablePattern = [](NodePtr parent) { - bool res = true && parent->getType() == Type::Transpose - && parent->getChildEdges().size() == 1 - && parent->getChildEdgeAt(0)->getOutputNum() == 1 - && parent->getChildEdgeAt(0)->getChild()->getType() == Type::FullyConnected - && parent->getOutputShapeAtPort(0).getRank() == 2 - && parent->isConstant(); + bool res = true && parent->getType() == Type::Transpose && parent->getChildEdges().size() == 1 && + parent->getChildEdgeAt(0)->getChild()->getType() == Type::FullyConnected && parent->isConstant(); return res; }; @@ -1002,7 +765,7 @@ void GraphOptimizer::FuseFCAndTransposeOnWeights(Graph& graph) { } } -void GraphOptimizer::FuseConvolutionAndZeroPoints(Graph &graph) { +void GraphOptimizer::FuseConvolutionAndZeroPoints(Graph& graph) { auto& graphNodes = graph.GetNodes(); auto isSuitableConvNode = [](NodePtr node) { @@ -1035,9 +798,10 @@ void GraphOptimizer::FuseConvolutionAndZeroPoints(Graph &graph) { return false; // The plug-in doesn't support FP32 convolution with input/weights zero points. 
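isSuitablePattern in the FuseFCAndTransposeOnWeights hunk reduces to: a constant Transpose whose single consumer is a FullyConnected node can be folded into the FC weight repacking instead of running at inference time. A simplified predicate over an illustrative node view (not the plugin's Node class):

#include <cstddef>

enum class Kind { Transpose, FullyConnected, Other };

struct NodeView {
    Kind kind;          // operation type
    bool is_constant;   // the subgraph producing this node is constant
    size_t child_count; // number of consumer edges
    Kind child_kind;    // type of the single consumer, if any
};

// The transpose of constant weights can be absorbed into the FC weight
// reordering, so no separate Transpose node has to execute at runtime.
bool transpose_foldable_into_fc_weights(const NodeView& n) {
    return n.kind == Kind::Transpose && n.is_constant && n.child_count == 1 &&
           n.child_kind == Kind::FullyConnected;
}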
- // In case weights are in FP32 (or we have zero points on weights which are not supported by INT8 convolution) we cannot use - // INT8 implementation so we have to disable input zero points fusing as well. - if (parent1->getType() != Type::Input || !parent1->isConstant() || parent1->getOriginalOutputPrecisionAtPort(0) != ov::element::i8) { + // In case weights are in FP32 (or we have zero points on weights which are not supported by INT8 convolution) + // we cannot use INT8 implementation so we have to disable input zero points fusing as well. + if (parent1->getType() != Type::Input || !parent1->isConstant() || + parent1->getOriginalOutputPrecisionAtPort(0) != ov::element::i8) { return false; } @@ -1085,7 +849,7 @@ void GraphOptimizer::FuseConvolutionAndZeroPoints(Graph &graph) { if (zeroPointsData == nullptr) OPENVINO_THROW("zeroPointsBlob has not allocated buffer"); - auto zeroPointDataSize = parent0->getInputShapeAtPort(1).getDims()[1]; + auto zeroPointDataSize = parent0->getInputShapeAtPort(1).getDims()[1]; if (Shape::UNDEFINED_DIM == zeroPointDataSize) { return false; } @@ -1121,8 +885,10 @@ void GraphOptimizer::FuseConvolutionAndZeroPoints(Graph &graph) { auto OC = weightsConstantDims[0 + groupOffset]; auto IC = weightsConstantDims[1 + groupOffset]; - auto KD = weightsConstantDims.size() == (5 + groupOffset) ? weightsConstantDims[weightsConstantDims.size() - 3] : 1; - auto KH = weightsConstantDims.size() == (3 + groupOffset) ? 1 : weightsConstantDims[weightsConstantDims.size() - 2]; + auto KD = + weightsConstantDims.size() == (5 + groupOffset) ? weightsConstantDims[weightsConstantDims.size() - 3] : 1; + auto KH = + weightsConstantDims.size() == (3 + groupOffset) ? 1 : weightsConstantDims[weightsConstantDims.size() - 2]; auto KW = weightsConstantDims[weightsConstantDims.size() - 1]; for (size_t g = 0; g < G; g++) { @@ -1132,20 +898,19 @@ void GraphOptimizer::FuseConvolutionAndZeroPoints(Graph &graph) { for (size_t kd = 0; kd < KD; kd++) { for (size_t kh = 0; kh < KH; kh++) { for (size_t kw = 0; kw < KW; kw++) { - size_t widx = g * OC * IC * KD * KH * KW + - oc * IC * KD * KH * KW + - ic * KD * KH * KW + - kd * KH * KW + - kh * KW + - kw; + size_t widx = g * OC * IC * KD * KH * KW + oc * IC * KD * KH * KW + ic * KD * KH * KW + + kd * KH * KW + kh * KW + kw; auto w = static_cast(weightsPtr[widx]); - auto izp = !convNode->legacyInputZeroPoints.empty() ? static_cast(convNode->legacyInputZeroPoints[g * IC + ic]) : 0; + auto izp = !convNode->legacyInputZeroPoints.empty() + ? static_cast(convNode->legacyInputZeroPoints[g * IC + ic]) + : 0; a += w * izp; - auto wzp = !convNode->legacyWeightsZeroPoints.empty() ? - static_cast(convNode->legacyWeightsZeroPoints[g * OC + oc]) : 0; + auto wzp = !convNode->legacyWeightsZeroPoints.empty() + ? 
static_cast(convNode->legacyWeightsZeroPoints[g * OC + oc]) + : 0; a -= wzp * izp; } } @@ -1158,7 +923,8 @@ void GraphOptimizer::FuseConvolutionAndZeroPoints(Graph &graph) { for (size_t i = 0; i < graphNodes.size(); i++) { auto conv = graphNodes[i]; - if (!isSuitableConvNode(conv)) continue; + if (!isSuitableConvNode(conv)) + continue; CPU_GRAPH_OPTIMIZER_SCOPE(FuseConvolutionAndZeroPoints_ConvNode); @@ -1166,8 +932,10 @@ void GraphOptimizer::FuseConvolutionAndZeroPoints(Graph &graph) { auto weightsEltwise = conv->getParentEdgeAt(1)->getParent(); if (initializeInputZeroPoints(conv, dataEltwise, weightsEltwise)) { auto p_edge = dataEltwise->getParentEdgeAt(1); - DEBUG_LOG("[GraphOptimizer##FusingZeorPoint]:Eltwise Subtract Node ##", dataEltwise->getName(), - " is optimized as zeropoint of Conv ##", conv->getName()); + DEBUG_LOG("[GraphOptimizer##FusingZeorPoint]:Eltwise Subtract Node ##", + dataEltwise->getName(), + " is optimized as zeropoint of Conv ##", + conv->getName()); graph.RemoveEdge(p_edge); graph.DropNode(dataEltwise); initializeOutputCompensation(conv); @@ -1175,7 +943,7 @@ void GraphOptimizer::FuseConvolutionAndZeroPoints(Graph &graph) { } } -void GraphOptimizer::FuseFullyConnectedAndSimpleOperation(Graph &graph) { +void GraphOptimizer::FuseFullyConnectedAndSimpleOperation(Graph& graph) { auto& graphNodes = graph.GetNodes(); auto isSuitableParentNode = [](NodePtr node) { @@ -1202,7 +970,7 @@ void GraphOptimizer::FuseFullyConnectedAndSimpleOperation(Graph &graph) { if (childNode->getType() == Type::FakeQuantize || childNode->getType() == Type::Eltwise) { auto parentEdges = childNode->parentEdges; - for (auto &parentEdge : parentEdges) { + for (auto& parentEdge : parentEdges) { auto p_edge = parentEdge.lock(); if (p_edge->getParent()->getType() == Type::FullyConnected) continue; @@ -1215,7 +983,7 @@ void GraphOptimizer::FuseFullyConnectedAndSimpleOperation(Graph &graph) { } } -void GraphOptimizer::FuseMatMulAndSimpleOperation(Graph &graph) { +void GraphOptimizer::FuseMatMulAndSimpleOperation(Graph& graph) { auto& graphNodes = graph.GetNodes(); auto isSutableParentNode = [](const NodePtr& node) { @@ -1242,7 +1010,7 @@ void GraphOptimizer::FuseMatMulAndSimpleOperation(Graph &graph) { if (childNode->getType() == Type::FakeQuantize || childNode->getType() == Type::Eltwise) { auto parentEdges = childNode->parentEdges; - for (auto &parentEdge : parentEdges) { + for (auto& parentEdge : parentEdges) { auto p_edge = parentEdge.lock(); if (p_edge->getParent()->getType() == Type::MatMul) continue; @@ -1255,14 +1023,14 @@ void GraphOptimizer::FuseMatMulAndSimpleOperation(Graph &graph) { } } -void GraphOptimizer::FuseConvolutionAndDWConvolution(Graph &graph) { +void GraphOptimizer::FuseConvolutionAndDWConvolution(Graph& graph) { auto& graphNodes = graph.GetNodes(); - auto isConvolutionNode = [](const NodePtr &node) { + auto isConvolutionNode = [](const NodePtr& node) { return node->getType() == Type::Convolution; }; - auto is1x1Convolution = [](const std::shared_ptr &conv) { + auto is1x1Convolution = [](const std::shared_ptr& conv) { const auto weightRank = conv->getWeightDims().size(); return conv->getWeightDims()[weightRank - 1] == 1 && conv->getWeightDims()[weightRank - 2] == 1; }; @@ -1281,10 +1049,10 @@ void GraphOptimizer::FuseConvolutionAndDWConvolution(Graph &graph) { if (!conv->legacyWeightsZeroPoints.empty()) return false; - const auto &strides = conv->getStride(); - const auto &paddings = conv->getPaddingL(); - const auto &inDims = node->getInputShapeAtPort(0).getDims(); - const 
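The compensation loop spanning the last two hunks of FuseConvolutionAndZeroPoints accumulates the zero-point terms per output channel across the whole weight tensor. Stripped of the node plumbing, the arithmetic looks like the sketch below; the container names and the sign convention applied when the result is finally stored are assumptions, only the accumulation itself mirrors the loop.

#include <cstddef>
#include <cstdint>
#include <vector>

// Per-(group, output channel) accumulation over weights laid out as
// [G, OC, IC, KD, KH, KW]. izp/wzp are the per-channel input/weights zero points.
std::vector<int32_t> zero_point_compensation(const std::vector<int8_t>& weights,
                                             const std::vector<int32_t>& input_zp,   // size G * IC, may be empty
                                             const std::vector<int32_t>& weights_zp, // size G * OC, may be empty
                                             size_t G, size_t OC, size_t IC,
                                             size_t KD, size_t KH, size_t KW) {
    std::vector<int32_t> acc(G * OC, 0);
    const size_t spatial = KD * KH * KW;
    for (size_t g = 0; g < G; ++g) {
        for (size_t oc = 0; oc < OC; ++oc) {
            int32_t a = 0;
            const int32_t wzp = weights_zp.empty() ? 0 : weights_zp[g * OC + oc];
            for (size_t ic = 0; ic < IC; ++ic) {
                const int32_t izp = input_zp.empty() ? 0 : input_zp[g * IC + ic];
                for (size_t k = 0; k < spatial; ++k) {
                    // widx = g*OC*IC*KD*KH*KW + oc*IC*KD*KH*KW + ic*KD*KH*KW + (kd*KH*KW + kh*KW + kw)
                    const size_t widx = ((g * OC + oc) * IC + ic) * spatial + k;
                    a += static_cast<int32_t>(weights[widx]) * izp;  // input zero-point term
                    a -= wzp * izp;                                  // weights x input zero-point cross term
                }
            }
            acc[g * OC + oc] = a;
        }
    }
    return acc;
}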
auto &outDims = node->getOutputShapeAtPort(0).getDims(); + const auto& strides = conv->getStride(); + const auto& paddings = conv->getPaddingL(); + const auto& inDims = node->getInputShapeAtPort(0).getDims(); + const auto& outDims = node->getOutputShapeAtPort(0).getDims(); bool isSupportedParams = conv->getGroupNum() == 1 && inDims.size() == 4 && dimsEqualStrong(inDims[inDims.size() - 1], outDims[outDims.size() - 1]) && @@ -1297,12 +1065,13 @@ void GraphOptimizer::FuseConvolutionAndDWConvolution(Graph &graph) { static_cast(paddings[paddings.size() - 1]), static_cast(paddings[paddings.size() - 2])) && !conv->canBeExecutedInInt8(); - if (!isSupportedParams) return false; + if (!isSupportedParams) + return false; return node->getChildEdges().size() == 1 && isConvolutionNode(node->getChildEdgeAt(0)->getChild()); }; - auto isSuitableChildConvolution = [&](const NodePtr &parentNode, const NodePtr &childNode) { + auto isSuitableChildConvolution = [&](const NodePtr& parentNode, const NodePtr& childNode) { if (parentNode->isDropped() || childNode->isDropped()) return false; @@ -1317,15 +1086,19 @@ void GraphOptimizer::FuseConvolutionAndDWConvolution(Graph &graph) { if (convParent == nullptr) OPENVINO_THROW("Cannot cast to convolution node ", parentNode->getName()); - if (!everyone_is(ov::element::f32, convParent->getOriginalOutputPrecisionAtPort(0), convChild->getOriginalInputPrecisionAtPort(0), - convChild->getOriginalOutputPrecisionAtPort(0))) + if (!everyone_is(ov::element::f32, + convParent->getOriginalOutputPrecisionAtPort(0), + convChild->getOriginalInputPrecisionAtPort(0), + convChild->getOriginalOutputPrecisionAtPort(0))) return false; - auto parentOutputPrecision = !parentNode->fusedWith.empty() + auto parentOutputPrecision = + !parentNode->fusedWith.empty() ? parentNode->fusedWith[parentNode->fusedWith.size() - 1]->getOriginalOutputPrecisionAtPort(0) : parentNode->getOriginalOutputPrecisionAtPort(0); - auto childOutputPrecision = !childNode->fusedWith.empty() + auto childOutputPrecision = + !childNode->fusedWith.empty() ? 
childNode->fusedWith[childNode->fusedWith.size() - 1]->getOriginalOutputPrecisionAtPort(0) : childNode->getOriginalOutputPrecisionAtPort(0); @@ -1361,7 +1134,7 @@ void GraphOptimizer::FuseConvolutionAndDWConvolution(Graph &graph) { return isSupportedParams; }; - auto isFusingWorthwhile = [&](const NodePtr &parentNode, const NodePtr &childNode) { + auto isFusingWorthwhile = [&](const NodePtr& parentNode, const NodePtr& childNode) { if (!childNode->inputShapes[0].isStatic() || !childNode->outputShapes[0].isStatic()) { return false; } @@ -1372,7 +1145,7 @@ void GraphOptimizer::FuseConvolutionAndDWConvolution(Graph &graph) { int L3_cache_size = dnnl::utils::get_cache_size(3, false); int dw_conv_input_size = inDims[0] * inDims[1] * inDims[2] * inDims[3] * elemSize; - int dw_conv_output_size = outDims[0] * outDims[1]* outDims[2] * outDims[3] * elemSize; + int dw_conv_output_size = outDims[0] * outDims[1] * outDims[2] * outDims[3] * elemSize; auto parentConvolutionNode = std::dynamic_pointer_cast(parentNode); if (parentConvolutionNode == nullptr) @@ -1385,19 +1158,23 @@ void GraphOptimizer::FuseConvolutionAndDWConvolution(Graph &graph) { }; for (size_t i = 0; i < graphNodes.size(); i++) { - if (!isConvolutionNode(graphNodes[i])) continue; + if (!isConvolutionNode(graphNodes[i])) + continue; auto parentConvNode = graphNodes[i]; - if (!isSuitableParentConvolution(parentConvNode)) continue; + if (!isSuitableParentConvolution(parentConvNode)) + continue; CPU_GRAPH_OPTIMIZER_SCOPE(FuseConvolutionAndDWConvolution_ParentConv); auto childConvNode = parentConvNode->getChildEdgeAt(0)->getChild(); - if (!isSuitableChildConvolution(parentConvNode, childConvNode)) continue; + if (!isSuitableChildConvolution(parentConvNode, childConvNode)) + continue; CPU_GRAPH_OPTIMIZER_SCOPE(FuseConvolutionAndDWConvolution_ChildConv); - if (!isFusingWorthwhile(parentConvNode, childConvNode)) continue; + if (!isFusingWorthwhile(parentConvNode, childConvNode)) + continue; parentConvNode->addFusedNode(childConvNode); @@ -1411,12 +1188,12 @@ void GraphOptimizer::FuseConvolutionAndDWConvolution(Graph &graph) { } // TODO [NM]: unite with FuseConvolutionAndSimpleOperation -void GraphOptimizer::FuseConvolutionAndSimpleOperationThroughMaxPool(Graph &graph) { +void GraphOptimizer::FuseConvolutionAndSimpleOperationThroughMaxPool(Graph& graph) { auto& graphNodes = graph.GetNodes(); auto isSuitableParentNode = [](NodePtr node) { - return (node->getType() == Type::Convolution || node->getType() == Type::BinaryConvolution) && node->getChildEdges().size() == 1 && - node->getOriginalOutputPrecisionAtPort(0) == ov::element::f32; + return (node->getType() == Type::Convolution || node->getType() == Type::BinaryConvolution) && + node->getChildEdges().size() == 1 && node->getOriginalOutputPrecisionAtPort(0) == ov::element::f32; }; auto parent = graphNodes.begin(); @@ -1455,7 +1232,7 @@ void GraphOptimizer::FuseConvolutionAndSimpleOperationThroughMaxPool(Graph &grap parentNode->addFusedNode(fuseCandidate); parentNode->addOriginalLayer(fuseCandidate->getOriginalLayers()); auto parentEdges = fuseCandidate->parentEdges; - for (auto &parentEdge : parentEdges) { + for (auto& parentEdge : parentEdges) { auto p_edge = parentEdge.lock(); if (p_edge->getParent() == childNode) continue; @@ -1466,11 +1243,12 @@ void GraphOptimizer::FuseConvolutionAndSimpleOperationThroughMaxPool(Graph &grap } } -void GraphOptimizer::FuseConvolutionAndSimpleOperation(Graph &graph) { +void GraphOptimizer::FuseConvolutionAndSimpleOperation(Graph& graph) { auto& graphNodes = 
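isFusingWorthwhile gates the depthwise-convolution fusion on whether the intermediate activations still fit in the L3 cache, so the fusion only pays off when it saves a trip to memory. A simplified version of that size check follows; the threshold handling and the additional ISA check the plugin performs are assumptions here, since they are not visible in this hunk.

#include <cstddef>

// Fuse the depthwise convolution only when its input plus output activations
// exceed the cache budget, i.e. when keeping them fused avoids a memory round trip.
bool worth_fusing_dw_conv(size_t n, size_t c, size_t h, size_t w,
                          size_t oc, size_t oh, size_t ow,
                          size_t elem_size, size_t l3_cache_bytes) {
    const size_t dw_input_bytes  = n * c * h * w * elem_size;
    const size_t dw_output_bytes = n * oc * oh * ow * elem_size;
    return dw_input_bytes + dw_output_bytes > l3_cache_bytes;
}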
graph.GetNodes(); auto isSuitableParentNode = [](NodePtr node) { - return (node->getType() == Type::Convolution || node->getType() == Type::BinaryConvolution) && node->getChildEdges().size() == 1; + return (node->getType() == Type::Convolution || node->getType() == Type::BinaryConvolution) && + node->getChildEdges().size() == 1; }; auto parent = graphNodes.begin(); @@ -1495,7 +1273,7 @@ void GraphOptimizer::FuseConvolutionAndSimpleOperation(Graph &graph) { if (childNode->getType() == Type::FakeQuantize || childNode->getType() == Type::Eltwise) { auto parentEdges = childNode->parentEdges; - for (auto &parentEdge : parentEdges) { + for (auto& parentEdge : parentEdges) { auto p_edge = parentEdge.lock(); if (p_edge->getParent()->getType() == parentNodeType) continue; @@ -1508,7 +1286,7 @@ void GraphOptimizer::FuseConvolutionAndSimpleOperation(Graph &graph) { } } -void GraphOptimizer::FusePoolingAndFakeQuantize(Graph &graph) { +void GraphOptimizer::FusePoolingAndFakeQuantize(Graph& graph) { auto& graphNodes = graph.GetNodes(); auto isSuitableParentNode = [](NodePtr node) { @@ -1526,12 +1304,14 @@ void GraphOptimizer::FusePoolingAndFakeQuantize(Graph &graph) { for (size_t i = 0; i < graphNodes.size(); i++) { auto parent = graphNodes[i]; - if (!isSuitableParentNode(parent)) continue; + if (!isSuitableParentNode(parent)) + continue; CPU_GRAPH_OPTIMIZER_SCOPE(FusePoolingAndFakeQuantize_ParentNode); auto child = parent->getChildEdgeAt(0)->getChild(); - if (!isSuitableChildNode(child)) continue; + if (!isSuitableChildNode(child)) + continue; CPU_GRAPH_OPTIMIZER_SCOPE(FusePoolingAndFakeQuantize_ChildNode); @@ -1558,14 +1338,14 @@ void GraphOptimizer::FusePoolingAndFakeQuantize(Graph &graph) { * @param child node we try to find * @return True if child is one of data supplier */ -static bool is_data_dependency(const std::shared_ptr &parent, - const std::shared_ptr &child) { +static bool is_data_dependency(const std::shared_ptr& parent, const std::shared_ptr& child) { std::set visited; - std::list nextLayers {parent.get()}; + std::list nextLayers{parent.get()}; for (; !nextLayers.empty();) { auto layer = *nextLayers.begin(); - if (layer == child.get()) return true; + if (layer == child.get()) + return true; for (auto& oe : layer->getChildEdges()) { auto nn = oe.lock()->getChild(); if (visited.find(nn.get()) == visited.end()) { @@ -1616,19 +1396,18 @@ static bool is_data_dependency(const std::shared_ptr &parent, * *** */ -void GraphOptimizer::FuseConvolutionSumAndConvolutionSumActivation(Graph &graph) { +void GraphOptimizer::FuseConvolutionSumAndConvolutionSumActivation(Graph& graph) { #if !defined(OPENVINO_ARCH_X86) && !defined(OPENVINO_ARCH_X86_64) return; #endif - auto &graphNodes = graph.GetNodes(); + auto& graphNodes = graph.GetNodes(); auto isFusingSupported = [&](NodePtr conv, NodePtr child) { - return child->getType() == Type::Eltwise && - DnnlExtensionUtils::isUnarySupportedAsPostOp(child->getAlgorithm()); + return child->getType() == Type::Eltwise && DnnlExtensionUtils::isUnarySupportedAsPostOp(child->getAlgorithm()); }; - for (auto &graphNode : graphNodes) { + for (auto& graphNode : graphNodes) { const auto eltwiseNode = std::dynamic_pointer_cast(graphNode); if (graphNode->getType() != Type::Eltwise || graphNode->getAlgorithm() != Algorithm::EltwiseAdd || !eltwiseNode || eltwiseNode->isWithBroadcast()) @@ -1642,12 +1421,12 @@ void GraphOptimizer::FuseConvolutionSumAndConvolutionSumActivation(Graph &graph) auto parent1 = graphNode->getParentEdgeAt(0)->getParent(); auto parent2 = 
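is_data_dependency above is a plain breadth-first walk over child edges that answers whether one node transitively feeds another, which the Sum fusion uses to rule out in-place conflicts. The same traversal over a generic adjacency view:

#include <list>
#include <set>
#include <vector>

struct SimpleNode {
    std::vector<SimpleNode*> children;  // downstream consumers
};

// True when `child` is reachable from `parent` through child edges,
// i.e. `parent` directly or indirectly supplies data to `child`.
bool is_data_dependency(const SimpleNode* parent, const SimpleNode* child) {
    std::set<const SimpleNode*> visited;
    std::list<const SimpleNode*> next{parent};
    while (!next.empty()) {
        const SimpleNode* node = next.front();
        next.pop_front();
        if (node == child)
            return true;
        for (const SimpleNode* c : node->children) {
            if (visited.insert(c).second)  // enqueue each node only once
                next.push_back(c);
        }
    }
    return false;
}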
graphNode->getParentEdgeAt(1)->getParent(); - bool isSuitableParent1 = parent1->getType() == Type::Convolution - || parent1->getType() == Type::BinaryConvolution; - bool isSuitableParent2 = parent2->getType() == Type::Convolution - || parent2->getType() == Type::BinaryConvolution; + bool isSuitableParent1 = + parent1->getType() == Type::Convolution || parent1->getType() == Type::BinaryConvolution; + bool isSuitableParent2 = + parent2->getType() == Type::Convolution || parent2->getType() == Type::BinaryConvolution; - auto canFuseSum = [](node::BinaryConvolution *binConv, NodePtr fuseCandidate) { + auto canFuseSum = [](node::BinaryConvolution* binConv, NodePtr fuseCandidate) { if (binConv->getImplType() == impl_desc_type::ref) return false; @@ -1666,12 +1445,12 @@ void GraphOptimizer::FuseConvolutionSumAndConvolutionSumActivation(Graph &graph) return false; }; - auto* binConvNode1 = dynamic_cast(parent1.get()); + auto* binConvNode1 = dynamic_cast(parent1.get()); if (binConvNode1) { isSuitableParent1 = isSuitableParent1 && canFuseSum(binConvNode1, graphNode); } - auto* binConvNode2 = dynamic_cast(parent2.get()); + auto* binConvNode2 = dynamic_cast(parent2.get()); if (binConvNode2) { isSuitableParent2 = isSuitableParent2 && canFuseSum(binConvNode2, graphNode); } @@ -1685,7 +1464,7 @@ void GraphOptimizer::FuseConvolutionSumAndConvolutionSumActivation(Graph &graph) return false; }; - auto* convNode1 = dynamic_cast(parent1.get()); + auto* convNode1 = dynamic_cast(parent1.get()); if (convNode1) { if (!convNode1->canBeExecutedInInt8()) { isSuitableParent1 = isSuitableParent1 && convNode1->getFusedWith().empty(); @@ -1694,7 +1473,7 @@ void GraphOptimizer::FuseConvolutionSumAndConvolutionSumActivation(Graph &graph) } } - auto* convNode2 = dynamic_cast(parent2.get()); + auto* convNode2 = dynamic_cast(parent2.get()); if (convNode2) { if (!convNode2->canBeExecutedInInt8()) { isSuitableParent2 = isSuitableParent2 && convNode2->getFusedWith().empty(); @@ -1713,9 +1492,9 @@ void GraphOptimizer::FuseConvolutionSumAndConvolutionSumActivation(Graph &graph) // not merged operation (peerNode) has to be in low precision const auto isBranchQuantized = [](const NodePtr& branchParent) { const auto& fused = branchParent->getFusedWith(); - const auto branchPrecision = fused.empty() ? - branchParent->getOriginalOutputPrecisionAtPort(0) : - fused[fused.size() - 1]->getOriginalOutputPrecisionAtPort(0); + const auto branchPrecision = fused.empty() + ? branchParent->getOriginalOutputPrecisionAtPort(0) + : fused[fused.size() - 1]->getOriginalOutputPrecisionAtPort(0); return (branchPrecision == ov::element::i8) || (branchPrecision == ov::element::u8); }; @@ -1785,15 +1564,16 @@ void GraphOptimizer::FuseConvolutionSumAndConvolutionSumActivation(Graph &graph) // be overwritten. Should verify that all other consumer already read it and // we can spoil input data. 
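isBranchQuantized looks at the precision of whatever is produced last on a branch, the node itself or its final fused post-op, and accepts only i8/u8. A stand-alone version of that rule using the public element types; the function signature is illustrative.

#include <vector>
#include <openvino/core/type/element_type.hpp>

// A branch counts as quantized when the last thing executed on it
// (the node, or its last fused post-op) produces i8 or u8 data.
bool is_branch_quantized(ov::element::Type node_precision,
                         const std::vector<ov::element::Type>& fused_precisions) {
    const ov::element::Type effective =
        fused_precisions.empty() ? node_precision : fused_precisions.back();
    return effective == ov::element::i8 || effective == ov::element::u8;
}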
// TODO: rewrite once we add "Inplace" reporting mechanism - for (auto & edge : peerNode->getChildEdges()) { + for (auto& edge : peerNode->getChildEdges()) { if (!fuse_allowed) break; fuse_allowed &= is_data_dependency(edge.lock()->getChild(), sum); } - if (!fuse_allowed) continue; + if (!fuse_allowed) + continue; if (graphNode->getChildEdges().size() == 1 && - isFusingSupported(graphNode, graphNode->getChildEdgeAt(0)->getChild())) { + isFusingSupported(graphNode, graphNode->getChildEdgeAt(0)->getChild())) { auto relu_shared = graphNode->getChildEdgeAt(0)->getChild(); lastNode = relu_shared; if (mergedConv->isConstant() && !lastNode->isConstant()) @@ -1803,8 +1583,8 @@ void GraphOptimizer::FuseConvolutionSumAndConvolutionSumActivation(Graph &graph) lastNode->fuseInto(mergedConv); - if (mergedConv->fusedWith.size() > 0 && - (mergedConv->fusedWith[0]->getType() == Type::Convolution || mergedConv->fusedWith[0]->getType() == Type::BinaryConvolution)) { + if (mergedConv->fusedWith.size() > 0 && (mergedConv->fusedWith[0]->getType() == Type::Convolution || + mergedConv->fusedWith[0]->getType() == Type::BinaryConvolution)) { // Merged with DW_conv. Shape may change mergedConv->inputShapes.push_back(mergedConv->fusedWith[0]->getOutputShapeAtPort(0)); } else { @@ -1835,7 +1615,7 @@ void GraphOptimizer::FuseConvolutionSumAndConvolutionSumActivation(Graph &graph) graph.CreateEdge(peerNode, mergedConv, peer_port, childPort); std::vector edges_to_reconnect = lastNode->getChildEdges(); - for (auto &edge_w : edges_to_reconnect) { + for (auto& edge_w : edges_to_reconnect) { auto edge = edge_w.lock(); auto child = edge->getChild(); int idxParent = edge->getInputNum(); @@ -1855,7 +1635,7 @@ void GraphOptimizer::FuseConvolutionSumAndConvolutionSumActivation(Graph &graph) } } -void GraphOptimizer::FuseMVNAndSimpleOperation(Graph &graph) { +void GraphOptimizer::FuseMVNAndSimpleOperation(Graph& graph) { auto& graphNodes = graph.GetNodes(); auto isSuitableParentNode = [](NodePtr node) { @@ -1882,7 +1662,7 @@ void GraphOptimizer::FuseMVNAndSimpleOperation(Graph &graph) { if (childNode->getType() == Type::FakeQuantize || childNode->getType() == Type::Eltwise) { auto parentEdges = childNode->parentEdges; - for (auto &parentEdge : parentEdges) { + for (auto& parentEdge : parentEdges) { auto p_edge = parentEdge.lock(); if (p_edge->getParent()->getType() == Type::MVN) continue; @@ -1895,7 +1675,7 @@ void GraphOptimizer::FuseMVNAndSimpleOperation(Graph &graph) { } } -void GraphOptimizer::FuseInterpolateAndSimpleOperation(Graph &graph) { +void GraphOptimizer::FuseInterpolateAndSimpleOperation(Graph& graph) { auto& graphNodes = graph.GetNodes(); auto isSuitableParentNode = [](NodePtr node) { @@ -1904,8 +1684,8 @@ void GraphOptimizer::FuseInterpolateAndSimpleOperation(Graph &graph) { auto isSuitableChildNode = [&](NodePtr parentNode, NodePtr childNode) { // Avoid cycle dependencies - for (auto &childParentEdge : childNode->getParentEdges()) { - for (auto &parentParentEdge : parentNode->getParentEdges()) { + for (auto& childParentEdge : childNode->getParentEdges()) { + for (auto& parentParentEdge : parentNode->getParentEdges()) { if (childParentEdge.lock()->getParent() == parentParentEdge.lock()->getParent()) return false; } @@ -1941,7 +1721,7 @@ void GraphOptimizer::FuseInterpolateAndSimpleOperation(Graph &graph) { if (childNode->getType() == Type::FakeQuantize || childNode->getType() == Type::Eltwise) { auto parentEdges = childNode->parentEdges; - for (auto &parentEdge : parentEdges) { + for (auto& parentEdge : 
parentEdges) { auto p_edge = parentEdge.lock(); if (p_edge->getParent()->getType() == Type::Interpolate) continue; @@ -1954,7 +1734,7 @@ void GraphOptimizer::FuseInterpolateAndSimpleOperation(Graph &graph) { } } -void GraphOptimizer::FuseNormalizeL2AndSimpleOperation(Graph &graph) { +void GraphOptimizer::FuseNormalizeL2AndSimpleOperation(Graph& graph) { auto& graphNodes = graph.GetNodes(); auto isSuitableParentNode = [](NodePtr node) { @@ -1981,7 +1761,7 @@ void GraphOptimizer::FuseNormalizeL2AndSimpleOperation(Graph &graph) { if (childNode->getType() == Type::FakeQuantize || childNode->getType() == Type::Eltwise) { auto parentEdges = childNode->parentEdges; - for (auto &parentEdge : parentEdges) { + for (auto& parentEdge : parentEdges) { auto p_edge = parentEdge.lock(); if (p_edge->getParent()->getType() == Type::NormalizeL2) continue; @@ -1994,7 +1774,7 @@ void GraphOptimizer::FuseNormalizeL2AndSimpleOperation(Graph &graph) { } } -void GraphOptimizer::FuseReduceAndSimpleOperation(Graph &graph) { +void GraphOptimizer::FuseReduceAndSimpleOperation(Graph& graph) { auto& graphNodes = graph.GetNodes(); auto isSuitableParentNode = [](NodePtr node) { @@ -2021,7 +1801,7 @@ void GraphOptimizer::FuseReduceAndSimpleOperation(Graph &graph) { if (childNode->getType() == Type::FakeQuantize || childNode->getType() == Type::Eltwise) { auto parentEdges = childNode->parentEdges; - for (auto &parentEdge : parentEdges) { + for (auto& parentEdge : parentEdges) { auto p_edge = parentEdge.lock(); if (p_edge == nullptr) OPENVINO_THROW("Cannot get parent edge ", childNode->getName()); @@ -2036,7 +1816,7 @@ void GraphOptimizer::FuseReduceAndSimpleOperation(Graph &graph) { } } -void GraphOptimizer::FuseEltwiseAndSimple(Graph &graph) { +void GraphOptimizer::FuseEltwiseAndSimple(Graph& graph) { auto& graphNodes = graph.GetNodes(); auto isSuitableParentNode = [](NodePtr node) { @@ -2046,14 +1826,14 @@ void GraphOptimizer::FuseEltwiseAndSimple(Graph &graph) { auto isSuitableChildNode = [&](NodePtr parentNode, NodePtr childNode) { if (parentNode->isConstant() && !childNode->isConstant()) return false; - for (auto &childParentEdge : childNode->getParentEdges()) { + for (auto& childParentEdge : childNode->getParentEdges()) { // WA to prevent unsupported reorder exception issue in some cases if (childParentEdge.lock()->getParent()->getType() == Type::Split) { return false; } // Avoid cycle dependencies - for (auto &parentParentEdge : parentNode->getParentEdges()) { + for (auto& parentParentEdge : parentNode->getParentEdges()) { if (childParentEdge.lock()->getParent() == parentParentEdge.lock()->getParent()) return false; } @@ -2077,7 +1857,8 @@ void GraphOptimizer::FuseEltwiseAndSimple(Graph &graph) { auto childNode = parentNode->getChildEdgeAt(0)->getChild(); - if ((parentNode->isDynamicNode() && !childNode->isDynamicNode()) || (!parentNode->isDynamicNode() && childNode->isDynamicNode())) { + if ((parentNode->isDynamicNode() && !childNode->isDynamicNode()) || + (!parentNode->isDynamicNode() && childNode->isDynamicNode())) { parent++; continue; } @@ -2093,7 +1874,7 @@ void GraphOptimizer::FuseEltwiseAndSimple(Graph &graph) { if (childNode->getType() == Type::FakeQuantize) { auto parentEdges = childNode->parentEdges; - for (auto &parentEdge : parentEdges) { + for (auto& parentEdge : parentEdges) { auto p_edge = parentEdge.lock(); if (p_edge->getParent()->getType() == Type::Eltwise) continue; @@ -2109,9 +1890,11 @@ void GraphOptimizer::FuseEltwiseAndSimple(Graph &graph) { for (size_t i = 0; i < parents.size(); i++) { auto 
p_edge = parents[i].lock(); - if (!p_edge) continue; + if (!p_edge) + continue; auto parent = p_edge->getParent(); - if (!parent) continue; + if (!parent) + continue; if (parent == parentNode) { for (size_t j = 0; j < children.size(); j++) { @@ -2121,7 +1904,7 @@ void GraphOptimizer::FuseEltwiseAndSimple(Graph &graph) { if (!child) continue; - EdgePtr &remEdge = p_edge; + EdgePtr& remEdge = p_edge; int inNum = 0; if (remEdge) { inNum = remEdge->getInputNum(); @@ -2137,7 +1920,7 @@ void GraphOptimizer::FuseEltwiseAndSimple(Graph &graph) { graph.CreateEdge(parent, child, inNum, outNum); } } else { - EdgePtr &remEdge = p_edge; + EdgePtr& remEdge = p_edge; int inNum = 0; int outNum = parentNode->getParentEdges().size(); if (remEdge) { @@ -2228,15 +2011,14 @@ void GraphOptimizer::ShareReorders(Graph& graph) { } } -void GraphOptimizer::DropDoubleReorders(Graph &graph) { +void GraphOptimizer::DropDoubleReorders(Graph& graph) { std::set processed; auto& nodes = graph.GetNodes(); for (size_t i = 0; i < nodes.size(); i++) { auto node = nodes[i]; - if (processed.find(node) == processed.end() && node->getType() == Type::Reorder - && node->getChildEdges().size() == 1 - && node->getChildEdgeAt(0)->getChild()->getType() == Type::Reorder ) { + if (processed.find(node) == processed.end() && node->getType() == Type::Reorder && + node->getChildEdges().size() == 1 && node->getChildEdgeAt(0)->getChild()->getType() == Type::Reorder) { auto nextNode = node->getChildEdgeAt(0)->getChild(); Reorder* n = dynamic_cast(node.get()); if (n == nullptr) @@ -2261,7 +2043,8 @@ void GraphOptimizer::DropDoubleReorders(Graph &graph) { if (cur->getChild() == c) edge = cur; } - if (!edge) OPENVINO_THROW("Inappropriate graph processing"); + if (!edge) + OPENVINO_THROW("Inappropriate graph processing"); std::string layerName = edge->getParent()->getName() + "_ScaleReorder_" + edge->getChild()->getName(); graph.InsertReorder(edge, layerName, n->getInput(), nn->getOutput(), false); @@ -2270,11 +2053,12 @@ void GraphOptimizer::DropDoubleReorders(Graph &graph) { } } -void GraphOptimizer::FuseClampAndFakeQuantize(Graph &graph) { +void GraphOptimizer::FuseClampAndFakeQuantize(Graph& graph) { auto& graphNodes = graph.GetNodes(); auto isSuitableClampNode = [](NodePtr node) { - return node->getType() == Type::Eltwise && node->getChildEdges().size() == 1 && node->getAlgorithm() == Algorithm::EltwiseClamp; + return node->getType() == Type::Eltwise && node->getChildEdges().size() == 1 && + node->getAlgorithm() == Algorithm::EltwiseClamp; }; auto isSuitableFakeQuantizeNode = [](NodePtr node) { @@ -2282,7 +2066,7 @@ void GraphOptimizer::FuseClampAndFakeQuantize(Graph &graph) { }; auto fuseClampAndFakeQuantizeNodes = [](NodePtr parent, NodePtr child) { - auto* eltwiseNode = dynamic_cast(parent.get()); + auto* eltwiseNode = dynamic_cast(parent.get()); if (eltwiseNode == nullptr) OPENVINO_THROW("Cannot cast ", parent->getName(), " to Eltwise node"); @@ -2308,12 +2092,14 @@ void GraphOptimizer::FuseClampAndFakeQuantize(Graph &graph) { for (size_t i = 0; i < graphNodes.size(); i++) { auto parent = graphNodes[i]; - if (!isSuitableClampNode(parent)) continue; + if (!isSuitableClampNode(parent)) + continue; CPU_GRAPH_OPTIMIZER_SCOPE(FuseClampAndFakeQuantize_ClalmpNode); auto child = parent->getChildEdgeAt(0)->getChild(); - if (!isSuitableFakeQuantizeNode(child)) continue; + if (!isSuitableFakeQuantizeNode(child)) + continue; CPU_GRAPH_OPTIMIZER_SCOPE(FuseClampAndFakeQuantize_QuantizeNode); @@ -2323,7 +2109,7 @@ void 
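DropDoubleReorders collapses a Reorder(A->B) followed by a Reorder(B->C) into one Reorder(A->C) inserted between the surrounding producer and consumer. At the descriptor level this is just composition; a tiny illustration with stand-in structs, not the plugin's MemoryDesc:

#include <string>

struct ReorderDesc {
    std::string src;  // input memory format, e.g. "abcd"
    std::string dst;  // output memory format, e.g. "acdb"
};

// Two back-to-back reorders compose into one: the first node's input format and
// the second node's output format fully describe the replacement Reorder.
// Precondition (ensured by the pattern match): first.dst == second.src.
ReorderDesc collapse(const ReorderDesc& first, const ReorderDesc& second) {
    return ReorderDesc{first.src, second.dst};
}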
GraphOptimizer::FuseClampAndFakeQuantize(Graph &graph) { } } -void GraphOptimizer::FusePerformedAsScaleShiftAndFakeQuantize(Graph &graph) { +void GraphOptimizer::FusePerformedAsScaleShiftAndFakeQuantize(Graph& graph) { auto& graphNodes = graph.GetNodes(); auto getNonConstPort = [](const NodePtr& node) { @@ -2341,11 +2127,12 @@ void GraphOptimizer::FusePerformedAsScaleShiftAndFakeQuantize(Graph &graph) { }; auto isSuitableScaleShiftNode = [getNonConstPort](const NodePtr& node) { - if (!one_of(node->getAlgorithm(), Algorithm::EltwiseAdd, - Algorithm::EltwiseSubtract, - Algorithm::EltwiseMultiply, - Algorithm::EltwiseDivide, - Algorithm::EltwiseMulAdd)) + if (!one_of(node->getAlgorithm(), + Algorithm::EltwiseAdd, + Algorithm::EltwiseSubtract, + Algorithm::EltwiseMultiply, + Algorithm::EltwiseDivide, + Algorithm::EltwiseMulAdd)) return false; const auto nonConstPort = getNonConstPort(node); @@ -2375,7 +2162,7 @@ void GraphOptimizer::FusePerformedAsScaleShiftAndFakeQuantize(Graph &graph) { const NodePtr eltwiseInput = parentEltwise->getParentEdgeAt(getNonConstPort(parent))->getParent(); std::tie(scalesBuffer, shiftsBuffer) = parentEltwise->getScalesAndShifts(eltwiseInput.get()); - const auto &outputShape = child->getOutputShapeAtPort(0); + const auto& outputShape = child->getOutputShapeAtPort(0); VectorDims outputDims = outputShape.getDims(); // We need to compute explicitly port with unfolded parent, @@ -2436,7 +2223,7 @@ void GraphOptimizer::FusePerformedAsScaleShiftAndFakeQuantize(Graph &graph) { std::vector zeroShift(newInputScale.size(), 0.f); const auto isSubnormal = [](const float value) { - const uint32_t *u32data = reinterpret_cast(&value); + const uint32_t* u32data = reinterpret_cast(&value); return (*u32data) && (((*u32data) & (0xFF << 23)) == 0); }; @@ -2478,18 +2265,20 @@ void GraphOptimizer::FusePerformedAsScaleShiftAndFakeQuantize(Graph &graph) { for (size_t i = 0; i < graphNodes.size(); i++) { auto parent = graphNodes[i]; - if (!isSuitableScaleShiftNode(parent)) continue; + if (!isSuitableScaleShiftNode(parent)) + continue; CPU_GRAPH_OPTIMIZER_SCOPE(FusePerformedAsScaleShiftAndFakeQuantize_ShiftNode); auto child = parent->getChildEdgeAt(0)->getChild(); - if (!isSuitableFakeQuantizeNode(child)) continue; + if (!isSuitableFakeQuantizeNode(child)) + continue; CPU_GRAPH_OPTIMIZER_SCOPE(FusePerformedAsScaleShiftAndFakeQuantize_QuantizeNode); if (fuseScaleShiftAndFakeQuantizeNodes(parent, child)) { auto parentEdges = parent->parentEdges; - for (auto &parentEdge : parentEdges) { + for (auto& parentEdge : parentEdges) { auto p_edge = parentEdge.lock(); if (!p_edge->getParent()->isConstant()) continue; @@ -2613,7 +2402,12 @@ void GraphOptimizer::mergeTransposeReshapeReorder(Graph& graph, transposeNode->getName(), " is not a transpose node"); - const auto& inOrder = transposeNode->getSelectedPrimitiveDescriptor()->getConfig().inConfs[0].getMemDesc()->as()->getOrder(); + const auto& inOrder = transposeNode->getSelectedPrimitiveDescriptor() + ->getConfig() + .inConfs[0] + .getMemDesc() + ->as() + ->getOrder(); const auto& outOrder = reorderOutDesc->as()->getOrder(); // Permutation must be set and reorder mustn't be optimized in 2 cases: // 1. 
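The isSubnormal helper in the scale-shift/FakeQuantize fusion flags denormal values in the recalculated quantization parameters (how they are then handled is outside this hunk). The same bit test in isolation, written with memcpy instead of the pointer cast as the type-pun-safe equivalent:

#include <cstdint>
#include <cstring>

// Exponent bits all zero while the raw bit pattern is non-zero: the value is a
// denormal (the raw test also matches -0.0, whose sign bit makes the word non-zero,
// which is the same behaviour as the check in the hunk above).
bool is_subnormal(float value) {
    uint32_t bits = 0;
    std::memcpy(&bits, &value, sizeof(bits));
    const bool exponent_is_zero = (bits & (0xFFu << 23)) == 0;
    return bits != 0 && exponent_is_zero;
}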
Transpose has blocked input & non-blocked output @@ -2629,11 +2423,13 @@ void GraphOptimizer::mergeTransposeReshapeReorder(Graph& graph, } } - std::string reorderName = nodeBeforeSequence->getName() + "_" + Reorder::getReorderArgs(*reorderInDesc, *reorderOutDesc); + std::string reorderName = + nodeBeforeSequence->getName() + "_" + Reorder::getReorderArgs(*reorderInDesc, *reorderOutDesc); if (isOptimized) - reorderName += "_fake"; + reorderName += "_fake"; DEBUG_LOG("mergeTransposeAndReorder ", parentNode->getName(), " and ", childNode->getName(), " -> ", reorderName); - auto reorder_layout = std::make_shared(*reorderInDesc, *reorderOutDesc, reorderName, graph.getGraphContext()); + auto reorder_layout = + std::make_shared(*reorderInDesc, *reorderOutDesc, reorderName, graph.getGraphContext()); reorder_layout->setOptimized(isOptimized); reorder_layout->setSrcPermutation(srcPerm); @@ -2646,10 +2442,8 @@ void GraphOptimizer::mergeTransposeReshapeReorder(Graph& graph, Reorder::getReorderArgs(*reorderOutDesc, *finalDesc) + "_" + nodeAfterSequence->getName(); - reorder_last = std::make_shared(*reorderOutDesc, - *finalDesc, - reorderLayerName2, - graph.getGraphContext()); + reorder_last = + std::make_shared(*reorderOutDesc, *finalDesc, reorderLayerName2, graph.getGraphContext()); reorder_last->setOptimized(false); reorder_last->setSrcPermutation(srcPerm); graph.CreateEdge(reorder_layout, reorder_last, 0, 0); @@ -2703,10 +2497,10 @@ void GraphOptimizer::MergeTransposeAndReorder(Graph& graph) { return false; }; - return node->getType() == Type::Transpose - && node->getChildEdges().size() == 1 - && !node->isDynamicNode() // TODO [DS]: enable for dynamic shapes when inPlace in the dynamic case is available (CVS-74863) - && !prevNodeIsConvSum(node); + return node->getType() == Type::Transpose && node->getChildEdges().size() == 1 && + !node->isDynamicNode() // TODO [DS]: enable for dynamic shapes when inPlace in the dynamic case is + // available (CVS-74863) + && !prevNodeIsConvSum(node); }; auto isSuitableReshape = [](NodePtr node) { @@ -2731,8 +2525,9 @@ void GraphOptimizer::MergeTransposeAndReorder(Graph& graph) { }; auto isSuitableReorder = [](NodePtr node) { - return node->getType() == Type::Reorder - && !node->isDynamicNode(); // TODO [DS]: enable for dynamic shapes when inPlace in the dynamic case is available (CVS-74863) + return node->getType() == Type::Reorder && + !node->isDynamicNode(); // TODO [DS]: enable for dynamic shapes when inPlace in the dynamic case is + // available (CVS-74863) }; auto updateOrder = [](const VectorDims& originalOrder, NodePtr reshape) { @@ -2800,17 +2595,28 @@ void GraphOptimizer::MergeTransposeAndReorder(Graph& graph) { const auto transposeNode = std::dynamic_pointer_cast(parentNode); const auto reorderNode = std::dynamic_pointer_cast(childNode); - std::shared_ptr reshapeNode = intermNode != nullptr ? std::dynamic_pointer_cast(intermNode) : nullptr; + std::shared_ptr reshapeNode = + intermNode != nullptr ? 
std::dynamic_pointer_cast(intermNode) : nullptr; if (!transposeNode || !reorderNode || (intermNode && !reshapeNode)) { continue; } auto transposeOrder = updateOrder(transposeNode->getOrder(), reshapeNode); - auto descBeforeReorder = reorderNode->getParentEdgeAt(0)->getParent()->getSelectedPrimitiveDescriptor()->getConfig().outConfs[0].getMemDesc(); + auto descBeforeReorder = reorderNode->getParentEdgeAt(0) + ->getParent() + ->getSelectedPrimitiveDescriptor() + ->getConfig() + .outConfs[0] + .getMemDesc(); auto layoutOrder = descBeforeReorder->as()->getOrder(); - auto inBlockedDesc = reorderNode->getSelectedPrimitiveDescriptor()->getConfig().inConfs[0].getMemDesc()->as(); - auto outBlockedDesc = reorderNode->getSelectedPrimitiveDescriptor()->getConfig().outConfs[0].getMemDesc()->as(); + auto inBlockedDesc = + reorderNode->getSelectedPrimitiveDescriptor()->getConfig().inConfs[0].getMemDesc()->as(); + auto outBlockedDesc = reorderNode->getSelectedPrimitiveDescriptor() + ->getConfig() + .outConfs[0] + .getMemDesc() + ->as(); auto& inOrder = inBlockedDesc->getOrder(); auto& outOrder = outBlockedDesc->getOrder(); @@ -2821,13 +2627,11 @@ void GraphOptimizer::MergeTransposeAndReorder(Graph& graph) { } } -void GraphOptimizer::MergeReorderAndTranspose(Graph &graph) { +void GraphOptimizer::MergeReorderAndTranspose(Graph& graph) { auto& graphNodes = graph.GetNodes(); auto isSuitableTranspose = [](NodePtr node) { - return node->getType() == Type::Transpose - && node->getChildEdges().size() == 1 - && !node->isDynamicNode(); + return node->getType() == Type::Transpose && node->getChildEdges().size() == 1 && !node->isDynamicNode(); }; auto isSuitableReshape = [](NodePtr node) { @@ -2917,7 +2721,8 @@ void GraphOptimizer::MergeReorderAndTranspose(Graph &graph) { auto transposeNode = std::dynamic_pointer_cast(childNode); auto reorderNode = std::dynamic_pointer_cast(parentNode); - std::shared_ptr reshapeNode = intermNode != nullptr ? std::dynamic_pointer_cast(intermNode) : nullptr; + std::shared_ptr reshapeNode = + intermNode != nullptr ? std::dynamic_pointer_cast(intermNode) : nullptr; if (!transposeNode || !reorderNode || (intermNode && !reshapeNode)) { continue; } @@ -2926,15 +2731,20 @@ void GraphOptimizer::MergeReorderAndTranspose(Graph &graph) { auto descAfterTranspose = transposeNode->getSelectedPrimitiveDescriptor()->getConfig().outConfs[0].getMemDesc(); auto layoutOrder = updateOrder(descAfterTranspose->as()->getOrder(), reshapeNode); - auto inBlockedDesc = reorderNode->getSelectedPrimitiveDescriptor()->getConfig().inConfs[0].getMemDesc()->as(); - auto outBlockedDesc = reorderNode->getSelectedPrimitiveDescriptor()->getConfig().outConfs[0].getMemDesc()->as(); + auto inBlockedDesc = + reorderNode->getSelectedPrimitiveDescriptor()->getConfig().inConfs[0].getMemDesc()->as(); + auto outBlockedDesc = reorderNode->getSelectedPrimitiveDescriptor() + ->getConfig() + .outConfs[0] + .getMemDesc() + ->as(); auto& inOrder = inBlockedDesc->getOrder(); auto& outOrder = outBlockedDesc->getOrder(); if (checkAscendingFinalOrder(transposeOrder, layoutOrder, inOrder, outOrder)) { - // Reorder node doesn't support (with rare exceptions) reordering in case of different ranks on input and output. - // So the merge can be performed only in the case when the fused reorder will be optimized. + // Reorder node doesn't support (with rare exceptions) reordering in case of different ranks on input and + // output. So the merge can be performed only in the case when the fused reorder will be optimized. 
if (parentNode->getInputShapeAtPort(0).getRank() != childNode->getOutputShapeAtPort(0).getRank() && !canBeInplaced(parentNode, childNode)) { continue; @@ -2944,14 +2754,15 @@ void GraphOptimizer::MergeReorderAndTranspose(Graph &graph) { } } -void GraphOptimizer::reshapeRnnSeq(Graph &graph) { +void GraphOptimizer::reshapeRnnSeq(Graph& graph) { auto& graphNodes = graph.GetNodes(); auto isSuitableParentNode = [](NodePtr node) { if (node->type != Type::RNNSeq) return false; auto rnnNode = std::dynamic_pointer_cast(node); - return rnnNode && !rnnNode->hasNativeOrder() && node->outputShapes[0].getRank() == 4 && node->outputShapes[0].getDims()[1] == 1; + return rnnNode && !rnnNode->hasNativeOrder() && node->outputShapes[0].getRank() == 4 && + node->outputShapes[0].getDims()[1] == 1; }; for (size_t i = 0; i < graphNodes.size(); i++) { @@ -2973,10 +2784,12 @@ void GraphOptimizer::reshapeRnnSeq(Graph &graph) { auto edge = childrenEdges[j]; auto childNode = edge->getChild(); - const auto secondInput = std::make_shared(ov::element::i32, ov::Shape{1}, std::vector{1}); + const auto secondInput = + std::make_shared(ov::element::i32, ov::Shape{1}, std::vector{1}); const auto unsqueeze = std::make_shared( std::make_shared(parentNode->getOriginalOutputPrecisionAtPort(0), - parentNode->getOutputShapeAtPort(0).toPartialShape()), secondInput); + parentNode->getOutputShapeAtPort(0).toPartialShape()), + secondInput); unsqueeze->set_friendly_name(parentNode->getName() + "_abc_a1bc_" + std::to_string(j)); const auto cpuUnsqueeze = std::make_shared(unsqueeze, graph.getGraphContext()); @@ -3016,7 +2829,7 @@ void GraphOptimizer::RemoveSameConvert(Graph& graph) { } } -void GraphOptimizer::RemoveMemoryInputConvert(Graph &graph) { +void GraphOptimizer::RemoveMemoryInputConvert(Graph& graph) { auto& graphNodes = graph.GetNodes(); auto isSuitableNode = [](const NodePtr& node) { @@ -3042,7 +2855,7 @@ void GraphOptimizer::RemoveMemoryInputConvert(Graph &graph) { } } -void GraphOptimizer::RemoveConvertMemoryOutput(Graph &graph) { +void GraphOptimizer::RemoveConvertMemoryOutput(Graph& graph) { auto& graphNodes = graph.GetNodes(); auto isSuitableNode = [](const NodePtr& node) { @@ -3070,7 +2883,7 @@ void GraphOptimizer::RemoveConvertMemoryOutput(Graph &graph) { } } -void GraphOptimizer::MatchSdpaKvCache(Graph &graph) { +void GraphOptimizer::MatchSdpaKvCache(Graph& graph) { auto& graphNodes = graph.GetNodes(); auto isSuitableMemInput = [](const NodePtr& node) -> bool { @@ -3087,7 +2900,7 @@ void GraphOptimizer::MatchSdpaKvCache(Graph &graph) { if (Type::ScaledDotProductAttention == childNode->getType()) { if (childSdpa && childSdpa != childNode) { - //only one child SDPA supported + // only one child SDPA supported return false; } childSdpa = childNode; @@ -3130,7 +2943,7 @@ void GraphOptimizer::MatchSdpaKvCache(Graph &graph) { input_prc = ov::optional(node->getOriginalInputPrecisionAtPort(0)); } - //search for SDPA + // search for SDPA std::shared_ptr sdpa; for (auto&& edge : node->getChildEdgesAtPort(0)) { auto child = edge->getChild(); @@ -3144,19 +2957,18 @@ void GraphOptimizer::MatchSdpaKvCache(Graph &graph) { } } - //capture reference to the original mem output before graph transformations + // capture reference to the original mem output before graph transformations auto& memOutput = memInputNode->getOutputNode(); - auto memInputSdpa = std::make_shared( - memInputNode->getId(), - memInputNode->getName(), - memInputNode->getTypeStr(), - memInputNode->getOutputShapeAtPort(0), - 
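reshapeRnnSeq compensates for reshaping the 4-D RNNSeq output (whose second dimension is 1) down to 3-D by inserting an Unsqueeze in front of every consumer, so each child keeps seeing the original rank. A minimal construction of that subgraph with public opset classes; the Parameter stands in for the RNNSeq output and the function name is illustrative.

#include <memory>
#include <vector>
#include <openvino/core/partial_shape.hpp>
#include <openvino/op/constant.hpp>
#include <openvino/op/parameter.hpp>
#include <openvino/op/unsqueeze.hpp>

// Insert a dimension of size 1 at axis 1, turning an [a, b, c] output back into
// [a, 1, b, c] so downstream consumers of the original 4-D output are unaffected.
std::shared_ptr<ov::Node> make_a1bc_unsqueeze(const ov::PartialShape& abc_shape,
                                              ov::element::Type precision) {
    auto data = std::make_shared<ov::op::v0::Parameter>(precision, abc_shape);
    auto axis = std::make_shared<ov::op::v0::Constant>(ov::element::i32, ov::Shape{1},
                                                       std::vector<int32_t>{1});
    return std::make_shared<ov::op::v0::Unsqueeze>(data, axis);
}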
memInputNode->getOriginalOutputPrecisionAtPort(0), - graph.getGraphContext(), - input_shape, - input_prc, - sdpa); + auto memInputSdpa = std::make_shared(memInputNode->getId(), + memInputNode->getName(), + memInputNode->getTypeStr(), + memInputNode->getOutputShapeAtPort(0), + memInputNode->getOriginalOutputPrecisionAtPort(0), + graph.getGraphContext(), + input_shape, + input_prc, + sdpa); if (!memInputNode->getParentEdges().empty()) { auto parentEdge = memInputNode->getParentEdgeAt(0); @@ -3173,14 +2985,13 @@ void GraphOptimizer::MatchSdpaKvCache(Graph &graph) { graph.CreateEdge(memInputSdpa, child, 0, outputNum); } - //create a stub memory output - auto memOutputStub = std::make_shared( - memOutput.getId(), - memOutput.getName(), - memOutput.getTypeStr(), - memOutput.getInputShapeAtPort(0), - memOutput.getOriginalInputPrecisionAtPort(0), - graph.getGraphContext()); + // create a stub memory output + auto memOutputStub = std::make_shared(memOutput.getId(), + memOutput.getName(), + memOutput.getTypeStr(), + memOutput.getInputShapeAtPort(0), + memOutput.getOriginalInputPrecisionAtPort(0), + graph.getGraphContext()); auto memOutputEdge = memOutput.getParentEdgeAt(0); const auto inputNum = memOutputEdge->getInputNum(); @@ -3192,7 +3003,7 @@ void GraphOptimizer::MatchSdpaKvCache(Graph &graph) { } } -void GraphOptimizer::DropRedundantMemoryOutput(Graph &graph) { +void GraphOptimizer::DropRedundantMemoryOutput(Graph& graph) { // When we have a MemoryInput->MemoryOutput pair, that means that the state is immediately populated with the init // subgraph values when the init subgraph exists. In all the other cases the state is simply a read only object. // We can optimize such a case removing the MemoryOutput node and transferring the state values update @@ -3233,7 +3044,7 @@ void GraphOptimizer::DropRedundantMemoryOutput(Graph &graph) { } if (MemoryOutput && MemoryOutput != childNode) { - //only one child MemoryOutput is expected + // only one child MemoryOutput is expected return false; } MemoryOutput = childNode; @@ -3261,7 +3072,7 @@ void GraphOptimizer::DropRedundantMemoryOutput(Graph &graph) { inputPrc = ov::optional(node->getOriginalInputPrecisionAtPort(0)); } - //search for the MemoryOutputNode + // search for the MemoryOutputNode NodePtr memoryOutputNode; for (auto&& edge : node->getChildEdgesAtPort(0)) { auto child = edge->getChild(); @@ -3304,5 +3115,5 @@ void GraphOptimizer::DropRedundantMemoryOutput(Graph &graph) { } } -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/graph_optimizer.h b/src/plugins/intel_cpu/src/graph_optimizer.h index 886296a7c0053b..90cf9c41c0907e 100644 --- a/src/plugins/intel_cpu/src/graph_optimizer.h +++ b/src/plugins/intel_cpu/src/graph_optimizer.h @@ -16,43 +16,42 @@ class GraphOptimizer { public: void ApplyCommonGraphOptimizations(Graph& graph); void ApplyImplSpecificGraphOptimizations(Graph& graph); - void ShareReorders(Graph &graph); + void ShareReorders(Graph& graph); private: - void FuseConvMatmulFCDeconvAndDQScales(Graph &graph); - void FuseFCAndWeightsDecompression(Graph &graph); - void FuseConvolutionMatMulDeconvAndBias(Graph &graph); - void FuseDeconvolutionAndSimpleOperation(Graph &graph); - void FuseMultiplyAndAdd(Graph &graph); + void FuseConvMatmulFCDeconvAndDQScales(Graph& graph); + void FuseConvolutionMatMulDeconvAndBias(Graph& graph); + void FuseDeconvolutionAndSimpleOperation(Graph& graph); + void FuseMultiplyAndAdd(Graph& graph); void MergeConvertAndScaleShift(Graph& 
graph); void FuseFCAndConvertOnWeights(Graph& graph); void FuseFCAndTransposeOnWeights(Graph& graph); - void FuseFullyConnectedAndSimpleOperation(Graph &graph); - void FuseMatMulAndSimpleOperation(Graph &graph); - void FuseConvolutionAndSimpleOperationThroughMaxPool(Graph &graph); - void FuseConvolutionAndSimpleOperation(Graph &graph); - void FuseConvolutionAndDWConvolution(Graph &graph); - void FusePoolingAndFakeQuantize(Graph &graph); - void FuseConvolutionSumAndConvolutionSumActivation(Graph &graph); - void FuseMVNAndSimpleOperation(Graph &graph); - void FuseInterpolateAndSimpleOperation(Graph &graph); - void FuseNormalizeL2AndSimpleOperation(Graph &graph); - void FuseReduceAndSimpleOperation(Graph &graph); + void FuseFullyConnectedAndSimpleOperation(Graph& graph); + void FuseMatMulAndSimpleOperation(Graph& graph); + void FuseConvolutionAndSimpleOperationThroughMaxPool(Graph& graph); + void FuseConvolutionAndSimpleOperation(Graph& graph); + void FuseConvolutionAndDWConvolution(Graph& graph); + void FusePoolingAndFakeQuantize(Graph& graph); + void FuseConvolutionSumAndConvolutionSumActivation(Graph& graph); + void FuseMVNAndSimpleOperation(Graph& graph); + void FuseInterpolateAndSimpleOperation(Graph& graph); + void FuseNormalizeL2AndSimpleOperation(Graph& graph); + void FuseReduceAndSimpleOperation(Graph& graph); void DropDoubleReorders(Graph& graph); - void FuseConvolutionAndZeroPoints(Graph &graph); - void FuseBroadcastAndEltwise(Graph &graph); - void FuseEltwiseAndSimple(Graph &graph); - void FusePerformedAsScaleShiftAndFakeQuantize(Graph &graph); - void FuseClampAndFakeQuantize(Graph &graph); - void MergeTransposeAndReorder(Graph &graph); - void MergeReorderAndTranspose(Graph &graph); - void reshapeRnnSeq(Graph &graph); - void RemoveSameConvert(Graph &graph); - void RemoveMemoryInputConvert(Graph &graph); - void RemoveConvertMemoryOutput(Graph &graph); - void MatchSdpaKvCache(Graph &graph); - void DropRedundantMemoryOutput(Graph &graph); + void FuseConvolutionAndZeroPoints(Graph& graph); + void FuseBroadcastAndEltwise(Graph& graph); + void FuseEltwiseAndSimple(Graph& graph); + void FusePerformedAsScaleShiftAndFakeQuantize(Graph& graph); + void FuseClampAndFakeQuantize(Graph& graph); + void MergeTransposeAndReorder(Graph& graph); + void MergeReorderAndTranspose(Graph& graph); + void reshapeRnnSeq(Graph& graph); + void RemoveSameConvert(Graph& graph); + void RemoveMemoryInputConvert(Graph& graph); + void RemoveConvertMemoryOutput(Graph& graph); + void MatchSdpaKvCache(Graph& graph); + void DropRedundantMemoryOutput(Graph& graph); bool canBeInplaced(const NodePtr& parentNode, const NodePtr& childNode); // Method checks that after the sequential execution of Transpose and Reorder nodes, @@ -69,19 +68,22 @@ class GraphOptimizer { // Examples: // 1. Direct order, no Reshape node. // Before: [N,C,H,W]abcd==>Transpose(0312)==>[N,W,C,H]abcd==>Reorder(abcd->acdb)==>[N,W,C,H]acdb - // [N,C,H,W]abcd is equivalent to the [N,W,C,H]acdb, so the Transpose and Reorder can be fused into single optimized Reorder: - // After: [N,C,H,W]abcd==>Reorder(abcd->acdb, isOptimized=true)==>[N,W,C,H]acdb + // [N,C,H,W]abcd is equivalent to the [N,W,C,H]acdb, so the Transpose and Reorder can be fused into single + // optimized Reorder: After: [N,C,H,W]abcd==>Reorder(abcd->acdb, isOptimized=true)==>[N,W,C,H]acdb // 2. Reverse order, no Reshape node. 
// Before: [N,W,C,H]acdb==>Reorder(acdb->abcd)==>[N,W,C,H]abcd==>Transpose(0231)==>[N,C,H,W]abcd - // [N,W,C,H]acdb is equivalent to the [N,C,H,W]abcd, so the Transpose and Reorder can be fused into single optimized Reorder: - // After: [N,W,C,H]acdb==>Reorder(acdb->abcd, isOptimized=true)==>[N,C,H,W]abcd + // [N,W,C,H]acdb is equivalent to the [N,C,H,W]abcd, so the Transpose and Reorder can be fused into single + // optimized Reorder: After: [N,W,C,H]acdb==>Reorder(acdb->abcd, isOptimized=true)==>[N,C,H,W]abcd // 3. Direct order with Reshape node (L = H x w). - // Before: [N,L,C]abc==>Transpose(021)==>[N,C,L]abc==>Reshape==>[N,C,H,W]abcd==>Reoder(abcd->acdb)==>[N,C,H,W]acdb - // After: [N,L,C]abc==>Reorder(abc->acdb, isOptimized=true)==>[N,C,H,W]acdb + // Before: + // [N,L,C]abc==>Transpose(021)==>[N,C,L]abc==>Reshape==>[N,C,H,W]abcd==>Reoder(abcd->acdb)==>[N,C,H,W]acdb After: + // [N,L,C]abc==>Reorder(abc->acdb, isOptimized=true)==>[N,C,H,W]acdb // 4. Reverse order with Reshape node (L = H x W). - // Before: [N,C,H,W]acdb==>Reorder(acdb->abcd)==>[N,C,H,W]abcd==>Reshape==>[N,C,L]abc==>Transpose(021)==>[N,L,C]abc + // Before: + // [N,C,H,W]acdb==>Reorder(acdb->abcd)==>[N,C,H,W]abcd==>Reshape==>[N,C,L]abc==>Transpose(021)==>[N,L,C]abc // After: [N,C,H,W]acdb==>Reorder(acdb->abc, isOptimized=true)==>[N,L,C]abc - // Note: in some cases (inplace conflicts or transpose with blocked input and non-blocked output) the merged Reorder can not be optimized. + // Note: in some cases (inplace conflicts or transpose with blocked input and non-blocked output) the merged Reorder + // can not be optimized. void mergeTransposeReshapeReorder(Graph& graph, const NodePtr& transposeNode, const NodePtr& reshapeNode, @@ -89,5 +91,5 @@ class GraphOptimizer { const bool reverseOrder); }; -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/infer_request.cpp b/src/plugins/intel_cpu/src/infer_request.cpp index 26cdaf0860168a..44b9904bde202a 100644 --- a/src/plugins/intel_cpu/src/infer_request.cpp +++ b/src/plugins/intel_cpu/src/infer_request.cpp @@ -5,28 +5,26 @@ #include "infer_request.h" #include "async_infer_request.h" -#include "compiled_model.h" #include "dnnl_extension_utils.h" #include "itt.h" -#include "memory_state.h" -#include "nodes/common/cpu_convert.h" #include "memory_desc/cpu_memory_desc_utils.h" +#include "nodes/common/cpu_convert.h" #include "nodes/memory_state_base.h" #include "openvino/core/shape.hpp" #include "openvino/runtime/make_tensor.hpp" #include "openvino/runtime/tensor.hpp" +#include "openvino/runtime/threading/cpu_message.hpp" #include "proxy_mem_blk.h" #include "utils/general_utils.h" #include "utils/ngraph_utils.hpp" -#include "openvino/runtime/threading/cpu_message.hpp" using OvString = ov::element_type_traits::value_type; namespace ov { namespace intel_cpu { -SyncInferRequest::SyncInferRequest(std::shared_ptr compiled_model) - : ov::ISyncInferRequest(compiled_model), - m_compiled_model(compiled_model) { +SyncInferRequest::SyncInferRequest(CompiledModelHolder compiled_model) + : ov::ISyncInferRequest(compiled_model.compiled_model()), + m_compiled_model(std::move(compiled_model)) { const auto& inputs = get_inputs(); for (std::size_t input_index = 0; input_index < inputs.size(); input_index++) { m_input_ports_map[input_index] = inputs[input_index]; @@ -40,13 +38,8 @@ SyncInferRequest::SyncInferRequest(std::shared_ptr compiled } void SyncInferRequest::create_infer_request() { - auto id = 
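For the direct case in the example list above (case 1), the fused Reorder degenerates to a no-op exactly when the Reorder's destination order is the inverse permutation of the transpose order, which is what makes [N,C,H,W]abcd and [N,W,C,H]acdb the same memory. A self-contained check of that composition follows; the real checkAscendingFinalOrder also covers the reverse order and the Reshape-adjusted cases, so this is only the simplest branch.

#include <cstddef>
#include <vector>

// Transpose(order) followed by Reorder(plain -> layout_order) leaves the
// underlying memory untouched exactly when layout_order is the inverse
// permutation of the transpose order.
bool transpose_then_reorder_is_identity(const std::vector<size_t>& transpose_order,
                                        const std::vector<size_t>& layout_order) {
    if (transpose_order.size() != layout_order.size())
        return false;
    for (size_t i = 0; i < layout_order.size(); ++i) {
        if (transpose_order[layout_order[i]] != i)
            return false;
    }
    return true;
}

// Example 1 from the comment: Transpose(0312) + Reorder(abcd -> acdb):
// transpose_then_reorder_is_identity({0, 3, 1, 2}, {0, 2, 3, 1}) == true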
(m_compiled_model->m_numRequests)++; - m_profiling_task = openvino::itt::handle("INTEL_CPU_INFER_" + m_compiled_model->m_name + "_" + std::to_string(id)); - - if (m_compiled_model->m_graphs.size() == 0) { - OPENVINO_THROW("No graph was found"); - } - m_graph = &(m_compiled_model->get_graph()._graph); + m_profiling_task = openvino::itt::handle("INTEL_CPU_INFER_" + m_compiled_model.name() + "_" + + std::to_string(m_compiled_model.id())); // Alocate memory for each tensor if static shape for (const auto& it : m_input_ports_map) { @@ -56,36 +49,17 @@ void SyncInferRequest::create_infer_request() { init_tensor(it.first, ov::ISyncInferRequest::FoundPort::Type::OUTPUT); } - //create states according to the list of the MemoryStateNodes - for (auto&& node : m_graph->getInternalStateNodes()) { - m_memory_states.emplace_back(node.second->makeState()); - } -} - -SyncInferRequest::~SyncInferRequest() { - --(m_compiled_model->m_numRequests); + // create states according to the list of the MemoryStateNodes + m_memory_states = m_compiled_model.graph().memoryStates(); } -// state -> storage -void SyncInferRequest::assign_states() { - auto&& graph_internal_state_nodes = m_graph->getInternalStateNodes(); - for (const auto& state : m_memory_states) { - auto itr = graph_internal_state_nodes.find(state->get_name()); - if (itr != graph_internal_state_nodes.end()) { - itr->second->assignState(state); - } - } -} - -void SyncInferRequest::redefine_memory_for_input_nodes() { - const auto cpuInputNodes = m_graph->GetInputNodesMap(); +void SyncInferRequest::redefine_memory_for_input_nodes(Graph& graph) { for (const auto& input_port : m_input_ports_map) { - const auto inputNode = cpuInputNodes.find(input_port.first); - if (inputNode == cpuInputNodes.end()) - OPENVINO_THROW("CPU execution graph doesn't contain input node with index: ", input_port.first); - if (inputNode->second->isDynamicNode()) { + auto inputNode = graph.getInputNodeByIndex(input_port.first); + OPENVINO_ASSERT(inputNode, "CPU execution graph doesn't contain input node with index: ", input_port.first); + if (inputNode->isDynamicNode()) { auto tensor = get_tensor(input_port.second); - inputNode->second->redefineOutputMemory({tensor->get_shape()}); + inputNode->redefineOutputMemory({tensor->get_shape()}); } } } @@ -103,8 +77,8 @@ void SyncInferRequest::update_external_tensor_ptrs() { void SyncInferRequest::infer() { using namespace openvino::itt; OV_ITT_SCOPED_TASK(itt::domains::intel_cpu, m_profiling_task); - auto graphLock = m_compiled_model->get_graph(); - m_graph = &(graphLock._graph); + auto graphLock = m_compiled_model.lock(); + auto&& graph = graphLock._graph; auto message = ov::threading::message_manager(); throw_if_canceled(); @@ -120,40 +94,41 @@ void SyncInferRequest::infer() { update_external_tensor_ptrs(); } - if (m_graph->hasDynamicInput()) { - redefine_memory_for_input_nodes(); + if (graph.hasDynamicInput()) { + redefine_memory_for_input_nodes(graph); } - change_default_ptr(); + change_default_ptr(graph); throw_if_canceled(); // state -> node if (!m_memory_states.empty()) { - assign_states(); + graph.assignStates(m_memory_states); } - push_input_data(); + push_input_data(graph); - m_graph->Infer(this); + graph.Infer(this); throw_if_canceled(); // update output control blocks, if any, in order to refresh internal buffers - if (m_graph->IsDynamic()) { + if (graph.IsDynamic()) { for (auto&& item : m_outputControlBlocks) { item.second.update(); } } - m_graph->PullOutputData(m_outputs); + graph.PullOutputData(m_outputs); } std::vector
SyncInferRequest::get_profiling_info() const { - if (!m_graph || !m_graph->IsReady()) + auto&& graph = m_compiled_model.graph(); + if (!graph.IsReady()) OPENVINO_THROW("Graph is not ready!"); std::vector perfMap; - m_graph->GetPerfData(perfMap); + graph.GetPerfData(perfMap); return perfMap; } @@ -162,7 +137,7 @@ static inline void change_edge_ptr(const EdgePtr& edge, ov::SoPtr& OPENVINO_ASSERT(mem != nullptr, "Edge with name '", *edge, "' doesn't have allocated memory object."); if (tensor->get_element_type() == element::string) { - auto memBlock = dynamic_cast(mem.get())->getStringMemoryBlockPtr(); + auto memBlock = dynamic_cast(mem.get())->getStringMemoryBlockPtr(); OPENVINO_ASSERT(memBlock); memBlock->setExtBuff(tensor->data(), tensor->get_size()); } else { @@ -172,27 +147,23 @@ static inline void change_edge_ptr(const EdgePtr& edge, ov::SoPtr& } } -void SyncInferRequest::change_default_ptr() { - const auto& inputNodesMap = m_graph->GetInputNodesMap(); - const auto& outputNodesMap = m_graph->GetOutputNodesMap(); - +void SyncInferRequest::change_default_ptr(Graph& graph) { std::unordered_set inputPtrs; - std::function& tensor)> changeInpPtr; - if (m_graph->IsDynamic()) { - changeInpPtr = [&inputPtrs](const EdgePtr &edge, ov::SoPtr& tensor) { + std::function& tensor)> changeInpPtr; + if (graph.IsDynamic()) { + changeInpPtr = [&inputPtrs](const EdgePtr& edge, ov::SoPtr& tensor) { change_edge_ptr(edge, tensor); inputPtrs.insert(tensor->data()); }; } else { - changeInpPtr = [](const EdgePtr &edge, ov::SoPtr& tensor) { + changeInpPtr = [](const EdgePtr& edge, ov::SoPtr& tensor) { change_edge_ptr(edge, tensor); }; } for (auto& it : m_input_external_ptr) { - auto input = inputNodesMap.find(it.first); - OPENVINO_ASSERT(inputNodesMap.end() != input, "Cannot find input tensor with index: ", it.first); - NodePtr inputNodePtr = input->second; + auto inputNodePtr = graph.getInputNodeByIndex(it.first); + OPENVINO_ASSERT(inputNodePtr, "Cannot find input tensor with index: ", it.first); if (inputNodePtr->getDstDataAtPort(0) == static_cast(it.second->data())) continue; auto& childEdges = inputNodePtr->getChildEdges(); @@ -238,9 +209,9 @@ void SyncInferRequest::change_default_ptr() { } for (auto& it : m_output_external_ptr) { - auto output = outputNodesMap.find(it.first); - OPENVINO_ASSERT(outputNodesMap.end() != output, "Cannot find output tensor with index: ", it.first); - auto parentEdge = output->second->getParentEdgeAt(0); + auto output = graph.getOutputNodeByIndex(it.first); + OPENVINO_ASSERT(output, "Cannot find output tensor with index: ", it.first); + auto parentEdge = output->getParentEdgeAt(0); void* const outputRawPtr = parentEdge->getMemory().getData(); if (outputRawPtr == static_cast(it.second->data())) continue; @@ -278,33 +249,42 @@ void SyncInferRequest::change_default_ptr() { change_edge_ptr(parentEdge, it.second); } - if (m_graph->IsDynamic()) { - const auto &outMemBlocksMap = m_graph->getOutputNodesMemBlocksMap(); + if (graph.IsDynamic()) { + const auto& outMemBlocksMap = graph.getOutputNodesMemBlocksMap(); for (auto&& item : outMemBlocksMap) { - const auto& name = item.first; + const auto index = item.first; // share intel_cpu::Tensor to Graph by injecting to corresponding ProxyMemoryBlock instance. 
auto outputMemBlock = item.second; - OPENVINO_ASSERT(outputMemBlock, "proxy mem block for output ", name, " is empty."); + OPENVINO_ASSERT(outputMemBlock, "proxy mem block for output ", index, " is empty."); - auto controlBlockItr = m_outputControlBlocks.find(name); + auto controlBlockItr = m_outputControlBlocks.find(index); if (controlBlockItr != m_outputControlBlocks.end()) { - auto output = outputNodesMap.find(name); - OPENVINO_ASSERT(outputNodesMap.end() != output, "Node with name: ", name, " is absent in the outputNodesMap"); - auto parentEdge = output->second->getParentEdgeAt(0); - //avoid cyclic memory use + auto output = graph.getOutputNodeByIndex(index); + OPENVINO_ASSERT(output, "Output with index: ", index, " is absent in the outputNodesMap"); + auto parentEdge = output->getParentEdgeAt(0); + // avoid cyclic memory use auto&& controlBlock = controlBlockItr->second; - std::shared_ptr memBlock = inputPtrs.count(controlBlock.rawPtr()) ? // same memory is used on the input and output - controlBlock.nextMemBlock() : // then swap internal buffer to avoid data corruption - controlBlock.currentMemBlock(); // else reuse the existing buffer + std::shared_ptr memBlock = + inputPtrs.count(controlBlock.rawPtr()) ? // same memory is used on the input and output + controlBlock.nextMemBlock() + : // then swap internal buffer to avoid data corruption + controlBlock.currentMemBlock(); // else reuse the existing buffer outputMemBlock->setMemBlockResize(memBlock); - DEBUG_LOG("reset proxy ", outputMemBlock, ", actual ", controlBlock.currentMemBlock(), " graph ", m_graph, " inferrequest ", this); - DEBUG_LOG(name, ", tensor ", controlBlock.tensor()); + DEBUG_LOG("reset proxy ", + outputMemBlock, + ", actual ", + controlBlock.currentMemBlock(), + " graph ", + &graph, + " infer request ", + this); + DEBUG_LOG(index, ", tensor ", controlBlock.tensor()); } else { - outputMemBlock->reset(); // switch to the internal memory since memory sharing is no longer possible + outputMemBlock->reset(); // switch to the internal memory since memory sharing is no longer possible } } } @@ -401,7 +381,12 @@ void SyncInferRequest::set_tensor(const ov::Output& in_port, con " are different."); } - MemoryDescPtr actualDesc = m_graph->getInputNodeByIndex(input_index)->getBaseMemDescAtOutputPort(0); + auto&& graph = m_compiled_model.graph(); + + auto inputNode = graph.getInputNodeByIndex(input_index); + OPENVINO_ASSERT(inputNode, "CPU execution graph doesn't contain input node with index: ", input_index); + + MemoryDescPtr actualDesc = inputNode->getBaseMemDescAtOutputPort(0); if (!actualDesc->isDefined()) { // we must define desc for dynamic case // otherwise we got incorrect check on shape compatibility inside isCompatible @@ -448,7 +433,11 @@ void SyncInferRequest::set_tensor(const ov::Output& in_port, con " are different."); } - const auto& desc = m_graph->getOutputNodeByIndex(output_index)->getParentEdgeAt(0)->getMemory().getDesc(); + auto&& graph = m_compiled_model.graph(); + + auto outputNode = graph.getOutputNodeByIndex(output_index); + OPENVINO_ASSERT(outputNode, "CPU execution graph doesn't contain output node with index: ", output_index); + const auto& desc = outputNode->getParentEdgeAt(0)->getMemory().getDesc(); if (!isDynamic && mem_desc_ptr->isCompatible(desc)) { m_output_external_ptr[output_index] = tensor; } else if (m_output_external_ptr.find(output_index) != m_output_external_ptr.end()) { @@ -456,12 +445,13 @@ void SyncInferRequest::set_tensor(const ov::Output& in_port, con } m_outputs[output_index] = 
tensor; - m_outputControlBlocks.erase(output_index); // now the memory is under user's control + m_outputControlBlocks.erase(output_index); // now the memory is under user's control } ov::ISyncInferRequest::set_tensor(port, tensor); } -void SyncInferRequest::set_tensors_impl(const ov::Output port, const std::vector>& tensors) { +void SyncInferRequest::set_tensors_impl(const ov::Output port, + const std::vector>& tensors) { if (find_port(port).is_input()) { m_batched_tensors[port.get_tensor_ptr()] = tensors; return; @@ -471,15 +461,15 @@ void SyncInferRequest::set_tensors_impl(const ov::Output port, c void SyncInferRequest::init_tensor(const std::size_t& port_index, const ov::ISyncInferRequest::FoundPort::Type& type) { OV_ITT_SCOPED_TASK(itt::domains::intel_cpu, "init_tensor"); - if (!m_graph || !m_graph->IsReady()) - OPENVINO_THROW("Graph is not ready!"); + auto&& graph = m_compiled_model.graph(); + OPENVINO_ASSERT(graph.IsReady(), "Graph is not ready!"); ov::SoPtr tensor; if (type == ov::ISyncInferRequest::FoundPort::Type::INPUT) { - OPENVINO_ASSERT(m_graph->GetInputNodesMap().find(port_index) != m_graph->GetInputNodesMap().end(), + OPENVINO_ASSERT(graph.getInputNodeByIndex(port_index), "Tensor with index: ", port_index, - " exists in CPU plugin graph, but absents in model inputs"); + " absent in the plugin's graph inputs"); const auto& port = m_input_ports_map[port_index]; tensor = ov::ISyncInferRequest::get_tensor(port); @@ -500,8 +490,9 @@ void SyncInferRequest::init_tensor(const std::size_t& port_index, const ov::ISyn if (!isDynamic) { auto mem_desc_ptr = MemoryDescUtils::generateCpuBlockedMemoryDesc(tensor); - if (mem_desc_ptr->isCompatible( - m_graph->getInputNodeByIndex(port_index)->getChildEdgeAt(0)->getMemory().getDesc())) { + auto inputNode = graph.getInputNodeByIndex(port_index); + OPENVINO_ASSERT(inputNode, "CPU execution graph doesn't contain input node with index: ", port_index); + if (mem_desc_ptr->isCompatible(inputNode->getChildEdgeAt(0)->getMemory().getDesc())) { m_input_external_ptr[port_index] = tensor; } } @@ -509,16 +500,12 @@ void SyncInferRequest::init_tensor(const std::size_t& port_index, const ov::ISyn } if (type == ov::ISyncInferRequest::FoundPort::Type::OUTPUT) { - const auto& outMap = m_graph->GetOutputNodesMap(); - auto output = outMap.find(port_index); - OPENVINO_ASSERT(output != outMap.end(), - "Tensor with index: ", - port_index, - " exists in CPU plugin graph, but absents in model outputs"); + auto output = graph.getOutputNodeByIndex(port_index); + OPENVINO_ASSERT(output, "Tensor with index: ", port_index, " absent in the plugin's graph outputs"); if (m_outputs.find(port_index) == m_outputs.end()) { const auto& port = m_output_ports_map[port_index]; const auto& port_shape = port.get_partial_shape(); - const auto& graph_shape = output->second->getInputShapeAtPort(0); + const auto& graph_shape = output->getInputShapeAtPort(0); // WA, due to the transformations and constant folding, shape inference of the resulting model may // have static shapes, while they are dynamic in the initial representation @@ -541,22 +528,22 @@ void SyncInferRequest::init_tensor(const std::size_t& port_index, const ov::ISyn } dnnl::engine eng(dnnl::engine::kind::cpu, 0); - CpuBlockedMemoryDescPtr desc = std::make_shared(model_prec, Shape{memDims}); + CpuBlockedMemoryDescPtr desc = + std::make_shared(model_prec, Shape{memDims}); auto memory = std::make_shared(eng, desc); tensor = std::make_shared(memory); } else { - const auto graph_prec = - 
output->second->getParentEdgeAt(0)->getMemory().getDesc().getPrecision(); + const auto graph_prec = output->getParentEdgeAt(0)->getMemory().getDesc().getPrecision(); OutputControlBlock control_block{model_prec, Shape{shape}}; DEBUG_LOG(port_index, - ", tensor ", - control_block.tensor(), - ", memBlock ", - control_block.tensor()->get_memory()->getMemoryBlock(), - "memory object ", - control_block.tensor()->get_memory().get()); + ", tensor ", + control_block.tensor(), + ", memBlock ", + control_block.tensor()->get_memory()->getMemoryBlock(), + "memory object ", + control_block.tensor()->get_memory().get()); tensor = control_block.tensor(); if (model_prec == graph_prec) @@ -571,7 +558,7 @@ void SyncInferRequest::init_tensor(const std::size_t& port_index, const ov::ISyn m_outputs[port_index] = tensor; if (!port_shape.is_dynamic() && !m_output_external_ptr.count(port_index)) { auto desc = MemoryDescUtils::generateCpuBlockedMemoryDesc(tensor); - if (desc->isCompatible(output->second->getParentEdgeAt(0)->getMemory().getDesc())) { + if (desc->isCompatible(output->getParentEdgeAt(0)->getMemory().getDesc())) { m_output_external_ptr[port_index] = tensor; } } @@ -589,10 +576,10 @@ void SyncInferRequest::init_tensor(const std::size_t& port_index, const ov::ISyn return; } -void SyncInferRequest::push_input_data() { +void SyncInferRequest::push_input_data(Graph& graph) { for (auto& input : m_input_ports_map) { auto tensor = get_tensor(input.second); - m_graph->PushInputData(input.first, tensor); + graph.PushInputData(input.first, tensor); } } @@ -602,7 +589,7 @@ SyncInferRequest::OutputControlBlock::OutputControlBlock(const ov::element::Type m_proxyMemBlock = std::make_shared(m_buffers[m_buffIndx]); VectorDims memDims; - if (shape.isDynamic()) { // this is a WA since the ITensor doesn't allow dyn shapes + if (shape.isDynamic()) { // this is a WA since the ITensor doesn't allow dyn shapes for (auto&& item : shape.getDims()) { memDims.push_back(item != Shape::UNDEFINED_DIM ? 
item : 0); } @@ -610,8 +597,7 @@ SyncInferRequest::OutputControlBlock::OutputControlBlock(const ov::element::Type memDims = shape.getStaticDims(); } - CpuBlockedMemoryDescPtr desc = - std::make_shared(precision, Shape{memDims}); + CpuBlockedMemoryDescPtr desc = std::make_shared(precision, Shape{memDims}); auto memory = std::make_shared(eng, desc, m_proxyMemBlock); m_tensor = std::make_shared(memory); @@ -649,6 +635,5 @@ void SyncInferRequest::sub_streams_infer() { } } -} // namespace intel_cpu -} // namespace ov - +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/infer_request.h b/src/plugins/intel_cpu/src/infer_request.h index a9def63d359744..daae553dff2ea4 100644 --- a/src/plugins/intel_cpu/src/infer_request.h +++ b/src/plugins/intel_cpu/src/infer_request.h @@ -4,22 +4,21 @@ #pragma once -#include "graph.h" +#include "compiled_model.h" #include "cpu_tensor.h" +#include "graph.h" +#include "memory_state.h" #include "openvino/runtime/iinfer_request.hpp" #include "openvino/runtime/isync_infer_request.hpp" -#include "memory_state.h" namespace ov { namespace intel_cpu { -class CompiledModel; class AsyncInferRequest; class SyncInferRequest : public ov::ISyncInferRequest { public: - SyncInferRequest(std::shared_ptr compiled_model); - virtual ~SyncInferRequest(); + SyncInferRequest(CompiledModelHolder compiled_model); void infer() override; @@ -29,7 +28,8 @@ class SyncInferRequest : public ov::ISyncInferRequest { void set_tensor(const ov::Output& port, const ov::SoPtr& tensor) override; - void set_tensors_impl(const ov::Output port, const std::vector>& tensors) override; + void set_tensors_impl(const ov::Output port, + const std::vector>& tensors) override; ov::SoPtr get_tensor(const ov::Output& port) const override; std::vector> get_tensors(const ov::Output& _port) const override; @@ -96,11 +96,10 @@ class SyncInferRequest : public ov::ISyncInferRequest { void create_infer_request(); void init_tensor(const std::size_t& port_index, const ov::ISyncInferRequest::FoundPort::Type& type); - void push_input_data(); - void redefine_memory_for_input_nodes(); - void assign_states(); + void push_input_data(Graph& graph); + void redefine_memory_for_input_nodes(Graph& graph); void update_external_tensor_ptrs(); - void change_default_ptr(); + void change_default_ptr(Graph& graph); const ov::Output& get_internal_port(const ov::Output& port) const; @@ -109,14 +108,13 @@ class SyncInferRequest : public ov::ISyncInferRequest { private: std::unordered_map m_outputControlBlocks; - Graph* m_graph = nullptr; std::unordered_map> m_input_external_ptr; std::unordered_map> m_output_external_ptr; - std::shared_ptr m_compiled_model; openvino::itt::handle_t m_profiling_task; std::vector m_memory_states; AsyncInferRequest* m_asyncRequest = nullptr; + CompiledModelHolder m_compiled_model; std::unordered_map> m_input_ports_map; std::unordered_map> m_output_ports_map; diff --git a/src/plugins/intel_cpu/src/memory_control.cpp b/src/plugins/intel_cpu/src/memory_control.cpp index 0f202c296891c1..26cd8459458b9d 100644 --- a/src/plugins/intel_cpu/src/memory_control.cpp +++ b/src/plugins/intel_cpu/src/memory_control.cpp @@ -16,8 +16,7 @@ namespace { class StaticPartitionMemoryBlock : public IMemoryBlockObserver { public: - StaticPartitionMemoryBlock(MemoryBlockPtr pBlock, ptrdiff_t offset) - : m_pBlock(pBlock), m_offset(offset) { + StaticPartitionMemoryBlock(MemoryBlockPtr pBlock, ptrdiff_t offset) : m_pBlock(pBlock), m_offset(offset) { OPENVINO_ASSERT(m_pBlock, "Memory block is uninitialized"); } 
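The infer_request.cpp and infer_request.h hunks above replace the request's cached Graph* m_graph and std::shared_ptr<CompiledModel> members with a CompiledModelHolder value that is queried on every call, and the graph-dependent helpers (push_input_data, redefine_memory_for_input_nodes, change_default_ptr) now receive the Graph& explicitly. The sketch below is inferred purely from the call sites visible in this diff (compiled_model(), name(), id(), graph(), and lock() returning an object with a public _graph member); it is illustrative only, uses stand-in types, and may differ from the real class defined in compiled_model.h.

// Hypothetical sketch of the interface the new SyncInferRequest code relies on.
// Stand-in types keep the sketch self-contained; they are not the real plugin types.
#include <memory>
#include <string>
#include <utility>

namespace sketch {

struct Graph {};      // stand-in for ov::intel_cpu::Graph
struct GraphLock {    // stand-in for whatever lock() returns
    Graph& _graph;    // infer() does: auto graphLock = m_compiled_model.lock(); auto&& graph = graphLock._graph;
};
struct CompiledModel {  // stand-in for ov::intel_cpu::CompiledModel
    std::string name;
    Graph graph;
};

class CompiledModelHolder {
public:
    CompiledModelHolder(std::shared_ptr<CompiledModel> model, int request_id)
        : m_model(std::move(model)),
          m_id(request_id) {}

    // Passed to the ov::ISyncInferRequest base constructor.
    std::shared_ptr<CompiledModel> compiled_model() const { return m_model; }

    // Used to build the ITT profiling task name: "INTEL_CPU_INFER_" + name() + "_" + std::to_string(id()).
    const std::string& name() const { return m_model->name; }
    int id() const { return m_id; }

    // Cheap, non-exclusive access used by init_tensor(), set_tensor() and get_profiling_info().
    Graph& graph() const { return m_model->graph; }

    // Exclusive access for the inference itself.
    GraphLock lock() const { return GraphLock{m_model->graph}; }

private:
    std::shared_ptr<CompiledModel> m_model;
    int m_id;
};

}  // namespace sketch

Passing the Graph& down into the helpers, instead of caching a raw Graph* in the request, ties every call to the graph instance obtained for that particular inference. The removed destructor suggests that the per-model request counting now lives inside the holder as well, though that part is not visible in this diff.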
@@ -92,7 +91,7 @@ class IMemoryManager { using MemoryManagerPtr = std::shared_ptr; -template +template std::shared_ptr makeDnnlMemoryBlock(Args&&... args) { return std::make_shared(make_unique(std::forward(args)...)); } @@ -152,10 +151,12 @@ class MemoryManagerStatic : public IMemoryManager { } void allocate() override { - if (m_workspace) m_workspace->resize(m_totalSize); + if (m_workspace) + m_workspace->resize(m_totalSize); } void release() override { - if (m_workspace) m_workspace->free(); + if (m_workspace) + m_workspace->free(); } private: @@ -171,14 +172,13 @@ class MemoryManageNonOverlapingSets : public IMemoryManager { void insert(const MemoryRegion& reg) override { MemorySolver::Box box = {reg.start, reg.finish, reg.size, reg.id}; if (-1 != reg.finish) { - //We have to extend the lifespan of tensors that are crossing a sync point border in order to save - //the intermediate computation results from possible loss due to the tensor resize - auto itr_upper = - std::upper_bound(m_syncInds.begin(), m_syncInds.end(), box.finish, [](int y, int x) { - return y <= x; - }); + // We have to extend the lifespan of tensors that are crossing a sync point border in order to save + // the intermediate computation results from possible loss due to the tensor resize + auto itr_upper = std::upper_bound(m_syncInds.begin(), m_syncInds.end(), box.finish, [](int y, int x) { + return y <= x; + }); auto itr_lower = std::lower_bound(m_syncInds.begin(), m_syncInds.end(), box.start); - if (itr_lower != itr_upper) { // across sections + if (itr_lower != itr_upper) { // across sections if (itr_upper == m_syncInds.end()) { box.finish = -1; } else { @@ -201,7 +201,7 @@ class MemoryManageNonOverlapingSets : public IMemoryManager { void solve() { ov::MemorySolver::normalize_boxes(m_boxes); - std::vector> groups; //groups of nonoverlapping boxes + std::vector> groups; // groups of nonoverlapping boxes groups.push_back({m_boxes.front()}); for (size_t i = 1; i < m_boxes.size(); ++i) { const auto& box = m_boxes[i]; @@ -229,7 +229,7 @@ class MemoryManageNonOverlapingSets : public IMemoryManager { } void allocate() override { - //nothing to do + // nothing to do } void release() override { for (auto&& item : m_internalBlocks) { @@ -305,15 +305,17 @@ MemoryControl::MemoryControl(std::vector syncInds) { })); // handler for static tensors - m_handlers.emplace_back(buildHandler([](const MemoryRegion& reg) { - if (reg.size >= 0 || MemoryRegion::RegionType::VARIABLE != reg.type || - MemoryRegion::AllocType::POD != reg.alloc_type) { - return false; - } - return true; - }, std::move(syncInds))); + m_handlers.emplace_back(buildHandler( + [](const MemoryRegion& reg) { + if (reg.size >= 0 || MemoryRegion::RegionType::VARIABLE != reg.type || + MemoryRegion::AllocType::POD != reg.alloc_type) { + return false; + } + return true; + }, + std::move(syncInds))); - //handler for I/O tensors, so far simply individual blocks + // handler for I/O tensors, so far simply individual blocks m_handlers.emplace_back(buildHandler([](const MemoryRegion& reg) { if (MemoryRegion::RegionType::VARIABLE == reg.type || reg.alloc_type != MemoryRegion::AllocType::POD) { return false; diff --git a/src/plugins/intel_cpu/src/memory_desc/blocked_memory_desc.cpp b/src/plugins/intel_cpu/src/memory_desc/blocked_memory_desc.cpp index 4b75d5c5263398..7dff6905df09d9 100644 --- a/src/plugins/intel_cpu/src/memory_desc/blocked_memory_desc.cpp +++ b/src/plugins/intel_cpu/src/memory_desc/blocked_memory_desc.cpp @@ -15,9 +15,9 @@ namespace intel_cpu { constexpr 
BlockedMemoryDesc::CmpMask BlockedMemoryDesc::FULL_MASK; constexpr BlockedMemoryDesc::CmpMask BlockedMemoryDesc::EMPTY_MASK; constexpr BlockedMemoryDesc::CmpMask BlockedMemoryDesc::SKIP_OFFSET_MASK; -constexpr size_t BlockedMemoryDesc::OFFSET_MASK_POS; +constexpr size_t BlockedMemoryDesc::OFFSET_MASK_POS; -bool BlockedMemoryDesc::isCompatibleInternal(const BlockedMemoryDesc &rhs, CmpMask cmpMask) const { +bool BlockedMemoryDesc::isCompatibleInternal(const BlockedMemoryDesc& rhs, CmpMask cmpMask) const { if (this->getShape() != rhs.getShape() || this->getPrecision() != rhs.getPrecision()) return false; @@ -77,5 +77,5 @@ std::string BlockedMemoryDesc::serializeFormat() const { return result.str(); } -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/memory_desc/blocked_memory_desc.h b/src/plugins/intel_cpu/src/memory_desc/blocked_memory_desc.h index d938a4ba585602..9ff132965bdc0b 100644 --- a/src/plugins/intel_cpu/src/memory_desc/blocked_memory_desc.h +++ b/src/plugins/intel_cpu/src/memory_desc/blocked_memory_desc.h @@ -21,7 +21,7 @@ class BlockedMemoryDesc : public virtual MemoryDesc { static constexpr CmpMask FULL_MASK{0xffffffff}; static constexpr CmpMask EMPTY_MASK{0x0}; static constexpr CmpMask SKIP_OFFSET_MASK{0x7fffffff}; - static constexpr size_t OFFSET_MASK_POS{31}; + static constexpr size_t OFFSET_MASK_POS{31}; /** * @brief Returns the blocked dimensions @@ -73,7 +73,7 @@ class BlockedMemoryDesc : public virtual MemoryDesc { * * @return the result of the compatibility check */ - virtual bool isCompatible(const BlockedMemoryDesc &rhs, CmpMask cmpMask) const = 0; + virtual bool isCompatible(const BlockedMemoryDesc& rhs, CmpMask cmpMask) const = 0; using MemoryDesc::isCompatible; ~BlockedMemoryDesc() override = default; @@ -88,7 +88,7 @@ class BlockedMemoryDesc : public virtual MemoryDesc { * Doesn't perform descs specific attributes check * @return true if compatible, otherwise false */ - bool isCompatibleInternal(const BlockedMemoryDesc &rhs, CmpMask cmpMask = FULL_MASK) const; + bool isCompatibleInternal(const BlockedMemoryDesc& rhs, CmpMask cmpMask = FULL_MASK) const; mutable VectorDims blockedDims; mutable VectorDims strides; @@ -99,5 +99,5 @@ class BlockedMemoryDesc : public virtual MemoryDesc { using BlockedMemoryDescPtr = std::shared_ptr; using BlockedMemoryDescCPtr = std::shared_ptr; -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/memory_desc/cpu_blocked_memory_desc.cpp b/src/plugins/intel_cpu/src/memory_desc/cpu_blocked_memory_desc.cpp index d1c50d0048c57d..c95463207a9c46 100644 --- a/src/plugins/intel_cpu/src/memory_desc/cpu_blocked_memory_desc.cpp +++ b/src/plugins/intel_cpu/src/memory_desc/cpu_blocked_memory_desc.cpp @@ -3,6 +3,7 @@ // #include "cpu_blocked_memory_desc.h" + #include "dnnl_blocked_memory_desc.h" #include "utils/general_utils.h" @@ -15,17 +16,27 @@ static VectorDims makeRange(size_t size) { return retVec; } -CpuBlockedMemoryDesc::CpuBlockedMemoryDesc(ov::element::Type prc, const Shape& shape) : - CpuBlockedMemoryDesc(prc, shape, shape.getDims(), makeRange(shape.getDims().size())) {} - -CpuBlockedMemoryDesc::CpuBlockedMemoryDesc(ov::element::Type prc, const Shape& shape, const VectorDims& blockedDims, - const VectorDims& order, size_t offsetPadding, const VectorDims& offsetPaddingToData, - const VectorDims& strides) : MemoryDesc(shape, Blocked), precision(prc) { - if (std::any_of(order.begin(), 
order.end(), [](size_t val) { return val == Shape::UNDEFINED_DIM; })) { +CpuBlockedMemoryDesc::CpuBlockedMemoryDesc(ov::element::Type prc, const Shape& shape) + : CpuBlockedMemoryDesc(prc, shape, shape.getDims(), makeRange(shape.getDims().size())) {} + +CpuBlockedMemoryDesc::CpuBlockedMemoryDesc(ov::element::Type prc, + const Shape& shape, + const VectorDims& blockedDims, + const VectorDims& order, + size_t offsetPadding, + const VectorDims& offsetPaddingToData, + const VectorDims& strides) + : MemoryDesc(shape, Blocked), + precision(prc) { + if (std::any_of(order.begin(), order.end(), [](size_t val) { + return val == Shape::UNDEFINED_DIM; + })) { OPENVINO_THROW("CpuBlockedMemoryDesc do not support undefined order."); } - if (std::any_of(blockedDims.begin() + shape.getRank(), blockedDims.end(), [](size_t val) { return val == Shape::UNDEFINED_DIM; })) { + if (std::any_of(blockedDims.begin() + shape.getRank(), blockedDims.end(), [](size_t val) { + return val == Shape::UNDEFINED_DIM; + })) { OPENVINO_THROW("CpuBlockedMemoryDesc doesn't support undefined blockedDims."); } @@ -51,29 +62,43 @@ CpuBlockedMemoryDesc::CpuBlockedMemoryDesc(ov::element::Type prc, const Shape& s if (strides.empty() && !order.empty()) { if (shape.hasZeroDims()) { this->strides.resize(order.size(), 0); - } else if (std::any_of(this->blockedDims.begin(), this->blockedDims.end(), [](size_t val) { return val == Shape::UNDEFINED_DIM; })) { + } else if (std::any_of(this->blockedDims.begin(), this->blockedDims.end(), [](size_t val) { + return val == Shape::UNDEFINED_DIM; + })) { this->strides.resize(order.size(), Shape::UNDEFINED_DIM); } else { this->strides.resize(order.size(), 1); for (size_t i = 2; i <= order.size(); i++) { - this->strides[order.size() - i] = this->strides[order.size() - (i - 1)] * this->blockedDims[blockedDims.size() - (i - 1)]; + this->strides[order.size() - i] = + this->strides[order.size() - (i - 1)] * this->blockedDims[blockedDims.size() - (i - 1)]; } } } else { this->strides = strides; } - if (!everyone_is(this->order.size(), this->blockedDims.size(), this->offsetPaddingToData.size(), this->strides.size())) { + if (!everyone_is(this->order.size(), + this->blockedDims.size(), + this->offsetPaddingToData.size(), + this->strides.size())) { OPENVINO_THROW("Order, blocked dims, offset padding to data and strides must have equals size"); } } bool CpuBlockedMemoryDesc::isDefinedImp() const { bool defined = true; - defined = defined && std::none_of(blockedDims.cbegin(), blockedDims.cend(), [](size_t val) { return val == Shape::UNDEFINED_DIM; }); - defined = defined && std::none_of(strides.cbegin(), strides.cend(), [](size_t val) { return val == Shape::UNDEFINED_DIM; }); - defined = defined && std::none_of(order.cbegin(), order.cend(), [](size_t val) { return val == Shape::UNDEFINED_DIM; }); - defined = defined && std::none_of(offsetPaddingToData.cbegin(), offsetPaddingToData.cend(), [](size_t val) { return val == Shape::UNDEFINED_DIM; }); + defined = defined && std::none_of(blockedDims.cbegin(), blockedDims.cend(), [](size_t val) { + return val == Shape::UNDEFINED_DIM; + }); + defined = defined && std::none_of(strides.cbegin(), strides.cend(), [](size_t val) { + return val == Shape::UNDEFINED_DIM; + }); + defined = defined && std::none_of(order.cbegin(), order.cend(), [](size_t val) { + return val == Shape::UNDEFINED_DIM; + }); + defined = defined && std::none_of(offsetPaddingToData.cbegin(), offsetPaddingToData.cend(), [](size_t val) { + return val == Shape::UNDEFINED_DIM; + }); defined = defined && 
offsetPadding != Shape::UNDEFINED_DIM; return defined; @@ -90,15 +115,15 @@ bool CpuBlockedMemoryDesc::isCompatible(const MemoryDesc& rhs) const { } } -bool CpuBlockedMemoryDesc::isCompatible(const CpuBlockedMemoryDesc &rhs, CmpMask cmpMask) const { +bool CpuBlockedMemoryDesc::isCompatible(const CpuBlockedMemoryDesc& rhs, CmpMask cmpMask) const { return BlockedMemoryDesc::isCompatibleInternal(rhs, cmpMask); } -bool CpuBlockedMemoryDesc::isCompatible(const DnnlBlockedMemoryDesc &rhs, CmpMask cmpMask) const { +bool CpuBlockedMemoryDesc::isCompatible(const DnnlBlockedMemoryDesc& rhs, CmpMask cmpMask) const { return rhs.isCompatible(*this, cmpMask); } -bool CpuBlockedMemoryDesc::isCompatible(const BlockedMemoryDesc &rhs, CmpMask cmpMask) const { +bool CpuBlockedMemoryDesc::isCompatible(const BlockedMemoryDesc& rhs, CmpMask cmpMask) const { const BlockedMemoryDesc* pRhs = &rhs; if (auto cpuBlkDesc = dynamic_cast(pRhs)) { return isCompatible(*cpuBlkDesc, cmpMask); @@ -149,7 +174,9 @@ size_t CpuBlockedMemoryDesc::getMaxMemSize() const { } const auto& maxDims = shape.getMaxDims(); - if (std::any_of(maxDims.begin(), maxDims.end(), [](size_t x){ return Shape::UNDEFINED_DIM == x; })) { + if (std::any_of(maxDims.begin(), maxDims.end(), [](size_t x) { + return Shape::UNDEFINED_DIM == x; + })) { return UNDEFINED_SIZE; } @@ -193,16 +220,16 @@ size_t CpuBlockedMemoryDesc::getElementOffset(size_t elemNumber) const { bool CpuBlockedMemoryDesc::hasLayoutType(LayoutType layoutType) const { switch (layoutType) { - case LayoutType::ncsp: - return isPlainFormat(); - case LayoutType::nspc: - return isTailCFormat(); - case LayoutType::nCsp8c: - return isBlockedCFormat(8); - case LayoutType::nCsp16c: - return isBlockedCFormat(16); - default: - return false; + case LayoutType::ncsp: + return isPlainFormat(); + case LayoutType::nspc: + return isTailCFormat(); + case LayoutType::nCsp8c: + return isBlockedCFormat(8); + case LayoutType::nCsp16c: + return isBlockedCFormat(16); + default: + return false; } } @@ -252,13 +279,15 @@ bool CpuBlockedMemoryDesc::isTailCFormat() const { return true; } -MemoryDescPtr CpuBlockedMemoryDesc::cloneWithNewDimsImp(const VectorDims &dims) const { - if (std::any_of(dims.begin(), dims.end(), [](size_t x){ return Shape::UNDEFINED_DIM == x; })) { +MemoryDescPtr CpuBlockedMemoryDesc::cloneWithNewDimsImp(const VectorDims& dims) const { + if (std::any_of(dims.begin(), dims.end(), [](size_t x) { + return Shape::UNDEFINED_DIM == x; + })) { OPENVINO_THROW("Can't clone desc if new dims are undefined"); } // TODO [DS]: add stride recalculation for strided blobs - for (int i = strides.size() - 2; i >= 0 ; i--) { + for (int i = strides.size() - 2; i >= 0; i--) { if (strides[i] == Shape::UNDEFINED_DIM) break; @@ -280,11 +309,18 @@ MemoryDescPtr CpuBlockedMemoryDesc::cloneWithNewDimsImp(const VectorDims &dims) } VectorDims newOffsetPaddingToData; - if (std::none_of(offsetPaddingToData.begin(), offsetPaddingToData.end(), [](size_t x){ return x == Shape::UNDEFINED_DIM;})) { + if (std::none_of(offsetPaddingToData.begin(), offsetPaddingToData.end(), [](size_t x) { + return x == Shape::UNDEFINED_DIM; + })) { newOffsetPaddingToData = offsetPaddingToData; } - return std::make_shared(precision, Shape(dims), newBlockedDims, order, offsetPadding, newOffsetPaddingToData); + return std::make_shared(precision, + Shape(dims), + newBlockedDims, + order, + offsetPadding, + newOffsetPaddingToData); } bool CpuBlockedMemoryDesc::blocksExtended() const { @@ -311,7 +347,9 @@ size_t 
CpuBlockedMemoryDesc::getPaddedElementsCount() const { if (getShape().hasZeroDims()) { return 0; } - if (std::any_of(blockedDims.begin(), blockedDims.end(), [](Dim dim) { return dim == Shape::UNDEFINED_DIM; })) { + if (std::any_of(blockedDims.begin(), blockedDims.end(), [](Dim dim) { + return dim == Shape::UNDEFINED_DIM; + })) { OPENVINO_THROW("Can't compute padded elements count for non undefined blocked dims"); } return std::accumulate(blockedDims.begin(), blockedDims.end(), size_t{1}, std::multiplies()); @@ -323,5 +361,5 @@ MemoryDescPtr CpuBlockedMemoryDesc::cloneWithNewPrecision(const ov::element::Typ return newDesc; } -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/memory_desc/cpu_blocked_memory_desc.h b/src/plugins/intel_cpu/src/memory_desc/cpu_blocked_memory_desc.h index 28badb4dac15fb..fdf931a262e854 100644 --- a/src/plugins/intel_cpu/src/memory_desc/cpu_blocked_memory_desc.h +++ b/src/plugins/intel_cpu/src/memory_desc/cpu_blocked_memory_desc.h @@ -16,8 +16,12 @@ class CpuBlockedMemoryDesc : public BlockedMemoryDesc { public: CpuBlockedMemoryDesc(ov::element::Type prc, const Shape& shape); - CpuBlockedMemoryDesc(ov::element::Type prc, const Shape& shape, const VectorDims& blockedDims, - const VectorDims& order, size_t offsetPadding = 0, const VectorDims& offsetPaddingToData = {}, + CpuBlockedMemoryDesc(ov::element::Type prc, + const Shape& shape, + const VectorDims& blockedDims, + const VectorDims& order, + size_t offsetPadding = 0, + const VectorDims& offsetPaddingToData = {}, const VectorDims& strides = {}); MemoryDescPtr clone() const override { @@ -26,8 +30,8 @@ class CpuBlockedMemoryDesc : public BlockedMemoryDesc { bool isCompatible(const MemoryDesc& rhs) const override; bool isCompatible(const BlockedMemoryDesc& rhs, CmpMask cmpMask) const override; - bool isCompatible(const CpuBlockedMemoryDesc &rhs, CmpMask cmpMask = BlockedMemoryDesc::FULL_MASK) const; - bool isCompatible(const DnnlBlockedMemoryDesc &rhs, CmpMask cmpMask = BlockedMemoryDesc::FULL_MASK) const; + bool isCompatible(const CpuBlockedMemoryDesc& rhs, CmpMask cmpMask = BlockedMemoryDesc::FULL_MASK) const; + bool isCompatible(const DnnlBlockedMemoryDesc& rhs, CmpMask cmpMask = BlockedMemoryDesc::FULL_MASK) const; ov::element::Type getPrecision() const override { return precision; @@ -105,5 +109,5 @@ class CpuBlockedMemoryDesc : public BlockedMemoryDesc { using CpuBlockedMemoryDescPtr = std::shared_ptr; using CpuBlockedMemoryDescCPtr = std::shared_ptr; -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/memory_desc/cpu_memory_desc.h b/src/plugins/intel_cpu/src/memory_desc/cpu_memory_desc.h index c3936528abed7b..e6d260066118ee 100644 --- a/src/plugins/intel_cpu/src/memory_desc/cpu_memory_desc.h +++ b/src/plugins/intel_cpu/src/memory_desc/cpu_memory_desc.h @@ -22,7 +22,7 @@ namespace ov { namespace intel_cpu { namespace node { class Split; -} // namespace node +} // namespace node class MemoryDesc; @@ -39,10 +39,10 @@ enum MemoryDescType { }; enum class LayoutType : unsigned { - nspc, // general per channels format - ncsp, // general planar - nCsp8c, // general channels blocked by 8 - nCsp16c // general channels blocked by 16 + nspc, // general per channels format + ncsp, // general planar + nCsp8c, // general channels blocked by 8 + nCsp16c // general channels blocked by 16 }; class MemoryDesc { @@ -70,8 +70,8 @@ class MemoryDesc { /** * @brief Clone 
descriptor with new dims. - * Throws an exception if relaxedCheck is false and some of the new dims conflicts with the internal shape (i.e. its defined dims ,rank, upper bounds) - * or if internal shape and dims have different ranks + * Throws an exception if relaxedCheck is false and some of the new dims conflicts with the internal shape (i.e. its + * defined dims ,rank, upper bounds) or if internal shape and dims have different ranks * @param dims new dims * @param relaxedCheck flag which defined must we check dims with internal desc on compatibility * @return MemoryDescPtr with new dims @@ -136,8 +136,8 @@ class MemoryDesc { } template ::value && !std::is_reference::value, int>::type = 0, - typename std::enable_if::value, int>::type = 0> + typename std::enable_if::value && !std::is_reference::value, int>::type = 0, + typename std::enable_if::value, int>::type = 0> T* as() { T* casted = dynamic_cast(this); if (!casted) @@ -146,8 +146,8 @@ class MemoryDesc { } template ::value && !std::is_reference::value, int>::type = 0, - typename std::enable_if::value, int>::type = 0> + typename std::enable_if::value && !std::is_reference::value, int>::type = 0, + typename std::enable_if::value, int>::type = 0> const T* as() const { const T* casted = dynamic_cast(this); if (!casted) @@ -159,17 +159,16 @@ class MemoryDesc { protected: MemoryDesc() : type(MemoryDescType::Undef) {} - MemoryDesc(Shape shape, MemoryDescType type) - : type(type), shape(std::move(shape)) {} + MemoryDesc(Shape shape, MemoryDescType type) : type(type), shape(std::move(shape)) {} - MemoryDesc(const VectorDims& dims, MemoryDescType type) - : type(type), shape(dims) {} + MemoryDesc(const VectorDims& dims, MemoryDescType type) : type(type), shape(dims) {} virtual void setPrecision(ov::element::Type prc) = 0; virtual size_t getCurrentMemSizeImp() const = 0; - // Get offset to the n'th element. Returns physical index of the element by the logical one considering padding, layout, blocking etc. + // Get offset to the n'th element. Returns physical index of the element by the logical one considering padding, + // layout, blocking etc. 
virtual size_t getElementOffset(size_t elemNumber) const = 0; virtual bool canComputeMemSizeZeroDims() const = 0; @@ -195,5 +194,5 @@ class MemoryDesc { friend class node::Split; }; -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/memory_desc/cpu_memory_desc_utils.cpp b/src/plugins/intel_cpu/src/memory_desc/cpu_memory_desc_utils.cpp index 0ae17d6c00322b..2937b73409b67d 100644 --- a/src/plugins/intel_cpu/src/memory_desc/cpu_memory_desc_utils.cpp +++ b/src/plugins/intel_cpu/src/memory_desc/cpu_memory_desc_utils.cpp @@ -4,29 +4,33 @@ #include "memory_desc/cpu_memory_desc_utils.h" -#include "memory_desc/cpu_blocked_memory_desc.h" -#include "memory_desc/dnnl_blocked_memory_desc.h" -#include "graph_context.h" -#include "cpu_memory_desc.h" -#include "memory_desc/empty_memory_desc.h" -#include -#include #include #include + #include #include +#include "cpu_memory_desc.h" +#include "graph_context.h" +#include "memory_desc/cpu_blocked_memory_desc.h" +#include "memory_desc/dnnl_blocked_memory_desc.h" +#include "memory_desc/empty_memory_desc.h" + using namespace dnnl; namespace ov { namespace intel_cpu { -DnnlMemoryDescPtr MemoryDescUtils::convertToDnnlMemoryDesc(const MemoryDescPtr &desc) { +DnnlMemoryDescPtr MemoryDescUtils::convertToDnnlMemoryDesc(const MemoryDescPtr& desc) { if (MemoryDescType::Blocked == desc->getType()) { const auto cpuDesc = desc->as(); - return std::shared_ptr(new DnnlBlockedMemoryDesc(cpuDesc->getPrecision(), cpuDesc->getShape(), cpuDesc->getBlockDims(), - cpuDesc->getOrder(), cpuDesc->getOffsetPadding(), - cpuDesc->getOffsetPaddingToData(), cpuDesc->getStrides())); + return std::shared_ptr(new DnnlBlockedMemoryDesc(cpuDesc->getPrecision(), + cpuDesc->getShape(), + cpuDesc->getBlockDims(), + cpuDesc->getOrder(), + cpuDesc->getOffsetPadding(), + cpuDesc->getOffsetPaddingToData(), + cpuDesc->getStrides())); } else if (MemoryDescType::Empty == desc->getType()) { return DnnlExtensionUtils::makeDescriptor(dnnl::memory::desc()); } else if (MemoryDescType::Dnnl & desc->getType()) { @@ -41,14 +45,19 @@ DnnlBlockedMemoryDesc MemoryDescUtils::convertToDnnlBlockedMemoryDesc(const Memo return DnnlBlockedMemoryDesc(*desc.as()); } else if (MemoryDescType::Blocked == desc.getType()) { const auto cpuDesc = desc.as(); - return DnnlBlockedMemoryDesc(cpuDesc->getPrecision(), cpuDesc->getShape(), cpuDesc->getBlockDims(), cpuDesc->getOrder(), cpuDesc->getOffsetPadding(), - cpuDesc->getOffsetPaddingToData(), cpuDesc->getStrides()); + return DnnlBlockedMemoryDesc(cpuDesc->getPrecision(), + cpuDesc->getShape(), + cpuDesc->getBlockDims(), + cpuDesc->getOrder(), + cpuDesc->getOffsetPadding(), + cpuDesc->getOffsetPaddingToData(), + cpuDesc->getStrides()); } else { OPENVINO_THROW("Cannot convert MemoryDesc to DnnlBlockedMemoryDesc"); } } -BlockedMemoryDescPtr MemoryDescUtils::convertToBlockedMemoryDesc(const MemoryDescPtr &desc) { +BlockedMemoryDescPtr MemoryDescUtils::convertToBlockedMemoryDesc(const MemoryDescPtr& desc) { if (desc->getType() & MemoryDescType::Blocked) { return std::dynamic_pointer_cast(desc); } else { @@ -57,7 +66,7 @@ BlockedMemoryDescPtr MemoryDescUtils::convertToBlockedMemoryDesc(const MemoryDes } CpuBlockedMemoryDescPtr MemoryDescUtils::generateCpuBlockedMemoryDesc(const ov::SoPtr& tensor) { - const auto& shape = tensor->get_shape().empty() ? ov::Shape{tensor->get_size()} : tensor->get_shape(); + const auto& shape = tensor->get_shape().empty() ? 
ov::Shape{tensor->get_size()} : tensor->get_shape(); VectorDims blk_order(shape.size()); std::iota(blk_order.begin(), blk_order.end(), 0); @@ -87,17 +96,16 @@ CpuBlockedMemoryDescPtr MemoryDescUtils::generateCpuBlockedMemoryDesc(const ov:: }); } - return std::make_shared( - element_type, - Shape{shape}, - shape, - blk_order, - 0UL, - VectorDims{}, - blk_strides); + return std::make_shared(element_type, + Shape{shape}, + shape, + blk_order, + 0UL, + VectorDims{}, + blk_strides); } -std::shared_ptr MemoryDescUtils::makeDummyDesc(const MemoryDesc &desc, Dim dummyVal) { +std::shared_ptr MemoryDescUtils::makeDummyDesc(const MemoryDesc& desc, Dim dummyVal) { auto dummyShape = makeDummyShape(desc.getShape(), dummyVal); return desc.cloneWithNewDims(dummyShape.getStaticDims()); } @@ -111,7 +119,7 @@ std::shared_ptr MemoryDescUtils::makeEmptyMemory(const GraphContext::CP return std::make_shared(context->getEngine(), makeEmptyDesc(), nullptr); } -Shape MemoryDescUtils::makeDummyShape(const Shape &shape, Dim dummyVal) { +Shape MemoryDescUtils::makeDummyShape(const Shape& shape, Dim dummyVal) { const auto& minDims = shape.getMinDims(); const auto& maxDims = shape.getMaxDims(); const auto& dims = shape.getDims(); @@ -122,7 +130,7 @@ Shape MemoryDescUtils::makeDummyShape(const Shape &shape, Dim dummyVal) { return Shape(dummyDims); } -Shape MemoryDescUtils::makeDummyShape(const Shape &shape, const VectorDims& dummyVals) { +Shape MemoryDescUtils::makeDummyShape(const Shape& shape, const VectorDims& dummyVals) { if (shape.getRank() != dummyVals.size()) { OPENVINO_THROW("makeDummyShape(): dummyVals vector size and shape ranks mismatch"); } @@ -131,9 +139,10 @@ Shape MemoryDescUtils::makeDummyShape(const Shape &shape, const VectorDims& dumm const auto& dims = shape.getDims(); VectorDims dummyDims(dims.size()); for (size_t i = 0; i < dims.size(); ++i) { - dummyDims[i] = dims[i] == Shape::UNDEFINED_DIM ? std::min(maxDims[i], std::max(minDims[i], dummyVals[i])) : dims[i]; + dummyDims[i] = + dims[i] == Shape::UNDEFINED_DIM ? 
std::min(maxDims[i], std::max(minDims[i], dummyVals[i])) : dims[i]; } return Shape(dummyDims); } -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/memory_desc/cpu_memory_desc_utils.h b/src/plugins/intel_cpu/src/memory_desc/cpu_memory_desc_utils.h index a4acd3eb2aa778..388c9a21c5df8e 100644 --- a/src/plugins/intel_cpu/src/memory_desc/cpu_memory_desc_utils.h +++ b/src/plugins/intel_cpu/src/memory_desc/cpu_memory_desc_utils.h @@ -5,11 +5,12 @@ #pragma once #include + #include "cpu_shape.h" #include "cpu_types.h" +#include "graph_context.h" #include "openvino/runtime/itensor.hpp" #include "openvino/runtime/so_ptr.hpp" -#include "graph_context.h" namespace ov { namespace intel_cpu { @@ -32,7 +33,7 @@ class MemoryDescUtils { * @param desc MemoryDesc to be converted * @return converted DnnlMemoryDesc */ - static std::shared_ptr convertToDnnlMemoryDesc(const std::shared_ptr &desc); + static std::shared_ptr convertToDnnlMemoryDesc(const std::shared_ptr& desc); /** * @brief Converts MemoryDesc to DnnlBlockedMemoryDesc @@ -46,7 +47,7 @@ class MemoryDescUtils { * @param desc MemoryDesc to be converted * @return converted BlockedMemoryDesc */ - static std::shared_ptr convertToBlockedMemoryDesc(const std::shared_ptr &desc); + static std::shared_ptr convertToBlockedMemoryDesc(const std::shared_ptr& desc); /** * @brief Builds CpuBlockedMemoryDesc for given ov::ITensor @@ -58,7 +59,8 @@ class MemoryDescUtils { static constexpr Dim DEFAULT_DUMMY_VAL = 64; /** - * @brief Makes a dummy descriptor where all undefined values are replaced with the smallest value between the parameter and the upper bound dim + * @brief Makes a dummy descriptor where all undefined values are replaced with the smallest value between the + * parameter and the upper bound dim * @param desc MemoryDesc from which the new descriptor is generated * @param dummyVal Dim value to replace undefined dimensions * @return a new MemoryDesc with dummy values instead of undefined dims @@ -66,27 +68,29 @@ class MemoryDescUtils { static std::shared_ptr makeDummyDesc(const MemoryDesc& desc, Dim dummyVal = DEFAULT_DUMMY_VAL); /** - * @brief Make an empty memory descriptor - * @note Shape{0}, undefined - * @return empty memory descriptor - */ + * @brief Make an empty memory descriptor + * @note Shape{0}, undefined + * @return empty memory descriptor + */ static std::shared_ptr makeEmptyDesc(); static std::shared_ptr makeEmptyMemory(const GraphContext::CPtr context); /** - * @brief Makes a static dummy shape where all undefined values are replaced with the smallest value between the parameter and the upper bound dim - * @param shape a Shape object from which the new static shape is generated - * @param dummyVal Dim value to replace undefined dimensions - * @return a new Shape with dummy values instead of undefined dims - */ + * @brief Makes a static dummy shape where all undefined values are replaced with the smallest value between the + * parameter and the upper bound dim + * @param shape a Shape object from which the new static shape is generated + * @param dummyVal Dim value to replace undefined dimensions + * @return a new Shape with dummy values instead of undefined dims + */ static Shape makeDummyShape(const Shape& shape, Dim dummyVal = DEFAULT_DUMMY_VAL); /** - * @brief Makes a static dummy shape where all undefined values are replaced with the smallest value between the parameter and the upper bound dim - * @param shape a Shape object from which the new static shape 
is generated - * @param dummyVals vector of values to replace undefined dimensions - * @return a new Shape with dummy values instead of undefined dims - */ + * @brief Makes a static dummy shape where all undefined values are replaced with the smallest value between the + * parameter and the upper bound dim + * @param shape a Shape object from which the new static shape is generated + * @param dummyVals vector of values to replace undefined dimensions + * @return a new Shape with dummy values instead of undefined dims + */ static Shape makeDummyShape(const Shape& shape, const VectorDims& dummyVals); /** @@ -104,5 +108,5 @@ class MemoryDescUtils { static std::string dims2str(const VectorDims& dims); }; -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/memory_desc/dnnl_blocked_memory_desc.cpp b/src/plugins/intel_cpu/src/memory_desc/dnnl_blocked_memory_desc.cpp index a24b55831c2c7c..38c020674c7168 100644 --- a/src/plugins/intel_cpu/src/memory_desc/dnnl_blocked_memory_desc.cpp +++ b/src/plugins/intel_cpu/src/memory_desc/dnnl_blocked_memory_desc.cpp @@ -4,26 +4,28 @@ #include "memory_desc/dnnl_blocked_memory_desc.h" +#include #include +#include #include + #include "cpu_types.h" #include "dnnl_extension_utils.h" #include "memory_desc/cpu_blocked_memory_desc.h" #include "utils/general_utils.h" -#include -#include - namespace ov { namespace intel_cpu { DnnlBlockedMemoryDesc::DnnlBlockedMemoryDesc(ov::element::Type prc, const Shape& shape, const VectorDims& strides) : MemoryDesc(shape, DnnlBlocked) { const auto ndims = shape.getRank(); - const auto &dims = shape.getDims(); + const auto& dims = shape.getDims(); - if (!strides.empty()) { // custom strides - if (shape.hasZeroDims() && std::any_of(strides.begin(), strides.end(), [](size_t stride) { return stride != 0; } )) { + if (!strides.empty()) { // custom strides + if (shape.hasZeroDims() && std::any_of(strides.begin(), strides.end(), [](size_t stride) { + return stride != 0; + })) { OPENVINO_THROW("Can't create DnnlBlockedMemoryDesc with zero dim, but with non zero strides"); } desc = {DnnlExtensionUtils::convertToDnnlDims(dims), @@ -33,16 +35,20 @@ DnnlBlockedMemoryDesc::DnnlBlockedMemoryDesc(ov::element::Type prc, const Shape& dnnl::memory::dims plain_strides; if (shape.hasZeroDims()) { plain_strides.resize(ndims, 0); - } else if (std::any_of(dims.begin(), dims.end(), [](size_t val) { return val == Shape::UNDEFINED_DIM; })) { + } else if (std::any_of(dims.begin(), dims.end(), [](size_t val) { + return val == Shape::UNDEFINED_DIM; + })) { plain_strides.resize(ndims, DNNL_RUNTIME_DIM_VAL); } else { plain_strides.resize(ndims, 1); for (size_t i = 1; i < ndims; i++) { - plain_strides[ndims - i -1] = plain_strides[ndims - i] * dims[ndims - i]; + plain_strides[ndims - i - 1] = plain_strides[ndims - i] * dims[ndims - i]; } } - desc = {DnnlExtensionUtils::convertToDnnlDims(dims), DnnlExtensionUtils::ElementTypeToDataType(prc), plain_strides}; + desc = {DnnlExtensionUtils::convertToDnnlDims(dims), + DnnlExtensionUtils::ElementTypeToDataType(prc), + plain_strides}; } order.resize(ndims); @@ -55,11 +61,12 @@ DnnlBlockedMemoryDesc::DnnlBlockedMemoryDesc(ov::element::Type prc, const Shape& * Construct from blocked parameters * * OV IOhw_4i16o4i dims(N) = {32, 64, 128, 128} - * blockedDims {4, 2, 128, 128, 4, 16, 4} // total dims(inner, outermost, auto blocked/padded). Generally sorted by strides. 
- * strides {8388608, 4194304, 32768, 256, 64, 4, 1} // strides for blockedDims, growing sequence - * order {1, 0, 2, 3, 1, 0, 1} // matching to original dims + * blockedDims {4, 2, 128, 128, 4, 16, 4} // total dims(inner, outermost, auto blocked/padded). + * Generally sorted by strides. strides {8388608, 4194304, 32768, 256, 64, 4, 1} // strides for blockedDims, + * growing sequence order {1, 0, 2, 3, 1, 0, 1} // matching to original dims * - * All vectors blockedDims/strides/order have same size equals total num of internal blocked dims(inner_dims + outer_dims) + * All vectors blockedDims/strides/order have same size equals total num of internal blocked dims(inner_dims + + * outer_dims) * * Tensor descriptor filing is not deterministic. It allows any permutation of index which keeps order of * real dims spliting. @@ -70,9 +77,14 @@ DnnlBlockedMemoryDesc::DnnlBlockedMemoryDesc(ov::element::Type prc, const Shape& * * Limitation of conversion first N elements of order should be permutation of [0,1,2 ... N] */ -DnnlBlockedMemoryDesc::DnnlBlockedMemoryDesc(ov::element::Type prc, const Shape& shape, const VectorDims& blockedDims, - const VectorDims& order, size_t offsetPadding, const VectorDims& offsetPaddingToData, - const VectorDims& strides) : MemoryDesc(shape, DnnlBlocked) { +DnnlBlockedMemoryDesc::DnnlBlockedMemoryDesc(ov::element::Type prc, + const Shape& shape, + const VectorDims& blockedDims, + const VectorDims& order, + size_t offsetPadding, + const VectorDims& offsetPaddingToData, + const VectorDims& strides) + : MemoryDesc(shape, DnnlBlocked) { using namespace dnnl; // scalar case if (shape.getRank() == 0) { @@ -128,7 +140,9 @@ DnnlBlockedMemoryDesc::DnnlBlockedMemoryDesc(ov::element::Type prc, const Shape& const bool emptyDesc = shape.hasZeroDims(); if (!strides.empty()) { - if (emptyDesc && std::any_of(strides.begin(), strides.end(), [](size_t dim) { return dim != 0; } )) { + if (emptyDesc && std::any_of(strides.begin(), strides.end(), [](size_t dim) { + return dim != 0; + })) { OPENVINO_THROW("Can't create DnnlBlockedMemoryDesc with zero dim, but with non zero strides"); } @@ -143,7 +157,9 @@ DnnlBlockedMemoryDesc::DnnlBlockedMemoryDesc(ov::element::Type prc, const Shape& OPENVINO_THROW("Can not construct DnnlBlockedMemoryDesc from strides: ", vec2str(strides)); } - if (!strides.empty() && !emptyDesc && std::none_of(strides.begin(), strides.end(), [](size_t x) { return Shape::UNDEFINED_DIM == x; })) { + if (!strides.empty() && !emptyDesc && std::none_of(strides.begin(), strides.end(), [](size_t x) { + return Shape::UNDEFINED_DIM == x; + })) { bool inner_block_are_dense = one_of(strides.back(), 0u, 1u); // stride 1 - is dense case, 0 - broad casted for (size_t i = outer_ndims; i < strides.size() - 1; i++) { inner_block_are_dense &= (strides[i] == strides[i + 1] * blockedDims[i + 1]); @@ -164,8 +180,10 @@ DnnlBlockedMemoryDesc::DnnlBlockedMemoryDesc(ov::element::Type prc, const Shape& std::copy(dims.begin(), dims.end(), desc.get()->dims); if (!offsetPaddingToData.empty()) { - bool inner_pad_offsets_is_zero = std::all_of(offsetPaddingToData.begin() + outer_ndims, offsetPaddingToData.end(), - [](size_t pad) { return pad == 0; }); + bool inner_pad_offsets_is_zero = + std::all_of(offsetPaddingToData.begin() + outer_ndims, offsetPaddingToData.end(), [](size_t pad) { + return pad == 0; + }); if (!inner_pad_offsets_is_zero) OPENVINO_THROW("Can not construct DnnlBlockedMemoryDesc, inner pad offsets is not zero: ", @@ -189,7 +207,7 @@ 
DnnlBlockedMemoryDesc::DnnlBlockedMemoryDesc(ov::element::Type prc, const Shape& } // Fill blocking desc - auto &dnn_blk_desc = desc.get()->format_desc.blocking; + auto& dnn_blk_desc = desc.get()->format_desc.blocking; dnn_blk_desc.inner_nblks = inner_ndims; std::copy(dnnlBlkDims.end() - inner_ndims, dnnlBlkDims.end(), dnn_blk_desc.inner_blks); std::copy(order.end() - inner_ndims, order.end(), dnn_blk_desc.inner_idxs); @@ -209,8 +227,10 @@ DnnlBlockedMemoryDesc::DnnlBlockedMemoryDesc(ov::element::Type prc, const Shape& } } -DnnlBlockedMemoryDesc::DnnlBlockedMemoryDesc(const Shape& shape, dnnl::memory::data_type dataType, dnnl::memory::format_tag format) : - MemoryDesc(shape, DnnlBlocked) { +DnnlBlockedMemoryDesc::DnnlBlockedMemoryDesc(const Shape& shape, + dnnl::memory::data_type dataType, + dnnl::memory::format_tag format) + : MemoryDesc(shape, DnnlBlocked) { using namespace dnnl; if (format == memory::format_tag::any || format == memory::format_tag::undef) OPENVINO_THROW("Unexpected: Can't create dnnl::desc with any or undef format"); @@ -249,7 +269,7 @@ bool DnnlBlockedMemoryDesc::isCompatible(const MemoryDesc& rhs) const { } } -bool DnnlBlockedMemoryDesc::isCompatible(const BlockedMemoryDesc &rhs, CmpMask cmpMask) const { +bool DnnlBlockedMemoryDesc::isCompatible(const BlockedMemoryDesc& rhs, CmpMask cmpMask) const { if (auto desc = dynamic_cast(&rhs)) { return isCompatible(*desc, cmpMask); } else if (auto desc = dynamic_cast(&rhs)) { @@ -261,7 +281,8 @@ bool DnnlBlockedMemoryDesc::isCompatible(const BlockedMemoryDesc &rhs, CmpMask c bool DnnlBlockedMemoryDesc::isCompatible(const CpuBlockedMemoryDesc& rhs, CmpMask cmpMask) const { dnnl::impl::memory_desc_wrapper wrapped(desc.get()); - return wrapped.extra().flags == dnnl_memory_extra_flag_none && BlockedMemoryDesc::isCompatibleInternal(rhs, cmpMask); + return wrapped.extra().flags == dnnl_memory_extra_flag_none && + BlockedMemoryDesc::isCompatibleInternal(rhs, cmpMask); } bool DnnlBlockedMemoryDesc::isCompatible(const DnnlBlockedMemoryDesc& rhs, CmpMask cmpMask) const { @@ -288,8 +309,10 @@ bool DnnlBlockedMemoryDesc::isCompatible(const DnnlBlockedMemoryDesc& rhs, CmpMa const auto thisExtra = wrappedThis.extra(); const auto rhsExtra = wrappedRhs.extra(); - return this->getOrder() == rhs.getOrder() && (thisExtra.flags == rhsExtra.flags && thisExtra.compensation_mask == rhsExtra.compensation_mask && - thisExtra.scale_adjust == rhsExtra.scale_adjust) && wrappedThis.similar_to(wrappedRhs, true, true, 0, true, checkOffset, stride_mask); + return this->getOrder() == rhs.getOrder() && + (thisExtra.flags == rhsExtra.flags && thisExtra.compensation_mask == rhsExtra.compensation_mask && + thisExtra.scale_adjust == rhsExtra.scale_adjust) && + wrappedThis.similar_to(wrappedRhs, true, true, 0, true, checkOffset, stride_mask); } static VectorDims extractOrder(const dnnl::memory::desc& desc) { @@ -300,7 +323,7 @@ static VectorDims extractOrder(const dnnl::memory::desc& desc) { OPENVINO_THROW("Unexpected: Cannot calculate order from undefined dims or strides"); } - const auto &blk_desc = descWrapped.blocking_desc(); + const auto& blk_desc = descWrapped.blocking_desc(); const size_t outer_ndims = dims.size(); const size_t inner_ndims = blk_desc.inner_nblks; @@ -319,11 +342,11 @@ static VectorDims extractOrder(const dnnl::memory::desc& desc) { // order of outer dims. 
In case of IOhw_ will be {1, 0, 2, 3} VectorDims outer_order(outer_ndims); std::iota(outer_order.begin(), outer_order.end(), 0); - std::sort(outer_order.begin(), outer_order.end(), - [&blk_desc, &outer_block_dims](size_t ind_l, size_t ind_r) { - return (blk_desc.strides[ind_l] > blk_desc.strides[ind_r]) || - (blk_desc.strides[ind_l] == blk_desc.strides[ind_r] && outer_block_dims[ind_l] > outer_block_dims[ind_r]); - }); + std::sort(outer_order.begin(), outer_order.end(), [&blk_desc, &outer_block_dims](size_t ind_l, size_t ind_r) { + return (blk_desc.strides[ind_l] > blk_desc.strides[ind_r]) || + (blk_desc.strides[ind_l] == blk_desc.strides[ind_r] && + outer_block_dims[ind_l] > outer_block_dims[ind_r]); + }); // blocked order // [new_outer_order] U [inner_idxs] @@ -333,8 +356,8 @@ static VectorDims extractOrder(const dnnl::memory::desc& desc) { return blk_order; } -DnnlBlockedMemoryDesc::DnnlBlockedMemoryDesc(const_dnnl_memory_desc_t cdesc) : - MemoryDesc(DnnlExtensionUtils::convertToVectorDims(cdesc->dims, cdesc->ndims), DnnlBlocked) { +DnnlBlockedMemoryDesc::DnnlBlockedMemoryDesc(const_dnnl_memory_desc_t cdesc) + : MemoryDesc(DnnlExtensionUtils::convertToVectorDims(cdesc->dims, cdesc->ndims), DnnlBlocked) { desc = dnnl::memory::desc(DnnlExtensionUtils::clone_desc(cdesc)); if (desc.get_format_kind() == dnnl::memory::format_kind::any) @@ -356,16 +379,16 @@ DnnlBlockedMemoryDesc::DnnlBlockedMemoryDesc(const_dnnl_memory_desc_t cdesc) : bool DnnlBlockedMemoryDesc::hasLayoutType(LayoutType layoutType) const { switch (layoutType) { - case LayoutType::ncsp: - return isPlainFormat(); - case LayoutType::nspc: - return isTailCFormat(); - case LayoutType::nCsp8c: - return isBlockedCFormat(8); - case LayoutType::nCsp16c: - return isBlockedCFormat(16); - default: - return false; + case LayoutType::ncsp: + return isPlainFormat(); + case LayoutType::nspc: + return isTailCFormat(); + case LayoutType::nCsp8c: + return isBlockedCFormat(8); + case LayoutType::nCsp16c: + return isBlockedCFormat(16); + default: + return false; } } @@ -382,8 +405,7 @@ bool DnnlBlockedMemoryDesc::isPlainFormat() const { } bool DnnlBlockedMemoryDesc::isBlockedCFormat(size_t blk_size) const { - if (desc.get_format_kind() != dnnl::memory::format_kind::blocked || - desc.get_inner_nblks() != 1 || + if (desc.get_format_kind() != dnnl::memory::format_kind::blocked || desc.get_inner_nblks() != 1 || desc.get_inner_idxs()[0] != 1) return false; @@ -452,13 +474,15 @@ static dnnl::memory::desc cloneDescWithNewDims(const dnnl::memory::desc& desc, return newMklDesc; } -MemoryDescPtr DnnlBlockedMemoryDesc::cloneWithNewDimsImp(const VectorDims &dims) const { - if (std::any_of(dims.begin(), dims.end(), [](size_t x){ return Shape::UNDEFINED_DIM == x; })) { +MemoryDescPtr DnnlBlockedMemoryDesc::cloneWithNewDimsImp(const VectorDims& dims) const { + if (std::any_of(dims.begin(), dims.end(), [](size_t x) { + return Shape::UNDEFINED_DIM == x; + })) { OPENVINO_THROW("Can't clone desc if new dims are undefined"); } // TODO [DS]: add stride recalculation for strided blobs - for (int i = strides.size() - 2; i >= 0 ; i--) { + for (int i = strides.size() - 2; i >= 0; i--) { if (strides[i] == Shape::UNDEFINED_DIM) break; @@ -499,7 +523,7 @@ bool DnnlBlockedMemoryDesc::isSame(dnnl::memory::format_tag fmt) const { { const auto dims = desc.get_dims(); VectorDims total_block_per_dim(dims.size(), 1); - const auto &blk_desc = desc.get()->format_desc.blocking; + const auto& blk_desc = desc.get()->format_desc.blocking; for (int i = 0; i < blk_desc.inner_nblks; i++) { 
total_block_per_dim[blk_desc.inner_idxs[i]] *= blk_desc.inner_blks[i]; } @@ -509,10 +533,12 @@ bool DnnlBlockedMemoryDesc::isSame(dnnl::memory::format_tag fmt) const { } std::iota(actualOrder.begin(), actualOrder.end(), 0); - std::sort(actualOrder.begin(), actualOrder.end(), - [&actualStrides, &outer_block_dims] (size_t ind_l, size_t ind_r) { + std::sort(actualOrder.begin(), + actualOrder.end(), + [&actualStrides, &outer_block_dims](size_t ind_l, size_t ind_r) { return (actualStrides[ind_l] > actualStrides[ind_r]) || - (actualStrides[ind_l] == actualStrides[ind_r] && outer_block_dims[ind_l] > outer_block_dims[ind_r]); + (actualStrides[ind_l] == actualStrides[ind_r] && + outer_block_dims[ind_l] > outer_block_dims[ind_r]); }); } @@ -520,7 +546,7 @@ bool DnnlBlockedMemoryDesc::isSame(dnnl::memory::format_tag fmt) const { { const auto dims = refDesc.get_dims(); VectorDims total_block_per_dim(dims.size(), 1); - const auto &blk_desc = refDesc.get()->format_desc.blocking; + const auto& blk_desc = refDesc.get()->format_desc.blocking; for (int i = 0; i < blk_desc.inner_nblks; i++) { total_block_per_dim[blk_desc.inner_idxs[i]] *= blk_desc.inner_blks[i]; } @@ -530,11 +556,10 @@ bool DnnlBlockedMemoryDesc::isSame(dnnl::memory::format_tag fmt) const { } std::iota(refOrder.begin(), refOrder.end(), 0); - std::sort(refOrder.begin(), refOrder.end(), - [&refStrides, &outer_block_dims] (size_t ind_l, size_t ind_r) { - return (refStrides[ind_l] > refStrides[ind_r]) || - (refStrides[ind_l] == refStrides[ind_r] && outer_block_dims[ind_l] > outer_block_dims[ind_r]); - }); + std::sort(refOrder.begin(), refOrder.end(), [&refStrides, &outer_block_dims](size_t ind_l, size_t ind_r) { + return (refStrides[ind_l] > refStrides[ind_r]) || + (refStrides[ind_l] == refStrides[ind_r] && outer_block_dims[ind_l] > outer_block_dims[ind_r]); + }); } if (actualOrder != refOrder) { @@ -549,7 +574,9 @@ size_t DnnlBlockedMemoryDesc::getMaxMemSize() const { } const auto& maxDims = shape.getMaxDims(); - if (std::any_of(maxDims.begin(), maxDims.end(), [](size_t x){ return Shape::UNDEFINED_DIM == x; })) { + if (std::any_of(maxDims.begin(), maxDims.end(), [](size_t x) { + return Shape::UNDEFINED_DIM == x; + })) { return UNDEFINED_SIZE; } @@ -563,11 +590,13 @@ size_t DnnlBlockedMemoryDesc::getPaddedElementsCount() const { } auto padded_dims = desc.get_padded_dims(); - if (std::any_of(std::begin(padded_dims), std::begin(padded_dims) + desc.get_ndims(), - [](dnnl_dim_t dim) { return dim == DNNL_RUNTIME_DIM_VAL; })) { + if (std::any_of(std::begin(padded_dims), std::begin(padded_dims) + desc.get_ndims(), [](dnnl_dim_t dim) { + return dim == DNNL_RUNTIME_DIM_VAL; + })) { OPENVINO_THROW("Can't compute padded elements count for non undefined blocked dims"); } - return std::accumulate(std::begin(padded_dims), std::begin(padded_dims) + desc.get_ndims(), + return std::accumulate(std::begin(padded_dims), + std::begin(padded_dims) + desc.get_ndims(), size_t{1}, std::multiplies()); } @@ -586,7 +615,7 @@ void DnnlBlockedMemoryDesc::initBlockDims() { const auto dims = desc.get_dims(); const size_t outer_ndims = dims.size(); - const auto inner_ndims = desc.get_inner_nblks(); + const auto inner_ndims = desc.get_inner_nblks(); const size_t total_ndims = outer_ndims + inner_ndims; // total inner block size. 
in case of 4i16o4i will be {16, 16, 1, 1} @@ -612,10 +641,10 @@ void DnnlBlockedMemoryDesc::initBlockDims() { std::copy(order.begin(), order.begin() + outer_ndims, outer_order.begin()); blockedDims.resize(total_ndims, 0); - std::copy(inner_blks.begin(), inner_blks.begin() + inner_nblks, - blockedDims.end() - inner_nblks); - std::transform(outer_order.begin(), outer_order.end(), blockedDims.begin(), - [&] (size_t i) { return outer_block_dims[i]; }); + std::copy(inner_blks.begin(), inner_blks.begin() + inner_nblks, blockedDims.end() - inner_nblks); + std::transform(outer_order.begin(), outer_order.end(), blockedDims.begin(), [&](size_t i) { + return outer_block_dims[i]; + }); } void DnnlBlockedMemoryDesc::initStrides() { @@ -623,7 +652,7 @@ void DnnlBlockedMemoryDesc::initStrides() { const size_t outer_ndims = dims.size(); const size_t inner_nblks = desc.get_inner_nblks(); - const auto inner_blks = desc.get_inner_blks(); + const auto inner_blks = desc.get_inner_blks(); const size_t total_ndims = outer_ndims + inner_nblks; // strides of inner dims. In case of 4i16o4i will be {64, 4, 1} @@ -642,8 +671,9 @@ void DnnlBlockedMemoryDesc::initStrides() { std::copy(inner_strides.rbegin(), inner_strides.rend(), strides.rbegin()); const auto desc_strides = desc.get_strides(); - std::transform(outer_order.begin(), outer_order.end(), strides.begin(), - [&](size_t i) { return desc_strides[i] == DNNL_RUNTIME_DIM_VAL ? Shape::UNDEFINED_DIM : desc_strides[i]; }); + std::transform(outer_order.begin(), outer_order.end(), strides.begin(), [&](size_t i) { + return desc_strides[i] == DNNL_RUNTIME_DIM_VAL ? Shape::UNDEFINED_DIM : desc_strides[i]; + }); } void DnnlBlockedMemoryDesc::initOffsetPadding() { @@ -659,15 +689,17 @@ MemoryDescPtr DnnlBlockedMemoryDesc::cloneWithNewPrecision(const ov::element::Ty } void DnnlBlockedMemoryDesc::recomputeDefaultStrides() { - const auto &rank = getShape().getRank(); + const auto& rank = getShape().getRank(); if (order.size() != blockedDims.size()) OPENVINO_THROW("Can't recompute stride: order size != blocked dims size"); - auto &oneDnnStrides = desc.get()->format_desc.blocking.strides; + auto& oneDnnStrides = desc.get()->format_desc.blocking.strides; if (getShape().hasZeroDims()) { std::fill(std::begin(oneDnnStrides), std::begin(oneDnnStrides) + getShape().getRank(), 0); - } else if (std::any_of(blockedDims.begin(), blockedDims.end(), [](Dim val) { return val == Shape::UNDEFINED_DIM; })) { + } else if (std::any_of(blockedDims.begin(), blockedDims.end(), [](Dim val) { + return val == Shape::UNDEFINED_DIM; + })) { std::fill(std::begin(oneDnnStrides), std::begin(oneDnnStrides) + rank, DNNL_RUNTIME_DIM_VAL); initStrides(); } else { @@ -682,8 +714,8 @@ void DnnlBlockedMemoryDesc::recomputeDefaultStrides() { } } -DnnlBlockedMemoryDesc::DnnlBlockedMemoryDesc(const dnnl::memory::desc& mdesc, const Shape& shape) : - MemoryDesc(shape, DnnlBlocked) { +DnnlBlockedMemoryDesc::DnnlBlockedMemoryDesc(const dnnl::memory::desc& mdesc, const Shape& shape) + : MemoryDesc(shape, DnnlBlocked) { if (mdesc.get_format_kind() == dnnl::memory::format_kind::any) OPENVINO_THROW("Unexpected: Memory format any is prohibited!"); @@ -715,5 +747,5 @@ std::string DnnlBlockedMemoryDesc::serializeFormat() const { return BlockedMemoryDesc::serializeFormat(); } -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/memory_desc/dnnl_blocked_memory_desc.h b/src/plugins/intel_cpu/src/memory_desc/dnnl_blocked_memory_desc.h index 
a6c6a3297ba044..91388c12e2abf7 100644 --- a/src/plugins/intel_cpu/src/memory_desc/dnnl_blocked_memory_desc.h +++ b/src/plugins/intel_cpu/src/memory_desc/dnnl_blocked_memory_desc.h @@ -4,19 +4,20 @@ #pragma once +#include + +#include "dnnl_extension_utils.h" #include "dnnl_memory_desc.h" #include "memory_desc/blocked_memory_desc.h" #include "openvino/util/util.hpp" -#include "dnnl_extension_utils.h" -#include namespace ov { namespace intel_cpu { class CpuBlockedMemoryDesc; -OPENVINO_DISABLE_WARNING_MSVC_BEGIN(4250) // Visual Studio warns us about inheritance via dominance but it's done intentionally - // so turn it off +OPENVINO_DISABLE_WARNING_MSVC_BEGIN(4250) // Visual Studio warns us about inheritance via dominance but it's done + // intentionally so turn it off class DnnlBlockedMemoryDesc : public BlockedMemoryDesc, public DnnlMemoryDesc { public: // Creates planar DnnlBlockedMemoryDesc @@ -30,8 +31,8 @@ class DnnlBlockedMemoryDesc : public BlockedMemoryDesc, public DnnlMemoryDesc { bool isCompatible(const MemoryDesc& rhs) const override; bool isCompatible(const BlockedMemoryDesc& rhs, CmpMask cmpMask) const override; - bool isCompatible(const CpuBlockedMemoryDesc &rhs, CmpMask cmpMask = FULL_MASK) const; - bool isCompatible(const DnnlBlockedMemoryDesc &rhs, CmpMask cmpMask = FULL_MASK) const; + bool isCompatible(const CpuBlockedMemoryDesc& rhs, CmpMask cmpMask = FULL_MASK) const; + bool isCompatible(const DnnlBlockedMemoryDesc& rhs, CmpMask cmpMask = FULL_MASK) const; const VectorDims& getBlockDims() const override { return blockedDims; @@ -63,17 +64,22 @@ class DnnlBlockedMemoryDesc : public BlockedMemoryDesc, public DnnlMemoryDesc { MemoryDescPtr cloneWithNewPrecision(const ov::element::Type prec) const override; - using DnnlMemoryDesc::setPrecision; using DnnlMemoryDesc::getPrecision; + using DnnlMemoryDesc::setPrecision; private: - DnnlBlockedMemoryDesc(ov::element::Type prc, const Shape& shape, const VectorDims& blockedDims, - const VectorDims& order, size_t offsetPadding = 0, const VectorDims& offsetPaddingToData = {}, + DnnlBlockedMemoryDesc(ov::element::Type prc, + const Shape& shape, + const VectorDims& blockedDims, + const VectorDims& order, + size_t offsetPadding = 0, + const VectorDims& offsetPaddingToData = {}, const VectorDims& strides = {}); - // Creates DnnlBlockedMemoryDesc using the shape parameter as a true shape but all other params (layout, blocks, etc.) are used from the mdesc, but - // the mdesc own shape is ignored. The main purpose of this constructor is making dynamic descriptor from some dummy mdesc, which stores info about - // layout, blocking, strides, etc., and the provided dynamic shape. + // Creates DnnlBlockedMemoryDesc using the shape parameter as a true shape but all other params (layout, blocks, + // etc.) are used from the mdesc, but the mdesc own shape is ignored. The main purpose of this constructor is making + // dynamic descriptor from some dummy mdesc, which stores info about layout, blocking, strides, etc., and the + // provided dynamic shape. 
DnnlBlockedMemoryDesc(const dnnl::memory::desc& mdesc, const Shape& shape); explicit DnnlBlockedMemoryDesc(const_dnnl_memory_desc_t cdesc); @@ -84,7 +90,8 @@ class DnnlBlockedMemoryDesc : public BlockedMemoryDesc, public DnnlMemoryDesc { bool isBlockedCFormat(size_t blk_size = UNREACHABLE_DIM) const; bool isTailCFormat() const; - // WA: we need to initialize blocked params into ctor to avoid bugs when we calculate these params in throughput mode + // WA: we need to initialize blocked params into ctor to avoid bugs when we calculate these params in throughput + // mode // TODO [DS]: should be reimplemented to avoid useless calculation void initBlockedParams() { initBlockDims(); @@ -99,7 +106,8 @@ class DnnlBlockedMemoryDesc : public BlockedMemoryDesc, public DnnlMemoryDesc { void recomputeDefaultStrides(); friend DnnlMemoryDescPtr DnnlExtensionUtils::makeDescriptor(const_dnnl_memory_desc_t desc); - friend std::shared_ptr DnnlExtensionUtils::makeUndefinedDesc(const dnnl::memory::desc &desc, const Shape& shape); + friend std::shared_ptr DnnlExtensionUtils::makeUndefinedDesc(const dnnl::memory::desc& desc, + const Shape& shape); friend class MemoryDescUtils; }; OPENVINO_DISABLE_WARNING_MSVC_END(4250) @@ -107,5 +115,5 @@ OPENVINO_DISABLE_WARNING_MSVC_END(4250) using DnnlBlockedMemoryDescPtr = std::shared_ptr; using DnnlBlockedMemoryDescCPtr = std::shared_ptr; -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/memory_desc/dnnl_memory_desc.cpp b/src/plugins/intel_cpu/src/memory_desc/dnnl_memory_desc.cpp index 3e3af41cfc523a..375b218272ed57 100644 --- a/src/plugins/intel_cpu/src/memory_desc/dnnl_memory_desc.cpp +++ b/src/plugins/intel_cpu/src/memory_desc/dnnl_memory_desc.cpp @@ -3,20 +3,21 @@ // #include "dnnl_memory_desc.h" -#include "dnnl_extension_utils.h" + #include #include + +#include "dnnl_extension_utils.h" #include "onednn/dnnl.h" namespace ov { namespace intel_cpu { -DnnlMemoryDesc::DnnlMemoryDesc(const dnnl::memory::desc& desc) : - DnnlMemoryDesc(desc.get()) {} +DnnlMemoryDesc::DnnlMemoryDesc(const dnnl::memory::desc& desc) : DnnlMemoryDesc(desc.get()) {} -DnnlMemoryDesc::DnnlMemoryDesc(const_dnnl_memory_desc_t cdesc) : - MemoryDesc(Shape(DnnlExtensionUtils::convertToVectorDims(cdesc->dims, cdesc->ndims)), Dnnl), - desc(DnnlExtensionUtils::clone_desc(cdesc)) { +DnnlMemoryDesc::DnnlMemoryDesc(const_dnnl_memory_desc_t cdesc) + : MemoryDesc(Shape(DnnlExtensionUtils::convertToVectorDims(cdesc->dims, cdesc->ndims)), Dnnl), + desc(DnnlExtensionUtils::clone_desc(cdesc)) { if (getFormatKind() == dnnl::memory::format_kind::any) OPENVINO_THROW("Unexpected: Memory format any is prohibited!"); } @@ -35,7 +36,7 @@ MemoryDescPtr DnnlMemoryDesc::cloneWithNewPrecision(const ov::element::Type prec return newDesc; } -bool DnnlMemoryDesc::isCompatible(const MemoryDesc &rhs) const { +bool DnnlMemoryDesc::isCompatible(const MemoryDesc& rhs) const { if (MemoryDescType::Dnnl & rhs.getType()) { auto* dnnMemDesc = rhs.as(); return isCompatible(*dnnMemDesc); @@ -52,17 +53,25 @@ std::string DnnlMemoryDesc::serializeFormat() const { dnnl::impl::memory_desc_wrapper wrapped(desc.get()); if (wrapped.is_wino_desc()) { switch (desc.get()->format_desc.wino_desc.wino_format) { - case dnnl::impl::wino_memory_format_t::wino_wei_aaOio: return "wino_aaOio"; - case dnnl::impl::wino_memory_format_t::wino_wei_aaOBiOo: return "wino_aaOBiOo"; - case dnnl::impl::wino_memory_format_t::wino_wei_OBaaIBOIio: return "wino_OBaaIBOIio"; - default: return 
"wino_undef"; + case dnnl::impl::wino_memory_format_t::wino_wei_aaOio: + return "wino_aaOio"; + case dnnl::impl::wino_memory_format_t::wino_wei_aaOBiOo: + return "wino_aaOBiOo"; + case dnnl::impl::wino_memory_format_t::wino_wei_OBaaIBOIio: + return "wino_OBaaIBOIio"; + default: + return "wino_undef"; } } else if (wrapped.is_rnn_packed_desc()) { switch (desc.get()->format_desc.rnn_packed_desc.format) { - case dnnl::impl::rnn_packed_format::ldigo_p: return "packed_ldigo"; - case dnnl::impl::rnn_packed_format::ldgoi_p: return "packed_ldgoi"; - case dnnl::impl::rnn_packed_format::ldio_p: return "packed_ldio"; - default: return "packed_undef"; + case dnnl::impl::rnn_packed_format::ldigo_p: + return "packed_ldigo"; + case dnnl::impl::rnn_packed_format::ldgoi_p: + return "packed_ldgoi"; + case dnnl::impl::rnn_packed_format::ldio_p: + return "packed_ldio"; + default: + return "packed_undef"; } } return "undef"; @@ -116,7 +125,7 @@ bool DnnlMemoryDesc::isDefinedImp() const { return wrappedThis.offset0() != DNNL_RUNTIME_DIM_VAL; } -MemoryDescPtr DnnlMemoryDesc::cloneWithNewDimsImp(const VectorDims &dims) const { +MemoryDescPtr DnnlMemoryDesc::cloneWithNewDimsImp(const VectorDims& dims) const { OPENVINO_THROW("Unexpected: Cannot clone non blocked oneDNN desc with new dims"); } @@ -125,6 +134,5 @@ size_t DnnlMemoryDesc::getOffsetPadding() const { return DnnlExtensionUtils::convertToDim(wrap.offset0()); } - -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/memory_desc/dnnl_memory_desc.h b/src/plugins/intel_cpu/src/memory_desc/dnnl_memory_desc.h index f2722a5170f871..6b3692c5663078 100644 --- a/src/plugins/intel_cpu/src/memory_desc/dnnl_memory_desc.h +++ b/src/plugins/intel_cpu/src/memory_desc/dnnl_memory_desc.h @@ -4,11 +4,11 @@ #pragma once -#include "dnnl_extension_utils.h" #include #include -#include "memory_desc/cpu_memory_desc.h" + #include "dnnl_extension_utils.h" +#include "memory_desc/cpu_memory_desc.h" namespace ov { namespace intel_cpu { @@ -29,13 +29,17 @@ class DnnlMemoryDesc : public virtual MemoryDesc { bool isCompatible(const MemoryDesc& rhs) const override; bool isCompatible(const DnnlMemoryDesc& rhs) const; - bool hasLayoutType(LayoutType layoutType) const override { return false; } + bool hasLayoutType(LayoutType layoutType) const override { + return false; + } std::string serializeFormat() const override; size_t getMaxMemSize() const override; - virtual bool isSame(dnnl::memory::format_tag fmt) const { return false; } + virtual bool isSame(dnnl::memory::format_tag fmt) const { + return false; + } const dnnl::memory::desc& getDnnlDesc() const { return desc; @@ -70,10 +74,9 @@ class DnnlMemoryDesc : public virtual MemoryDesc { bool isDefinedImp() const override; MemoryDescPtr cloneWithNewDimsImp(const VectorDims& dims) const override; - friend DnnlMemoryDescPtr DnnlExtensionUtils::makeDescriptor(const dnnl::memory::desc &desc); + friend DnnlMemoryDescPtr DnnlExtensionUtils::makeDescriptor(const dnnl::memory::desc& desc); friend DnnlMemoryDescPtr DnnlExtensionUtils::makeDescriptor(const_dnnl_memory_desc_t desc); }; -} // namespace intel_cpu -} // namespace ov - +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/memory_desc/empty_memory_desc.h b/src/plugins/intel_cpu/src/memory_desc/empty_memory_desc.h index 4b641669262591..c26cc6aa33a251 100644 --- a/src/plugins/intel_cpu/src/memory_desc/empty_memory_desc.h +++ b/src/plugins/intel_cpu/src/memory_desc/empty_memory_desc.h @@ 
-5,7 +5,6 @@ #pragma once #include "cpu_memory_desc.h" - #include "cpu_shape.h" #include "openvino/core/except.hpp" #include "openvino/core/type/element_type.hpp" @@ -23,8 +22,7 @@ namespace intel_cpu { */ class EmptyMemoryDesc : public MemoryDesc { public: - EmptyMemoryDesc(): - MemoryDesc(Shape{0}, Empty) { + EmptyMemoryDesc() : MemoryDesc(Shape{0}, Empty) { /* status never changes for an empty memory desc * so "define" beforehand to ensure isDefined() is thread safe */ status = MemoryDesc::descStatus::Defined; @@ -59,7 +57,11 @@ class EmptyMemoryDesc : public MemoryDesc { } MemoryDescPtr cloneWithNewPrecision(const ov::element::Type prec) const override { - OPENVINO_THROW("Clone an empty memory desc with any precision (", prec, ") is prohibited"); + OPENVINO_ASSERT(prec == ov::element::undefined, + "Clone an empty memory desc with defined precision: ", + prec, + " is prohibited"); + return clone(); } private: @@ -90,5 +92,5 @@ class EmptyMemoryDesc : public MemoryDesc { using EmptyMemoryDescPtr = std::shared_ptr; using EmptyMemoryDescCPtr = std::shared_ptr; -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/memory_state.cpp b/src/plugins/intel_cpu/src/memory_state.cpp index aa06f4ebd82957..c0dc85c4103ce4 100644 --- a/src/plugins/intel_cpu/src/memory_state.cpp +++ b/src/plugins/intel_cpu/src/memory_state.cpp @@ -5,30 +5,33 @@ #include "memory_state.h" #include + #include "cpu_memory.h" +#include "cpu_tensor.h" +#include "dnnl_extension_utils.h" #include "memory_desc/cpu_blocked_memory_desc.h" #include "memory_desc/cpu_memory_desc_utils.h" -#include "dnnl_extension_utils.h" -#include "cpu_tensor.h" -#include "utils/plain_tensor.hpp" -#include "openvino/core/parallel.hpp" #include "nodes/common/cpu_convert.h" #include "nodes/kernels/scaled_attn/attn_quant.hpp" +#include "openvino/core/parallel.hpp" +#include "utils/plain_tensor.hpp" using namespace ov::Extensions::Cpu::XARCH; namespace ov { namespace intel_cpu { -VariableStateBase::VariableStateBase(const std::string& name, const MemoryDescPtr& external_desc) : - IVariableState{name} , m_external_desc{external_desc} {} +VariableStateBase::VariableStateBase(const std::string& name, const MemoryDescPtr& external_desc) + : IVariableState{name}, + m_external_desc{external_desc} {} MemoryDescPtr VariableStateBase::to_static(const MemoryDescPtr& desc) { if (!desc->isDefined()) { auto&& current_dims = desc->getShape().getDims(); VectorDims new_dims(current_dims.size()); std::transform(current_dims.begin(), current_dims.end(), new_dims.begin(), [](Dim x) { - return x == Shape::UNDEFINED_DIM ? 0 : x; }); + return x == Shape::UNDEFINED_DIM ? 
0 : x; + }); return desc->cloneWithNewDims(new_dims, true); } @@ -71,21 +74,26 @@ ov::SoPtr VariableStateBase::get_state() const { return std::make_shared(internal_state_mem()); } - //test precision + // test precision { auto internal_prc = current_internal_desc->getPrecision(); auto tmp_desc = current_ext_desc->cloneWithNewPrecision(internal_prc); if (tmp_desc->isCompatible(*current_internal_desc)) { auto mem = std::make_shared(get_engine(), current_ext_desc); - size_t elements_to_convert = internal_state_mem()->getDescWithType()->getPaddedElementsCount(); + size_t elements_to_convert = + internal_state_mem()->getDescWithType()->getPaddedElementsCount(); auto external_prc = current_ext_desc->getPrecision(); - cpu_convert(internal_state_mem()->getData(), mem->getData(), internal_prc, external_prc, elements_to_convert); + cpu_convert(internal_state_mem()->getData(), + mem->getData(), + internal_prc, + external_prc, + elements_to_convert); return std::make_shared(mem); } } - //reorder + // reorder auto mem = std::make_shared(get_engine(), current_ext_desc); mem->load(*(internal_state_mem())); return std::make_shared(mem); @@ -108,19 +116,19 @@ void VariableStateBase::commit() { VariableStateDoubleBuffer::VariableStateDoubleBuffer(const std::string& name, const MemoryPtr& first_buffer, const MemoryPtr& second_buffer, - const MemoryDescPtr& external_desc) : - VariableStateBase(name, external_desc) { + const MemoryDescPtr& external_desc) + : VariableStateBase(name, external_desc) { OPENVINO_ASSERT(first_buffer && second_buffer); reset_prime_mem(first_buffer); reset_second_mem(second_buffer); m_internal_desc = prime_mem()->getDescPtr(); auto&& shape = m_internal_desc->getShape(); - //TODO what if by some reason we already have internal static state while the node is dynamic, is it even possible? + // TODO what if by some reason we already have internal static state while the node is dynamic, is it even possible? 
if (shape.isStatic()) { prime_mem()->nullify(); } else { - //in the case of the original desc has dynamic shape we create an empty tensor + // in the case of the original desc has dynamic shape we create an empty tensor auto new_desc = to_static(m_internal_desc); prime_mem()->redefineDesc(new_desc); } @@ -199,11 +207,11 @@ void VariableStateSingleBuffer::commit_impl() { // nothing to do } -VariableStateKVcache::VariableStateKVcache( - const std::string& name, - const MemoryDescPtr& external_desc, - const BlockedMemoryDescPtr& dense_internal_desc) : - VariableStateBase(name, external_desc), m_dense_internal_desc(dense_internal_desc) { +VariableStateKVcache::VariableStateKVcache(const std::string& name, + const MemoryDescPtr& external_desc, + const BlockedMemoryDescPtr& dense_internal_desc) + : VariableStateBase(name, external_desc), + m_dense_internal_desc(dense_internal_desc) { auto&& shape = external_desc->getShape(); OPENVINO_ASSERT(shape.isDynamic(), "VariableStateKVcache is unexpectedly initalized with a static tensor"); @@ -227,7 +235,7 @@ ov::SoPtr VariableStateKVcache::get_state() const { OPENVINO_ASSERT(actual_external_desc->getShape().getRank() == 4); auto&& actual_internal_order = actual_internal_desc->getOrder(); - //sanity check + // sanity check OPENVINO_ASSERT(actual_internal_order == m_dense_internal_desc->getOrder()); PlainTensor output, pastkv, beam_table; @@ -253,20 +261,12 @@ ov::SoPtr VariableStateKVcache::get_state() const { S, m_scale_zp.ptr(m, b_kv, h)[0], m_scale_zp.ptr(m, b_kv, h)[1]); - cpu_convert(buffers[ithr].ptr(), - output.ptr_v(m, b, h), - element::f32, - output.m_dt, - S); + cpu_convert(buffers[ithr].ptr(), output.ptr_v(m, b, h), element::f32, output.m_dt, S); }); } else { parallel_for3d(L0, B, H, [&](size_t m, size_t b, size_t h) { auto b_kv = static_cast(beam_table.at({b, m})); - cpu_convert(pastkv.ptr_v(m, b_kv, h), - output.ptr_v(m, b, h), - pastkv.m_dt, - output.m_dt, - S); + cpu_convert(pastkv.ptr_v(m, b_kv, h), output.ptr_v(m, b, h), pastkv.m_dt, output.m_dt, S); }); } @@ -274,11 +274,11 @@ ov::SoPtr VariableStateKVcache::get_state() const { } void VariableStateKVcache::set_state_impl(const ov::SoPtr& state) { - //1. reset the memory object - m_state = state; // simply to extend the lifetime + // 1. 
reset the memory object
+ m_state = state; // simply to extend the lifetime
 auto state_desc = MemoryDescUtils::generateCpuBlockedMemoryDesc(m_state);
- //May be optimized by reusing the state tensor underlining memory pointer, but corner cases should be considered
+ // May be optimized by reusing the state tensor underlying memory pointer, but corner cases should be considered
 auto dense_internal_desc = m_dense_internal_desc->cloneWithNewDims(state_desc->getShape().getStaticDims());
 m_internal_mem = std::make_shared(get_engine(), dense_internal_desc);
@@ -287,7 +287,10 @@ void VariableStateKVcache::set_state_impl(const ov::SoPtr& state) {
 if (dense_internal_desc->getPrecision() == element::u8) {
 PlainTensor external, internal;
 auto&& actual_internal_order = m_dense_internal_desc->getOrder();
- external.resize(external_mem.getStaticDims(), state_desc->getPrecision().size(), state_desc->getPrecision(), m_state->data());
+ external.resize(external_mem.getStaticDims(),
+ state_desc->getPrecision().size(),
+ state_desc->getPrecision(),
+ m_state->data());
 internal.reset(m_internal_mem);
 external = external.permute(actual_internal_order);
 internal = internal.permute(actual_internal_order);
@@ -300,11 +303,7 @@ void VariableStateKVcache::set_state_impl(const ov::SoPtr& state) {
 m_scale_zp.resize({L0, B, H, 2});
 parallel_for3d(B, H, L0, [&](size_t ithr, size_t b, size_t h, size_t m) {
 buffers[ithr].resize({S});
- cpu_convert(external.ptr_v(m, b, h),
- buffers[ithr].ptr(),
- external.m_dt,
- element::f32,
- S);
+ cpu_convert(external.ptr_v(m, b, h), buffers[ithr].ptr(), external.m_dt, element::f32, S);
 attn_quant_u8(buffers[ithr].ptr(),
 internal.ptr(m, b, h),
 S,
@@ -315,14 +314,13 @@ void VariableStateKVcache::set_state_impl(const ov::SoPtr& state) {
 m_internal_mem->load(external_mem);
 }
- //2.
Reset the beam search table auto&& state_dims = dense_internal_desc->getShape().getStaticDims(); auto&& order = m_dense_internal_desc->getOrder(); const size_t size_B = state_dims[order.at(1)]; const size_t size_L = state_dims[order.at(0)]; - auto mem_desc = - std::make_shared(ov::element::i32, Shape{size_B, size_L}); + auto mem_desc = std::make_shared(ov::element::i32, Shape{size_B, size_L}); m_hidden_state = std::make_shared(get_engine(), mem_desc); auto buff = m_hidden_state->getDataAs(); @@ -336,11 +334,11 @@ void VariableStateKVcache::set_state_impl(const ov::SoPtr& state) { } void VariableStateKVcache::reset_impl() { - //nothing to do + // nothing to do } void VariableStateKVcache::commit_impl() { - //nothing to do + // nothing to do } MemoryPtr VariableStateKVcache::input_mem() { @@ -352,7 +350,7 @@ MemoryPtr VariableStateKVcache::output_mem() { } MemoryDescPtr VariableStateKVcache::internal_desc() const { - return m_dense_internal_desc; //since we don't store initial one + return m_dense_internal_desc; // since we don't store initial one } MemoryPtr VariableStateKVcache::internal_state_mem() const { diff --git a/src/plugins/intel_cpu/src/memory_state.h b/src/plugins/intel_cpu/src/memory_state.h index e7493f327e93fa..f35e78989b02f8 100644 --- a/src/plugins/intel_cpu/src/memory_state.h +++ b/src/plugins/intel_cpu/src/memory_state.h @@ -29,12 +29,12 @@ class VariableStateBase : public IVariableState { public: VariableStateBase(const std::string& name, const MemoryDescPtr& external_desc); - //ov::IVariableState - void set_state(const ov::SoPtr& state) override final; // NOLINT + // ov::IVariableState + void set_state(const ov::SoPtr& state) override final; // NOLINT ov::SoPtr get_state() const override; - void reset() override final; // NOLINT - bool is_reset_state() const override final; // NOLINT - void commit() override final; // NOLINT + void reset() override final; // NOLINT + bool is_reset_state() const override final; // NOLINT + void commit() override final; // NOLINT protected: virtual MemoryPtr internal_state_mem() const = 0; @@ -66,7 +66,7 @@ class VariableStateDoubleBuffer : public VariableStateBase { MemoryDescPtr internal_desc() const override; private: - //ov::intel_cpu::VariableStateBase + // ov::intel_cpu::VariableStateBase void reset_impl() override; void commit_impl() override; @@ -89,7 +89,7 @@ class VariableStateDoubleBuffer : public VariableStateBase { MemoryPtr internal_state_mem() const override; private: - MemoryDescPtr m_internal_desc; //mem desc required by the graph internal tensor + MemoryDescPtr m_internal_desc; // mem desc required by the graph internal tensor std::array m_internal_mem{}; size_t buffer_num = 0; }; @@ -111,7 +111,7 @@ class VariableStateSingleBuffer : public VariableStateBase { MemoryPtr internal_state_mem() const override; private: - MemoryDescPtr m_internal_desc; //mem desc required by the graph internal tensor + MemoryDescPtr m_internal_desc; // mem desc required by the graph internal tensor MemoryPtr m_internal_mem; }; @@ -121,10 +121,10 @@ class VariableStateKVcache : public VariableStateBase { const MemoryDescPtr& external_desc, const BlockedMemoryDescPtr& dense_internal_desc); - //ov::IVariableState + // ov::IVariableState ov::SoPtr get_state() const override; - //ov::intel_cpu::VariableStateBase + // ov::intel_cpu::VariableStateBase MemoryPtr input_mem() override; MemoryPtr output_mem() override; MemoryDescPtr internal_desc() const override; @@ -158,14 +158,14 @@ class VariableStateKVcache : public VariableStateBase { } private: - 
//ov::intel_cpu::VariableStateBase + // ov::intel_cpu::VariableStateBase void set_state_impl(const ov::SoPtr& state) override; void reset_impl() override; void commit_impl() override; private: - MemoryPtr m_internal_mem; // kv cache - MemoryPtr m_hidden_state; // beam access table + MemoryPtr m_internal_mem; // kv cache + MemoryPtr m_hidden_state; // beam access table size_t m_internal_mem_max_size = 0; size_t m_hidden_state_max_size = 0; @@ -178,5 +178,5 @@ class VariableStateKVcache : public VariableStateBase { using MemStatePtr = std::shared_ptr; using MemStateCPtr = std::shared_ptr; -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/mlas/thread_pool.hpp b/src/plugins/intel_cpu/src/mlas/thread_pool.hpp index 536b3746be1d69..5af8b0cce915fa 100644 --- a/src/plugins/intel_cpu/src/mlas/thread_pool.hpp +++ b/src/plugins/intel_cpu/src/mlas/thread_pool.hpp @@ -7,6 +7,7 @@ #include #include #include + #include "mlas.h" namespace ov { @@ -17,6 +18,7 @@ class OVMlasThreadPool : public IMlasThreadPool { explicit OVMlasThreadPool(const size_t& threadNum) : threadNum(threadNum) {} size_t DegreeOfParallelism() override; void TrySimpleParallelFor(const std::ptrdiff_t total, const std::function& fn) override; + public: // the actual threads used for sgemm size_t threadNum = 0; diff --git a/src/plugins/intel_cpu/src/node.cpp b/src/plugins/intel_cpu/src/node.cpp index f4c2b0eb686df6..ddf8d068f920a2 100644 --- a/src/plugins/intel_cpu/src/node.cpp +++ b/src/plugins/intel_cpu/src/node.cpp @@ -3,38 +3,38 @@ // #include "node.h" -#include "cpu_types.h" -#include "edge.h" -#include "partitioned_mem_blk.h" +#include +#include + +#include +#include +#include #include #include -#include +#include #include -#include #include +#include +#include "cpu_types.h" +#include "dnnl_extension_utils.h" +#include "edge.h" +#include "memory_desc/cpu_memory_desc_utils.h" +#include "memory_desc/dnnl_blocked_memory_desc.h" +#include "nodes/common/cpu_convert.h" #include "nodes/conv.h" #include "nodes/eltwise.h" #include "nodes/input.h" -#include "nodes/reorder.h" #include "nodes/reference.h" -#include "dnnl_extension_utils.h" - +#include "nodes/reorder.h" +#include "openvino/core/type/element_type.hpp" +#include "partitioned_mem_blk.h" +#include "utils/cpu_utils.hpp" #include "utils/debug_capabilities.h" +#include "utils/general_utils.h" #include "utils/ngraph_utils.hpp" #include "utils/rt_info/memory_formats_attribute.hpp" -#include - -#include -#include -#include "utils/general_utils.h" -#include "utils/cpu_utils.hpp" -#include "nodes/common/cpu_convert.h" -#include "memory_desc/cpu_memory_desc_utils.h" -#include "memory_desc/dnnl_blocked_memory_desc.h" -#include -#include using namespace dnnl; using namespace openvino; @@ -43,7 +43,7 @@ using namespace ov::intel_cpu::node; namespace ov { namespace intel_cpu { -Node::NodesFactory & Node::factory() { +Node::NodesFactory& Node::factory() { static NodesFactory factoryInstance; return factoryInstance; } @@ -62,7 +62,7 @@ Node::Node(const std::shared_ptr& op, type(TypeFromName(op->get_type_name())), profiling(op->get_friendly_name()) { for (size_t i = 0; i < op->get_input_size(); i++) { - const auto &shape = op->get_input_partial_shape(i); + const auto& shape = op->get_input_partial_shape(i); if (shape.rank().is_dynamic()) { OPENVINO_THROW("Unexpected: CPU plug-in doesn't support ", getTypeStr(), @@ -82,7 +82,7 @@ Node::Node(const std::shared_ptr& op, OPENVINO_THROW("Node with type '", typeStr, "' and 
name '", name, "' does not have any outputs."); } for (size_t i = 0; i < op->get_output_size(); i++) { - const auto &shape = op->get_output_partial_shape(i); + const auto& shape = op->get_output_partial_shape(i); if (shape.rank().is_dynamic()) { OPENVINO_THROW("Unexpected: CPU plug-in doesn't support ", getTypeStr(), @@ -98,8 +98,14 @@ Node::Node(const std::shared_ptr& op, childEdges.reserve(outputShapes.size()); } - isDynamic = std::any_of(inputShapes.begin(), inputShapes.end(), [](const Shape& shape){ return shape.isDynamic(); }) || - std::any_of(outputShapes.begin(), outputShapes.end(), [](const Shape& shape){ return shape.isDynamic(); }); + isDynamic = std::any_of(inputShapes.begin(), + inputShapes.end(), + [](const Shape& shape) { + return shape.isDynamic(); + }) || + std::any_of(outputShapes.begin(), outputShapes.end(), [](const Shape& shape) { + return shape.isDynamic(); + }); if (isDynamic) { shapeInference = shapeInferFactory.makeShapeInfer(); @@ -126,12 +132,13 @@ Node::Node(const std::shared_ptr& op, if (str.substr(0, 4) != "cpu:") continue; customImplPriorities.push_back(parse_impl_name(str)); - if (customImplPriorities.back() == impl_desc_type::unknown && - str != "cpu:unknown") + if (customImplPriorities.back() == impl_desc_type::unknown && str != "cpu:unknown") OPENVINO_THROW("Unsupported CPU implementation ", str, " for node ", getName()); } const auto& defaultImplPriorities = getDefaultImplPriority(); - customImplPriorities.insert(customImplPriorities.end(), defaultImplPriorities.begin(), defaultImplPriorities.end()); + customImplPriorities.insert(customImplPriorities.end(), + defaultImplPriorities.begin(), + defaultImplPriorities.end()); } std::string inputMemoryFormats = getInputMemoryFormats(op); @@ -198,10 +205,11 @@ void Node::addEdge(const EdgePtr& edge) { } void Node::remove() { - auto drop = [](std::vector edges){ + auto drop = [](std::vector edges) { for (auto& edge : edges) { auto edgePtr = edge.lock(); - if (!edgePtr) continue; + if (!edgePtr) + continue; edgePtr->getParent()->removeChildEdge(edgePtr); edgePtr->getChild()->removeParentEdge(edgePtr); } @@ -212,7 +220,7 @@ void Node::remove() { } bool Node::isEdgesEmpty(const std::vector& edges) const { - for (auto &edge : edges) { + for (auto& edge : edges) { if (edge.lock()) return false; } @@ -264,7 +272,8 @@ void Node::selectPreferPrimitiveDescriptor(const std::vector& pr auto parentEdge = getParentEdgeAt(j); auto parentPtr = parentEdge->getParent(); - // We don't take into account constant edges since reorders on them will be executed on load network stage + // We don't take into account constant edges since reorders on them will be executed on load network + // stage if (ignoreConstInputs && j > 0 && parentPtr->isConstant()) { equalsLocalFormatCount++; continue; @@ -285,10 +294,20 @@ void Node::selectPreferPrimitiveDescriptor(const std::vector& pr equalsLocalFormatCount++; } - DEBUG_LOG(getName(), " pd[", i, "].inConfs[", j, "]" - " is ", (isCompatible ? "compatible" : "not compatible"), - " with parent ", parentPtr->getName(), - " outConfs[", inNum, "], equalsLocalFormatCount add to ", equalsLocalFormatCount); + DEBUG_LOG(getName(), + " pd[", + i, + "].inConfs[", + j, + "]" + " is ", + (isCompatible ? 
"compatible" : "not compatible"), + " with parent ", + parentPtr->getName(), + " outConfs[", + inNum, + "], equalsLocalFormatCount add to ", + equalsLocalFormatCount); } if (equalsLocalFormatCount > equalsFormatCount) { @@ -333,7 +352,8 @@ bool Node::isReorderRequired(ov::intel_cpu::MemoryDescPtr desc1, ov::intel_cpu:: return !(isOneDimShape1 && isOneDimShape2 && samePrec); } -void Node::selectPreferPrimitiveDescriptorWithShape(const std::vector& priority, bool ignoreConstInputs) { +void Node::selectPreferPrimitiveDescriptorWithShape(const std::vector& priority, + bool ignoreConstInputs) { // Filter out dynamic shape. if (isDynamic) { return selectPreferPrimitiveDescriptor(priority, ignoreConstInputs); @@ -370,11 +390,22 @@ void Node::selectPreferPrimitiveDescriptorWithShape(const std::vectorgetShape().toPartialShape()) ? "one dim shape" : "not one dim shape"), - " with parent ", parentPtr->getName(), - " outConfs[", inNum, "], estimate add to ", estimate); + DEBUG_LOG(getName(), + " pd[", + i, + "].inConfs[", + j, + "]" + " is ", + (isCompatible ? "compatible" : "not compatible"), + " shape is ", + (isOneDimShape(curDesc->getShape().toPartialShape()) ? "one dim shape" : "not one dim shape"), + " with parent ", + parentPtr->getName(), + " outConfs[", + inNum, + "], estimate add to ", + estimate); } } return estimate; @@ -442,7 +473,7 @@ bool Node::canBeInPlace() const { } if (getParentEdges().size() != 1 || getParentEdgeAt(0)->getParent()->getChildEdges().size() != 1 || - (getParentEdgeAt(0)->getParent()->isConstant() && !getParentEdgeAt(0)->getChild()->isConstant())) + (getParentEdgeAt(0)->getParent()->isConstant() && !getParentEdgeAt(0)->getChild()->isConstant())) return false; // TODO: we need to extend this logic to properly handle all possible inplace conflicts @@ -462,7 +493,7 @@ bool Node::canBeInPlace() const { } void Node::resolveInPlaceEdges(Edge::LOOK look) { - const NodeDesc *selected_pd = getSelectedPrimitiveDescriptor(); + const NodeDesc* selected_pd = getSelectedPrimitiveDescriptor(); if (!selected_pd) OPENVINO_THROW("Cannot find selected primitive descriptor for node: ", getName()); if (look & Edge::LOOK_DOWN) { @@ -477,16 +508,19 @@ void Node::resolveInPlaceEdges(Edge::LOOK look) { " Unexpected inplace resolve call to an allocated edge: ", *parentEdge); - //search for already allocated edge + // search for already allocated edge const auto& childEdges = getChildEdgesAtPort(inplaceOutIndx); - auto itr = std::find_if(childEdges.begin(), childEdges.end(), [](const EdgePtr& edge) { return edge->getStatus() == Edge::Status::Allocated; }); + auto itr = std::find_if(childEdges.begin(), childEdges.end(), [](const EdgePtr& edge) { + return edge->getStatus() == Edge::Status::Allocated; + }); OPENVINO_ASSERT(itr != childEdges.end(), " Could not find an allocated edge to resolve in-place for node: ", getName()); auto baseMemBlock = (*itr)->getMemory().getMemoryBlock(); auto memBlock = std::make_shared(baseMemBlock); - auto newMem = std::make_shared(getEngine(), selected_pd->getConfig().inConfs[i].getMemDesc(), memBlock); + auto newMem = + std::make_shared(getEngine(), selected_pd->getConfig().inConfs[i].getMemDesc(), memBlock); parentEdge->reuse(newMem); } } @@ -505,7 +539,8 @@ void Node::resolveInPlaceEdges(Edge::LOOK look) { OPENVINO_ASSERT(childEdge->getStatus() == Edge::Status::NotAllocated, " Unexpected inplace resolve call to an allocated edge: ", *childEdge); - auto newMem = std::make_shared(getEngine(), selected_pd->getConfig().outConfs[i].getMemDesc(), memBlock); + auto 
newMem = + std::make_shared(getEngine(), selected_pd->getConfig().outConfs[i].getMemDesc(), memBlock); childEdge->reuse(newMem); } } @@ -565,9 +600,9 @@ std::string Node::getPrimitiveDescriptorType() const { str_type += t; }; -#define SEARCH_TYPE(_type) \ - if ((type & impl_desc_type::_type) == impl_desc_type::_type) \ - add_type(#_type) +#define SEARCH_TYPE(_type) \ + if ((type & impl_desc_type::_type) == impl_desc_type::_type) \ + add_type(#_type) SEARCH_TYPE(undef); SEARCH_TYPE(reorder); @@ -608,13 +643,19 @@ std::string Node::getPrimitiveDescriptorType() const { if (selectedPrimitiveDesc) { if (!selectedPrimitiveDesc->getConfig().inConfs.empty()) { if (selectedPrimitiveDesc->getConfig().inConfs[0].getMemDesc()->getPrecision() != ov::element::u8) { - str_type += "_" + std::string(selectedPrimitiveDesc->getConfig().inConfs[0].getMemDesc()->getPrecision().get_type_name()); + str_type += + "_" + + std::string( + selectedPrimitiveDesc->getConfig().inConfs[0].getMemDesc()->getPrecision().get_type_name()); } else { str_type += "_I8"; } } else { if (selectedPrimitiveDesc->getConfig().outConfs[0].getMemDesc()->getPrecision() != ov::element::u8) { - str_type += "_" + std::string(selectedPrimitiveDesc->getConfig().outConfs[0].getMemDesc()->getPrecision().get_type_name()); + str_type += + "_" + + std::string( + selectedPrimitiveDesc->getConfig().outConfs[0].getMemDesc()->getPrecision().get_type_name()); } else { str_type += "_I8"; } @@ -650,7 +691,7 @@ std::vector Node::getChildEdgesAtPort(int inputNum) const { OPENVINO_THROW("Node ", getName(), " contains less output ports than ", inputNum); std::vector res; - for (auto &edge_w : childEdges) { + for (auto& edge_w : childEdges) { auto edge = edge_w.lock(); if (!edge) OPENVINO_THROW("Node ", getName(), " contains dead weak ptr"); @@ -660,7 +701,7 @@ std::vector Node::getChildEdgesAtPort(int inputNum) const { return res; } -std::vector Node::getAvailableFormatsForDims(const Shape &dims) const { +std::vector Node::getAvailableFormatsForDims(const Shape& dims) const { if (dims.getRank() == 0) return {memory::format_tag::x}; else if (dims.getRank() == 1) @@ -668,8 +709,11 @@ std::vector Node::getAvailableFormatsForDims(const Shape &di else if (dims.getRank() == 2) return {memory::format_tag::nc}; else if (dims.getRank() == 3) - return {memory::format_tag::tnc, memory::format_tag::ntc, - memory::format_tag::ncw, memory::format_tag::nCw8c, memory::format_tag::nCw16c }; + return {memory::format_tag::tnc, + memory::format_tag::ntc, + memory::format_tag::ncw, + memory::format_tag::nCw8c, + memory::format_tag::nCw16c}; else if (dims.getRank() == 4) return {memory::format_tag::nchw, memory::format_tag::nChw8c, memory::format_tag::nChw16c}; else if (dims.getRank() == 5) @@ -694,36 +738,36 @@ void Node::updateShapes() { getTypeStr(), " with name: ", getName()); - try { - if (needShapeInfer()) { - auto result = shapeInfer(); - if (ShapeInferStatus::success == result.status) { - redefineOutputMemory(result.dims); + try { + if (needShapeInfer()) { + auto result = shapeInfer(); + if (ShapeInferStatus::success == result.status) { + redefineOutputMemory(result.dims); + } + } else { + // guard check for internal dynamic nodes to avoid possible overestimation of the required memory size + if (shapeInference && FULL_PORT_MASK == shapeInference->get_port_mask()) + return; + + for (auto&& edge : getChildEdges()) { + auto edge_ptr = edge.lock(); + CPU_NODE_ASSERT(edge_ptr, " has null edge"); + if (edge_ptr->inPlace(Edge::LOOK_UP)) { + continue; } - } else { - //guard check 
for internal dynamic nodes to avoid possible overestimation of the required memory size - if (shapeInference && FULL_PORT_MASK == shapeInference->get_port_mask()) - return; - - for (auto&& edge : getChildEdges()) { - auto edge_ptr = edge.lock(); - CPU_NODE_ASSERT(edge_ptr, " has null edge"); - if (edge_ptr->inPlace(Edge::LOOK_UP)) { - continue; - } - auto mem = edge_ptr->getMemoryPtr(); - CPU_NODE_ASSERT(mem, " has null output memory"); + auto mem = edge_ptr->getMemoryPtr(); + CPU_NODE_ASSERT(mem, " has null output memory"); - if (mem->getShape().hasZeroDims()) { - continue; - } - fetchRawMemory(mem); + if (mem->getShape().hasZeroDims()) { + continue; } + fetchRawMemory(mem); } - } catch (const std::exception& exp) { - THROW_CPU_NODE_ERR(exp.what()); } + } catch (const std::exception& exp) { + THROW_CPU_NODE_ERR(exp.what()); + } } void Node::updateDynamicParams() { @@ -735,10 +779,17 @@ void Node::updateDynamicParams() { try { if (isExecutable()) { if (needPrepareParams()) { - OPENVINO_ASSERT(inputShapesDefined(), - "Input shapes are not defined."); - DEBUG_LOG(" prepareParams() on #", getExecIndex(), " ", getTypeStr(), " ", algToString(getAlgorithm()), - " ", getName(), " ", getOriginalLayers()); + OPENVINO_ASSERT(inputShapesDefined(), "Input shapes are not defined."); + DEBUG_LOG(" prepareParams() on #", + getExecIndex(), + " ", + getTypeStr(), + " ", + algToString(getAlgorithm()), + " ", + getName(), + " ", + getOriginalLayers()); prepareParams(); } } @@ -781,7 +832,7 @@ bool Node::outputShapeDataDependency() const { return false; } -void Node::redefineOutputMemory(const std::vector &newOutputShapes) { +void Node::redefineOutputMemory(const std::vector& newOutputShapes) { if (newOutputShapes.size() != outputShapes.size()) { OPENVINO_THROW("Number shapes mismatch with real outputs number for node with name: ", getName()); } @@ -840,34 +891,45 @@ void Node::initSupportedPrimitiveDescriptors() { }; /* When custom implementation priorities are NOT defined it is enough to - * just use the first implementation from the priority list. - * When custom implementation priorities are defined, all the implementations should be considered, - * since custom implementations can be not available at all, so a fallback to the default ones must happen - * To achive the fallback, it is necessary to create a supported primitive descriptor for each implementation - * since oneDNN primitive is mutating while iterating */ + * just use the first implementation from the priority list. 
+ * When custom implementation priorities are defined, all the implementations should be considered, + * since custom implementations can be not available at all, so a fallback to the default ones must happen + * To achive the fallback, it is necessary to create a supported primitive descriptor for each implementation + * since oneDNN primitive is mutating while iterating */ #ifdef CPU_DEBUG_CAPS { - if (!customImplPriorities.empty()) { - DEBUG_LOG("#", getName(), " customImplPriorities [", 0 , "/", customImplPriorities.size(), - "]: ", impl_type_to_string(customImplPriorities[0])); - } + if (!customImplPriorities.empty()) { + DEBUG_LOG("#", + getName(), + " customImplPriorities [", + 0, + "/", + customImplPriorities.size(), + "]: ", + impl_type_to_string(customImplPriorities[0])); + } } #endif for (auto& desc : descs) { auto first_desc = dnnl::primitive_desc(DnnlExtensionUtils::clone_primitive_desc(desc.get())); const bool first_match = customImplPriorities.empty(); - DEBUG_LOG("#", getName(), - ", itpd.impl_info_str(): ", desc.impl_info_str(), - ", parsed imp_type: ", impl_type_to_string(parse_impl_name(desc.impl_info_str())), - ", first_match: ", first_match ? "true" : "false"); - DnnlExtensionUtils::for_each_implementation(desc, - first_match, - [&](impl_desc_type implType) { - return contains(getImplPriority(), implType); - }, - [&](dnnl::primitive_desc& desc) { - addSupportedPrimitiveDescriptor(desc); - }); + DEBUG_LOG("#", + getName(), + ", itpd.impl_info_str(): ", + desc.impl_info_str(), + ", parsed imp_type: ", + impl_type_to_string(parse_impl_name(desc.impl_info_str())), + ", first_match: ", + first_match ? "true" : "false"); + DnnlExtensionUtils::for_each_implementation( + desc, + first_match, + [&](impl_desc_type implType) { + return contains(getImplPriority(), implType); + }, + [&](dnnl::primitive_desc& desc) { + addSupportedPrimitiveDescriptor(desc); + }); // fallback. if none of the primitive types is present in the priority list just add first implementation // @todo this fallback is not necessary if primitive priority list is filled correctly @@ -888,22 +950,29 @@ void Node::filterSupportedPrimitiveDescriptors() { }; auto isNotSuitableDesc = [&](const NodeDesc& desc) { - const auto &config = desc.getConfig(); - if (inputMemoryFormatsFilter.size() > config.inConfs.size() || outputMemoryFormatsFilter.size() > config.outConfs.size()) + const auto& config = desc.getConfig(); + if (inputMemoryFormatsFilter.size() > config.inConfs.size() || + outputMemoryFormatsFilter.size() > config.outConfs.size()) OPENVINO_THROW("Incorrect number of input or output memory formats"); for (size_t i = 0; i < inputMemoryFormatsFilter.size(); i++) { if (!areCompatible(*config.inConfs[i].getMemDesc(), inputMemoryFormatsFilter[i])) { - DEBUG_LOG(getName(), " input memory format filter: ", inputMemoryFormatsFilter[i], - " not matched. Erase desc from supported primitive descriptors: ", desc); + DEBUG_LOG(getName(), + " input memory format filter: ", + inputMemoryFormatsFilter[i], + " not matched. Erase desc from supported primitive descriptors: ", + desc); return true; } } for (size_t i = 0; i < outputMemoryFormatsFilter.size(); i++) { if (!areCompatible(*config.outConfs[i].getMemDesc(), outputMemoryFormatsFilter[i])) { - DEBUG_LOG(getName(), " Output memory format filter: ", outputMemoryFormatsFilter[i], - " not matched. Erase desc from supported primitive descriptors: ", desc); + DEBUG_LOG(getName(), + " Output memory format filter: ", + outputMemoryFormatsFilter[i], + " not matched. 
Erase desc from supported primitive descriptors: ", + desc); return true; } } @@ -931,7 +1000,8 @@ void Node::initDescriptor(const NodeConfig& config) { if (descs.empty()) { const auto& selectedConfig = selectedPD->getConfig(); - if (selectedConfig.inConfs.size() != config.inConfs.size() || selectedConfig.outConfs.size() != config.outConfs.size()) + if (selectedConfig.inConfs.size() != config.inConfs.size() || + selectedConfig.outConfs.size() != config.outConfs.size()) return; for (size_t i = 0; i < selectedConfig.inConfs.size(); i++) { @@ -948,19 +1018,19 @@ void Node::initDescriptor(const NodeConfig& config) { return; } - auto updateNodeConfig = [&](const NodeConfig& cfg){ + auto updateNodeConfig = [&](const NodeConfig& cfg) { auto updatedConfig = cfg; for (size_t i = 0; i < descInputNumbers(); i++) { PortConfig& dataConfig = updatedConfig.inConfs[i]; - dataConfig.inPlace(canBeInPlace() ? 0 : -1); // update inPlace - dataConfig.setMemDesc(dataConfig.getMemDesc()); // reset desc with default compatibility mask + dataConfig.inPlace(canBeInPlace() ? 0 : -1); // update inPlace + dataConfig.setMemDesc(dataConfig.getMemDesc()); // reset desc with default compatibility mask } for (size_t i = 0; i < descOutputNumbers(); i++) { PortConfig& dataConfig = updatedConfig.outConfs[i]; - dataConfig.inPlace(-1); // update inPlace - dataConfig.setMemDesc(dataConfig.getMemDesc()); // reset desc with default compatibility mask + dataConfig.inPlace(-1); // update inPlace + dataConfig.setMemDesc(dataConfig.getMemDesc()); // reset desc with default compatibility mask } return updatedConfig; @@ -1016,8 +1086,8 @@ void Node::prepareMemory(const DnnlMemoryDescPtr& intDesc, size_t indx) { MemoryPtr ptr; auto weightCache = context->getWeightsCache(); if (weightCache != nullptr && memory::format_kind::blocked == intDesc->getDnnlDesc().get_format_kind()) { - const auto string_hash = - name + "_" + std::to_string(indx) + "_" + DnnlExtensionUtils::computeWeightsStringHash(internalBlob, intDesc); + const auto string_hash = name + "_" + std::to_string(indx) + "_" + + DnnlExtensionUtils::computeWeightsStringHash(internalBlob, intDesc); ptr = *weightCache->findOrCreate(string_hash, create); } else { ptr = create(); @@ -1042,7 +1112,7 @@ void Node::prepareMemory(const std::vector& intDescs) { void Node::prepareMemory(dnnl::primitive_desc_iterator& itpd) { std::vector intDescs; - for (auto &it : internalBlobDesc) + for (auto& it : internalBlobDesc) intDescs.push_back(it(itpd, 0)); Node::prepareMemory(intDescs); @@ -1062,8 +1132,8 @@ MemoryPtr Node::prepareWeightMemory(DnnlMemoryDescPtr dstWeightDesc, DnnlMemoryD srcWeightDesc = DnnlExtensionUtils::makeDescriptor(weightSrcDesc); } - auto create = [&] () { - Memory srcMemory{ getEngine(), srcWeightDesc, edgeMem->getData() }; + auto create = [&]() { + Memory srcMemory{getEngine(), srcWeightDesc, edgeMem->getData()}; MemoryPtr _ptr = std::make_shared(getEngine(), dstWeightDesc); node::Reorder::reorderData(srcMemory, *_ptr, context->getParamsCache()); @@ -1106,7 +1176,7 @@ void Node::toNumaNodeImpl(int numaNodeID) { // create scratch pad from specified numa node if (scratchpadMem) { - scratchpadMem = context->getScratchPad(numaNodeID)->createScratchPadMem(scratchpadMem->getDescPtr()); + scratchpadMem = context->getScratchPad()->createScratchPadMem(scratchpadMem->getDescPtr()); primArgs[DNNL_ARG_SCRATCHPAD] = scratchpadMem->getPrimitive(); } @@ -1127,13 +1197,13 @@ bool Node::isInPlace() const { inplace = InPlaceType::NoInPlace; auto config = selected_pd->getConfig(); - for (auto 
&in : config.inConfs) { + for (auto& in : config.inConfs) { if (in.inPlace() >= 0) { inplace = InPlaceType::InPlace; break; } } - for (auto &out : config.outConfs) { + for (auto& out : config.outConfs) { if (out.inPlace() >= 0) { inplace = InPlaceType::InPlace; break; @@ -1164,7 +1234,7 @@ void Node::updateConstantType() { const auto prevConstantType = constant; constant = isConst ? ConstantType::Const : ConstantType::NoConst; if (constant == prevConstantType) - return; // state has not changed, no reason to continue + return; // state has not changed, no reason to continue for (const auto& childEdge : getChildEdges()) { const auto childNode = childEdge.lock()->getChild(); @@ -1173,7 +1243,8 @@ void Node::updateConstantType() { } void Node::addOriginalLayer(const std::string& layerName) { - if (layerName.empty()) return; + if (layerName.empty()) + return; if (originalLayers.empty()) { originalLayers = layerName; } else { @@ -1196,46 +1267,25 @@ void Node::cleanup() { const std::vector& Node::getDefaultImplPriority() { static const std::vector priorities { impl_desc_type::unknown, - // Undef impl type is used to express use-cases there real type is unkown during compilation - // Undef has higher priority than defined types in order to force primitive selection logic to make decision based on other properties - impl_desc_type::undef, - impl_desc_type::brgconv_avx512_amx_1x1, - impl_desc_type::brgconv_avx512_amx, - impl_desc_type::jit_avx512_amx_dw, - impl_desc_type::jit_avx512_amx_1x1, - impl_desc_type::jit_avx512_amx, - // Brgconv kernels disabled in order to prevent perf degradations on non AMX HW - // impl_desc_type::brgconv_avx512_1x1, - // impl_desc_type::brgconv_avx512, - impl_desc_type::jit_uni_dw, - impl_desc_type::jit_uni_1x1, - impl_desc_type::jit_uni, - impl_desc_type::jit_avx512_dw, - impl_desc_type::jit_avx512_1x1, - impl_desc_type::jit_avx512, - impl_desc_type::jit_avx2_dw, - impl_desc_type::jit_avx2_1x1, - impl_desc_type::jit_avx2, - impl_desc_type::jit_avx_dw, - impl_desc_type::jit_avx_1x1, - impl_desc_type::jit_avx, - impl_desc_type::jit_sse42_dw, - impl_desc_type::jit_sse42_1x1, - impl_desc_type::jit_sse42, + // Undef impl type is used to express use-cases there real type is unkown during compilation + // Undef has higher priority than defined types in order to force primitive selection logic to make decision + // based on other properties + impl_desc_type::undef, impl_desc_type::brgconv_avx512_amx_1x1, impl_desc_type::brgconv_avx512_amx, + impl_desc_type::jit_avx512_amx_dw, impl_desc_type::jit_avx512_amx_1x1, impl_desc_type::jit_avx512_amx, + // Brgconv kernels disabled in order to prevent perf degradations on non AMX HW + // impl_desc_type::brgconv_avx512_1x1, + // impl_desc_type::brgconv_avx512, + impl_desc_type::jit_uni_dw, impl_desc_type::jit_uni_1x1, impl_desc_type::jit_uni, + impl_desc_type::jit_avx512_dw, impl_desc_type::jit_avx512_1x1, impl_desc_type::jit_avx512, + impl_desc_type::jit_avx2_dw, impl_desc_type::jit_avx2_1x1, impl_desc_type::jit_avx2, + impl_desc_type::jit_avx_dw, impl_desc_type::jit_avx_1x1, impl_desc_type::jit_avx, + impl_desc_type::jit_sse42_dw, impl_desc_type::jit_sse42_1x1, impl_desc_type::jit_sse42, #if defined(OPENVINO_ARCH_ARM64) - impl_desc_type::jit_asimd, + impl_desc_type::jit_asimd, #endif - impl_desc_type::gemm_any, - impl_desc_type::gemm_blas, - impl_desc_type::gemm_avx512, - impl_desc_type::gemm_avx2, - impl_desc_type::gemm_avx, - impl_desc_type::gemm_sse42, - impl_desc_type::gemm_acl, - impl_desc_type::acl, - impl_desc_type::jit_gemm, 
- impl_desc_type::ref_any, - impl_desc_type::ref, + impl_desc_type::gemm_any, impl_desc_type::gemm_blas, impl_desc_type::gemm_avx512, impl_desc_type::gemm_avx2, + impl_desc_type::gemm_avx, impl_desc_type::gemm_sse42, impl_desc_type::gemm_acl, impl_desc_type::acl, + impl_desc_type::jit_gemm, impl_desc_type::ref_any, impl_desc_type::ref, }; return priorities; @@ -1245,30 +1295,31 @@ const std::vector& Node::getImplPriority() { if (!customImplPriorities.empty()) return customImplPriorities; - return getDefaultImplPriority(); } -PortDescBasePtr Node::getConsistentInputDesc(const NodeConfig &config, size_t idx) const { +PortDescBasePtr Node::getConsistentInputDesc(const NodeConfig& config, size_t idx) const { const auto& inConf = config.inConfs[idx]; - if (inConf.inPlace() >= 0) { // node have inplace input + if (inConf.inPlace() >= 0) { // node have inplace input auto inplaceIndx = static_cast(inConf.inPlace()); PortDescBasePtr outPortDesc; const auto& outConf = config.outConfs[inplaceIndx]; - if (outConf.inPlace() == static_cast(idx)) { // the input desc port is the same port used for inplace output - outPortDesc = outConf.getPortDesc(); // just use desc from this output port + if (outConf.inPlace() == + static_cast(idx)) { // the input desc port is the same port used for inplace output + outPortDesc = outConf.getPortDesc(); // just use desc from this output port } else { - outPortDesc = getConsistentOutputDesc(config, inplaceIndx); // get consistent desc otherwise + outPortDesc = getConsistentOutputDesc(config, inplaceIndx); // get consistent desc otherwise } - if (inConf.getPortDesc()->isCompatible(*outPortDesc)) { // use the desc if compatible + if (inConf.getPortDesc()->isCompatible(*outPortDesc)) { // use the desc if compatible return outPortDesc; } } - auto *parentSelectedPD = getParentEdgeAt(idx)->getParent()->getSelectedPrimitiveDescriptor(); + auto* parentSelectedPD = getParentEdgeAt(idx)->getParent()->getSelectedPrimitiveDescriptor(); if (!parentSelectedPD) - OPENVINO_THROW("Cannot get selected primitive descriptor for node: ", getParentEdgeAt(idx)->getParent()->getName()); + OPENVINO_THROW("Cannot get selected primitive descriptor for node: ", + getParentEdgeAt(idx)->getParent()->getName()); int num = getParentEdgeAt(idx)->getInputNum(); if (num >= 0) { @@ -1289,26 +1340,28 @@ PortDescBasePtr Node::getConsistentInputDesc(const NodeConfig &config, size_t id return inConf.getPortDesc(); } -PortDescBasePtr Node::getConsistentOutputDesc(const NodeConfig &config, size_t idx) const { +PortDescBasePtr Node::getConsistentOutputDesc(const NodeConfig& config, size_t idx) const { const auto& outConf = config.outConfs[idx]; - if (outConf.inPlace() >= 0) { // node have inplace output + if (outConf.inPlace() >= 0) { // node have inplace output auto inplaceIndx = static_cast(outConf.inPlace()); PortDescBasePtr inpPortDesc; const auto& inpConf = config.inConfs[inplaceIndx]; - if (inpConf.inPlace() == static_cast(idx)) { // the input desc port is the same port used for inplace output - inpPortDesc = inpConf.getPortDesc(); // just use desc from this output port + if (inpConf.inPlace() == + static_cast(idx)) { // the input desc port is the same port used for inplace output + inpPortDesc = inpConf.getPortDesc(); // just use desc from this output port } else { - inpPortDesc = getConsistentInputDesc(config, inplaceIndx); // get consistent desc otherwise + inpPortDesc = getConsistentInputDesc(config, inplaceIndx); // get consistent desc otherwise } - if (outConf.getPortDesc()->isCompatible(*inpPortDesc)) 
{ // use the desc if compatible + if (outConf.getPortDesc()->isCompatible(*inpPortDesc)) { // use the desc if compatible return inpPortDesc; } } - auto *childSelectedPD = getChildEdgeAt(idx)->getChild()->getSelectedPrimitiveDescriptor(); + auto* childSelectedPD = getChildEdgeAt(idx)->getChild()->getSelectedPrimitiveDescriptor(); if (!childSelectedPD) - OPENVINO_THROW("Cannot get selected primitive descriptor for node: ", getChildEdgeAt(idx)->getChild()->getName()); + OPENVINO_THROW("Cannot get selected primitive descriptor for node: ", + getChildEdgeAt(idx)->getChild()->getName()); int num = getChildEdgeAt(idx)->getOutputNum(); if (num >= 0) { @@ -1330,7 +1383,7 @@ PortDescBasePtr Node::getConsistentOutputDesc(const NodeConfig &config, size_t i } void Node::initOptimalPrimitiveDescriptor() { - if (one_of(getType(), Type::RNNCell, Type::RNNSeq)) // can be skipped for RNN node + if (one_of(getType(), Type::RNNCell, Type::RNNSeq)) // can be skipped for RNN node return; auto selected_pd = getSelectedPrimitiveDescriptor(); @@ -1357,7 +1410,8 @@ void Node::initOptimalPrimitiveDescriptor() { // it is assumed that the nodes will define dense tensors on output edges // if it is not the case the implementation must redefine this behaviour if (outMemDesc->getType() & Blocked) { - config.outConfs[i].setMemDesc(std::dynamic_pointer_cast(outMemDesc), BlockedMemoryDesc::FULL_MASK); + config.outConfs[i].setMemDesc(std::dynamic_pointer_cast(outMemDesc), + BlockedMemoryDesc::FULL_MASK); } } } @@ -1365,9 +1419,9 @@ void Node::initOptimalPrimitiveDescriptor() { initDescriptor(config); } -bool Node::isConfigDefined(const NodeConfig &config) const { +bool Node::isConfigDefined(const NodeConfig& config) const { for (const auto& configs : {config.inConfs, config.outConfs}) { - for (const auto &dc : configs) { + for (const auto& dc : configs) { if (!dc.getMemDesc()->isDefined()) return false; } @@ -1375,14 +1429,14 @@ bool Node::isConfigDefined(const NodeConfig &config) const { return true; } -MemoryDescPtr Node::getSrcMemDesc(const dnnl::primitive_desc &prim_desc, size_t idx) const { +MemoryDescPtr Node::getSrcMemDesc(const dnnl::primitive_desc& prim_desc, size_t idx) const { if (getInputShapeAtPort(idx).isDynamic()) { return DnnlExtensionUtils::makeUndefinedDesc(prim_desc.src_desc(idx), getInputShapeAtPort(idx)); } return DnnlExtensionUtils::makeDescriptor(prim_desc.src_desc(idx)); } -MemoryDescPtr Node::getDstMemDesc(const dnnl::primitive_desc &prim_desc, size_t idx) const { +MemoryDescPtr Node::getDstMemDesc(const dnnl::primitive_desc& prim_desc, size_t idx) const { if (getOutputShapeAtPort(idx).isDynamic()) { return DnnlExtensionUtils::makeUndefinedDesc(prim_desc.dst_desc(idx), getOutputShapeAtPort(idx)); } @@ -1392,7 +1446,7 @@ MemoryDescPtr Node::getDstMemDesc(const dnnl::primitive_desc &prim_desc, size_t void Node::appendPostOpArgs(const dnnl::primitive_attr& attr, std::unordered_map& primArgs, const std::unordered_map& postOpsArgs) { - for (auto & entry : postOpsArgs) { + for (auto& entry : postOpsArgs) { primArgs[entry.first] = entry.second->getPrimitive(); } } @@ -1425,11 +1479,17 @@ dnnl::memory::format_tag Node::getWeightsFormatTagByDims(const VectorDims& dims) } } -void Node::appendPostOps(dnnl::post_ops& ops, const VectorDims &postOpDims, std::unordered_map& postOpsMem, const int channelAxis) { +void Node::appendPostOps(dnnl::post_ops& ops, + const VectorDims& postOpDims, + std::unordered_map& postOpsMem, + const int channelAxis) { OPENVINO_THROW("Fusing of ", NameFromType(this->getType()), " 
operation is not implemented"); } -void Node::appendPostOps(dnnl::post_ops& ops, const VectorDims &postOpDims, std::vector& postOpsMem, const int channelAxis) { +void Node::appendPostOps(dnnl::post_ops& ops, + const VectorDims& postOpDims, + std::vector& postOpsMem, + const int channelAxis) { OPENVINO_THROW("Fusing of ", NameFromType(this->getType()), " operation is not implemented"); } @@ -1473,12 +1533,12 @@ ov::element::Type Node::getRuntimePrecision() const { } Node* Node::NodesFactory::create(const std::shared_ptr& op, const GraphContext::CPtr context) { - // getExceptionDescWithoutStatus removes redundant information from the exception message. For instance, the NotImplemented - // exception is generated in the form: full_path_to_src_file:line_number [ NOT_IMPLEMENTED ] reason. + // getExceptionDescWithoutStatus removes redundant information from the exception message. For instance, the + // NotImplemented exception is generated in the form: full_path_to_src_file:line_number [ NOT_IMPLEMENTED ] reason. // An example for gather node: - // /path-to-openVino-root/src/plugins/intel_cpu/nodes/gather.cpp:42 [ NOT_IMPLEMENTED ] Only opset7 Gather operation is supported - // The most important part of the message is the reason, so the lambda trims everything up to "]" - // Note that the op type and its friendly name will also be provided if we fail to create the node. + // /path-to-openVino-root/src/plugins/intel_cpu/nodes/gather.cpp:42 [ NOT_IMPLEMENTED ] Only opset7 Gather operation + // is supported The most important part of the message is the reason, so the lambda trims everything up to "]" Note + // that the op type and its friendly name will also be provided if we fail to create the node. auto getExceptionDescWithoutStatus = [](const ov::Exception& ex) { std::string desc = ex.what(); size_t pos = desc.find(']'); @@ -1491,7 +1551,7 @@ Node* Node::NodesFactory::create(const std::shared_ptr& op, const Grap } return desc; }; - Node *newNode = nullptr; + Node* newNode = nullptr; std::string errorMessage; if (newNode == nullptr) { try { @@ -1538,7 +1598,7 @@ Node* Node::NodesFactory::create(const std::shared_ptr& op, const Grap return newNode; } -bool Node::canBePerformedAsScaleShift(const Node *parentNode) const { +bool Node::canBePerformedAsScaleShift(const Node* parentNode) const { #if defined(OPENVINO_ARCH_X86_64) OPENVINO_ASSERT(parentNode); @@ -1546,7 +1606,7 @@ bool Node::canBePerformedAsScaleShift(const Node *parentNode) const { const auto channelAxis = parentNode->getFusingAxis(); for (size_t i = 0; i < getParentEdges().size(); i++) { - Node *node = getParentEdgeAt(i)->getParent().get(); + Node* node = getParentEdgeAt(i)->getParent().get(); if (node == nullptr) { OPENVINO_THROW("Cannot get parent node for ", getName(), " on ", i, " port"); } @@ -1574,7 +1634,7 @@ bool Node::canBePerformedAsScaleShift(const Node *parentNode) const { const auto isConvertablePowerStatic = [&]() { if (getAlgorithm() == Algorithm::EltwisePowerStatic) { - const auto eltwise = dynamic_cast(this); + const auto eltwise = dynamic_cast(this); if (!eltwise) { OPENVINO_THROW("Cannot cast ", getName(), " to Eltwise"); } @@ -1583,13 +1643,15 @@ bool Node::canBePerformedAsScaleShift(const Node *parentNode) const { return false; }; - return (one_of(getAlgorithm(), Algorithm::EltwiseAdd, - Algorithm::EltwiseMultiply, - Algorithm::EltwiseSubtract, - Algorithm::EltwiseDivide, - Algorithm::EltwisePrelu, - Algorithm::EltwiseMulAdd) && isBroadcastableToDataInput()) - || isConvertablePowerStatic(); + return 
(one_of(getAlgorithm(), + Algorithm::EltwiseAdd, + Algorithm::EltwiseMultiply, + Algorithm::EltwiseSubtract, + Algorithm::EltwiseDivide, + Algorithm::EltwisePrelu, + Algorithm::EltwiseMulAdd) && + isBroadcastableToDataInput()) || + isConvertablePowerStatic(); #else // TODO: provide correct list of operations for other backends return false; @@ -1599,11 +1661,11 @@ bool Node::canBePerformedAsScaleShift(const Node *parentNode) const { // @todo shifts for Subtract and scales for Divide are replaced with // Add (with opposite sign) and Multiply (with inverse value) for legacy dephwise post ops // This can be avoided after dephwise post ops are gone -std::pair, std::vector> Node::getScalesAndShifts(const Node *parentNode) const { +std::pair, std::vector> Node::getScalesAndShifts(const Node* parentNode) const { std::vector scales, shifts; const auto fillValuesFrom = [&](const NodePtr& constInput, std::vector& buffer) { - auto *constInputNode = dynamic_cast(constInput.get()); + auto* constInputNode = dynamic_cast(constInput.get()); if (!constInputNode) { OPENVINO_THROW("Cannot cast ", constInput->getName(), " to Input"); } @@ -1627,7 +1689,7 @@ std::pair, std::vector> Node::getScalesAndShifts(const fillValuesFrom(getParentEdgeAt(1)->getParent(), scales); fillValuesFrom(getParentEdgeAt(2)->getParent(), shifts); } else if (one_of(getAlgorithm(), Algorithm::EltwisePowerStatic)) { - const auto power = dynamic_cast(this); + const auto power = dynamic_cast(this); if (!power) { OPENVINO_THROW("Cannot cast ", getName(), " to Eltwise"); } @@ -1638,25 +1700,30 @@ std::pair, std::vector> Node::getScalesAndShifts(const } switch (getAlgorithm()) { - case Algorithm::EltwiseAdd: { - scales.resize(shifts.size(), 1.0f); - break; - } - case Algorithm::EltwiseSubtract: { - scales.resize(shifts.size(), 1.0f); - std::transform(shifts.begin(), shifts.end(), shifts.begin(), [](float shift){ return -1.0f * shift; }); - break; - } - case Algorithm::EltwiseMultiply: { - shifts.resize(scales.size(), 0.0f); - break; - } - case Algorithm::EltwiseDivide: { - shifts.resize(scales.size(), 0.0f); - std::transform(scales.begin(), scales.end(), scales.begin(), [](float scale){ return 1.0f / scale; }); - break; - } - default: break; + case Algorithm::EltwiseAdd: { + scales.resize(shifts.size(), 1.0f); + break; + } + case Algorithm::EltwiseSubtract: { + scales.resize(shifts.size(), 1.0f); + std::transform(shifts.begin(), shifts.end(), shifts.begin(), [](float shift) { + return -1.0f * shift; + }); + break; + } + case Algorithm::EltwiseMultiply: { + shifts.resize(scales.size(), 0.0f); + break; + } + case Algorithm::EltwiseDivide: { + shifts.resize(scales.size(), 0.0f); + std::transform(scales.begin(), scales.end(), scales.begin(), [](float scale) { + return 1.0f / scale; + }); + break; + } + default: + break; } return {scales, shifts}; @@ -1673,7 +1740,7 @@ bool Node::isInputTensorAtPortEmpty(size_t port) const { auto edge = getParentEdgeAt(port); if (one_of(edge->getStatus(), Edge::Status::Allocated, Edge::Status::Validated)) { auto&& mem = edge->getMemory(); - if (mem.isDefined()) { + if (mem.isDefined() && !mem.getDesc().empty()) { return mem.getShape().hasZeroDims(); } } @@ -1823,22 +1890,25 @@ bool Node::canFuseSimpleOperation(const NodePtr& node) const { return ret; } else if (node->getType() == Type::Eltwise) { return DnnlExtensionUtils::isUnarySupportedAsPostOp(node->getAlgorithm()) || - node->canBePerformedAsScaleShift(this); + node->canBePerformedAsScaleShift(this); } return false; } -void Node::addFusedNode(const NodePtr 
&fusingNode) { +void Node::addFusedNode(const NodePtr& fusingNode) { fusedWith.push_back(fusingNode); } void Node::addSupportedPrimDesc(const std::vector& inPortConfigs, const std::vector& outPortConfigs, impl_desc_type implType) { - auto fill_port = [] (const PortConfigurator& portConfigurator, const Shape& shape, - ov::element::Type prc, std::vector& port) -> bool { - // In order to simplify particular node initialization logic we just don't add config in case target shape is not supported by blockedDescCreator. - // This should be suitable for major of scenarios since almost all nodes add `ncsp` blockedDescCreator which supports any shape rank. + auto fill_port = [](const PortConfigurator& portConfigurator, + const Shape& shape, + ov::element::Type prc, + std::vector& port) -> bool { + // In order to simplify particular node initialization logic we just don't add config in case target shape is + // not supported by blockedDescCreator. This should be suitable for major of scenarios since almost all nodes + // add `ncsp` blockedDescCreator which supports any shape rank. if (shape.getRank() < portConfigurator.blockedDescCreator->getMinimalRank()) return false; @@ -1855,14 +1925,16 @@ void Node::addSupportedPrimDesc(const std::vector& inPortConfi NodeConfig config; for (size_t i = 0; i < inPortConfigs.size(); i++) { auto shape = inPortConfigs[i].shape.getRank() == 0 ? getInputShapeAtPort(i) : inPortConfigs[i].shape; - auto prc = inPortConfigs[i].prc == ov::element::undefined ? getOriginalInputPrecisionAtPort(i) : inPortConfigs[i].prc; + auto prc = + inPortConfigs[i].prc == ov::element::undefined ? getOriginalInputPrecisionAtPort(i) : inPortConfigs[i].prc; if (!fill_port(inPortConfigs[i], shape, prc, config.inConfs)) return; } for (size_t i = 0; i < outPortConfigs.size(); i++) { auto dims = outPortConfigs[i].shape.getRank() == 0 ? getOutputShapeAtPort(i) : outPortConfigs[i].shape; - auto prc = outPortConfigs[i].prc == ov::element::undefined ? getOriginalOutputPrecisionAtPort(i) : outPortConfigs[i].prc; + auto prc = outPortConfigs[i].prc == ov::element::undefined ? 
getOriginalOutputPrecisionAtPort(i) + : outPortConfigs[i].prc; if (!fill_port(outPortConfigs[i], dims, prc, config.outConfs)) return; } @@ -1883,23 +1955,27 @@ void Node::fuseDQScales(const float* scaleData, const size_t scaleSize) { if (scaleSize > DQScales.size()) DQScales.resize(scaleSize, DQScales[0]); if (1 == scaleSize) { - std::transform(DQScales.begin(), DQScales.end(), DQScales.begin(), [=](float val){ return (scaleData[0] * val); }); - } else { - for (size_t i = 0; i < DQScales.size(); i++) { - DQScales[i] *= scaleData[i]; - } - } - if (std::all_of(DQScales.begin(), DQScales.end(), [OV_CAPTURE_CPY_AND_THIS](float val){ return (val == DQScales[0]);})) + std::transform(DQScales.begin(), DQScales.end(), DQScales.begin(), [=](float val) { + return (scaleData[0] * val); + }); + } else { + for (size_t i = 0; i < DQScales.size(); i++) { + DQScales[i] *= scaleData[i]; + } + } + if (std::all_of(DQScales.begin(), DQScales.end(), [OV_CAPTURE_CPY_AND_THIS](float val) { + return (val == DQScales[0]); + })) DQScales.resize(1); } int Node::inPlaceInputPort(int portIdx) const { if (inputShapes.empty()) { - //special case - a dead end node + // special case - a dead end node return -1; } - const NodeDesc *selected_pd = getSelectedPrimitiveDescriptor(); + const NodeDesc* selected_pd = getSelectedPrimitiveDescriptor(); if (!selected_pd) OPENVINO_THROW("Cannot find selected primitive descriptor for node: ", getName()); @@ -1917,11 +1993,11 @@ int Node::inPlaceInputPort(int portIdx) const { int Node::inPlaceOutPort(int portIdx) const { if (outputShapes.empty()) { - //special case - a dead end node + // special case - a dead end node return -1; } - const NodeDesc *selected_pd = getSelectedPrimitiveDescriptor(); + const NodeDesc* selected_pd = getSelectedPrimitiveDescriptor(); if (!selected_pd) OPENVINO_THROW("Cannot find selected primitive descriptor for node: ", getName()); @@ -1938,8 +2014,8 @@ int Node::inPlaceOutPort(int portIdx) const { } void Node::resolveInPlaceDirection() { - enum InplaceDirectionType {UP, DOWN, CYCLIC, NONE}; - enum PortType {INPUT, OUTPUT}; + enum InplaceDirectionType { UP, DOWN, CYCLIC, NONE }; + enum PortType { INPUT, OUTPUT }; auto inPlaceDirection = [](const Node* node, PortType portType, int portNum) -> InplaceDirectionType { if (PortType::INPUT == portType) { @@ -1989,7 +2065,8 @@ void Node::resolveInPlaceDirection() { if (auto pEdge = wEdge.lock()) { auto inpPort = pEdge->getOutputNum(); auto inPlaceInpPort = inPlaceInputPort(inpPort); - if (inPlaceInpPort < 0 || inPlaceDirection(this, PortType::INPUT, inpPort) != InplaceDirectionType::CYCLIC) { + if (inPlaceInpPort < 0 || + inPlaceDirection(this, PortType::INPUT, inpPort) != InplaceDirectionType::CYCLIC) { continue; } // inPlace memory cyclic dependency detected, need to resolve @@ -2001,12 +2078,14 @@ void Node::resolveInPlaceDirection() { config.inConfs[inpPort].inPlace(-1); initDescriptor(config); } else if (parentInPlaceDirection == InplaceDirectionType::DOWN) { - //search if siblings already have downstream direction + // search if siblings already have downstream direction auto downstreamPeers = [&] { for (auto& peerEdge : pParent->getChildEdgesAtPort(pEdge->getInputNum())) { auto peerNode = peerEdge->getChild().get(); - if (peerNode == this) continue; - if (inPlaceDirection(peerNode, PortType::INPUT, peerEdge->getOutputNum()) == InplaceDirectionType::DOWN) { + if (peerNode == this) + continue; + if (inPlaceDirection(peerNode, PortType::INPUT, peerEdge->getOutputNum()) == + InplaceDirectionType::DOWN) { return 
true; } } @@ -2067,7 +2146,8 @@ void Node::resolveInPlaceDirection() { // note: there are only non-inplace or cyclic-inplace descendants at the moment. std::function searchReferencingOutput; searchReferencingOutput = [&](const Node* node, int portIdx) -> void { - if (numConflicts > 1) return; // early stop + if (numConflicts > 1) + return; // early stop auto childEdges = node->getChildEdgesAtPort(portIdx); for (auto& edge : childEdges) { auto pChild = edge->getChild().get(); @@ -2076,7 +2156,8 @@ void Node::resolveInPlaceDirection() { } else { auto result = inPlaceDirection(pChild, PortType::INPUT, edge->getOutputNum()); if (InplaceDirectionType::CYCLIC == result) { - return searchReferencingOutput(pChild, pChild->inPlaceInputPort(edge->getOutputNum())); + return searchReferencingOutput(pChild, + pChild->inPlaceInputPort(edge->getOutputNum())); } } } @@ -2089,7 +2170,8 @@ void Node::resolveInPlaceDirection() { // note: the parent node does not use inPlace memory at the moment, let's check the siblings for (auto& peerEdge : pParent->getChildEdgesAtPort(pEdge->getInputNum())) { auto peerNode = peerEdge->getChild().get(); - if (peerNode == this) continue; + if (peerNode == this) + continue; if (Type::Output == peerNode->getType()) { numConflicts++; } else { @@ -2101,11 +2183,11 @@ void Node::resolveInPlaceDirection() { } } - if (numConflicts == 1) { // downstream to make the only output edge be referenced. + if (numConflicts == 1) { // downstream to make the only output edge be referenced. auto config = getSelectedPrimitiveDescriptor()->getConfig(); config.outConfs[inPlaceInpPort].inPlace(-1); initDescriptor(config); - } else { // the default direction of upstream + } else { // the default direction of upstream auto config = getSelectedPrimitiveDescriptor()->getConfig(); config.inConfs[inpPort].inPlace(-1); initDescriptor(config); @@ -2120,8 +2202,7 @@ void Node::resolveInPlaceDirection() { #ifndef CPU_DEBUG_CAPS std::ostream& operator<<(std::ostream& out, const Node& node) { - return out << "Node " << node.getName() << - " of type " << node.getTypeStr() << "\n"; + return out << "Node " << node.getName() << " of type " << node.getTypeStr() << "\n"; } std::ostream& operator<<(std::ostream& out, const Node* node) { @@ -2129,5 +2210,5 @@ std::ostream& operator<<(std::ostream& out, const Node* node) { } #endif -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/node.h b/src/plugins/intel_cpu/src/node.h index 948bd6999ce27a..9166e87dbf50e1 100644 --- a/src/plugins/intel_cpu/src/node.h +++ b/src/plugins/intel_cpu/src/node.h @@ -4,37 +4,38 @@ #pragma once +#include + #include +#include #include +#include +#include +#include +#include + #include "cpu_memory.h" #include "cpu_shape.h" #include "cpu_types.h" #include "edge.h" +#include "graph_context.h" #include "memory_desc/cpu_memory_desc.h" -#include "selective_build.h" #include "memory_desc/dnnl_memory_desc.h" +#include "nodes/executors/executor.hpp" +#include "nodes/node_config.h" #include "onednn/dnnl.h" #include "onednn/iml_type_mapper.h" -#include #include "openvino/cc/factory.h" #include "openvino/core/node.hpp" -#include -#include "nodes/node_config.h" -#include #include "perf_count.h" -#include "utils/debug_capabilities.h" +#include "selective_build.h" #include "utils/bit_util.hpp" #include "utils/debug_capabilities.h" -#include "graph_context.h" -#include "nodes/executors/executor.hpp" - -#include -#include -#include - -#define THROW_CPU_NODE_ERR(...) 
OPENVINO_THROW("[CPU] ", getTypeStr(), " node with name '", getName(), "' ", __VA_ARGS__) -#define CPU_NODE_ASSERT(condition, ...) OPENVINO_ASSERT(condition, getTypeStr(), " node with name '", getName(), "' ", __VA_ARGS__) +#define THROW_CPU_NODE_ERR(...) \ + OPENVINO_THROW("[CPU] ", getTypeStr(), " node with name '", getName(), "' ", __VA_ARGS__) +#define CPU_NODE_ASSERT(condition, ...) \ + OPENVINO_ASSERT(condition, getTypeStr(), " node with name '", getName(), "' ", __VA_ARGS__) namespace ov { namespace intel_cpu { @@ -45,13 +46,25 @@ using NodeWeakPtr = std::weak_ptr; class PortConfigurator { public: - PortConfigurator(ov::intel_cpu::LayoutType blockedDescType, ov::element::Type prc, const Shape& shape, - bool constant = false, int inPlace = -1) : - blockedDescCreator(getBlockedDescCreator(blockedDescType)), prc(prc), shape(shape), constant(constant), inPlace(inPlace) {} - - PortConfigurator(ov::intel_cpu::LayoutType blockedDescType, ov::element::Type prc = ov::element::undefined, - bool constant = false, int inPlace = -1) : - blockedDescCreator(getBlockedDescCreator(blockedDescType)), prc(prc), constant(constant), inPlace(inPlace) {} + PortConfigurator(ov::intel_cpu::LayoutType blockedDescType, + ov::element::Type prc, + const Shape& shape, + bool constant = false, + int inPlace = -1) + : blockedDescCreator(getBlockedDescCreator(blockedDescType)), + prc(prc), + shape(shape), + constant(constant), + inPlace(inPlace) {} + + PortConfigurator(ov::intel_cpu::LayoutType blockedDescType, + ov::element::Type prc = ov::element::undefined, + bool constant = false, + int inPlace = -1) + : blockedDescCreator(getBlockedDescCreator(blockedDescType)), + prc(prc), + constant(constant), + inPlace(inPlace) {} ov::intel_cpu::BlockedDescCreator::CreatorConstPtr blockedDescCreator; const ov::element::Type prc; @@ -60,7 +73,8 @@ class PortConfigurator { int inPlace = -1; private: - static ov::intel_cpu::BlockedDescCreator::CreatorConstPtr getBlockedDescCreator(ov::intel_cpu::LayoutType blockedDescType) { + static ov::intel_cpu::BlockedDescCreator::CreatorConstPtr getBlockedDescCreator( + ov::intel_cpu::LayoutType blockedDescType) { auto& creators = ov::intel_cpu::BlockedDescCreator::getCommonCreators(); if (creators.find(blockedDescType) == creators.end()) { OPENVINO_THROW("Cannot find tensor descriptor creator"); @@ -71,11 +85,15 @@ class PortConfigurator { class NodeDesc { public: - NodeDesc(NodeConfig conf, impl_desc_type type): - config(std::move(conf)), implementationType(type), executorFactory(nullptr) {} + NodeDesc(NodeConfig conf, impl_desc_type type) + : config(std::move(conf)), + implementationType(type), + executorFactory(nullptr) {} - NodeDesc(NodeConfig conf, impl_desc_type type, ExecutorFactoryLegacyPtr factory): - config(std::move(conf)), implementationType(type), executorFactory(factory) {} + NodeDesc(NodeConfig conf, impl_desc_type type, ExecutorFactoryLegacyPtr factory) + : config(std::move(conf)), + implementationType(type), + executorFactory(factory) {} const NodeConfig& getConfig() const { return config; @@ -98,8 +116,8 @@ class NodeDesc { } template ::value && !std::is_reference::value, int>::type = 0, - typename std::enable_if::value, int>::type = 0> + typename std::enable_if::value && !std::is_reference::value, int>::type = 0, + typename std::enable_if::value, int>::type = 0> std::shared_ptr getExecutorFactoryAs() { auto casted = std::dynamic_pointer_cast(executorFactory); if (!casted) @@ -119,34 +137,41 @@ class NodeDesc { class Node { public: - Node(const Node &) = delete; - Node 
& operator = (const Node &) = delete; + Node(const Node&) = delete; + Node& operator=(const Node&) = delete; using AttrPtr = std::shared_ptr; public: - template + template struct Tag {}; struct PerfCounters { PerfCounters(std::string const& name) - : execute(openvino::itt::handle(name)) - , getSupportedDescriptors(openvino::itt::handle>("Node::getSupportedDescriptors")) - , initSupportedPrimitiveDescriptors(openvino::itt::handle>("Node::initSupportedPrimitiveDescriptors")) - , filterSupportedPrimitiveDescriptors(openvino::itt::handle>("Node::filterSupportedPrimitiveDescriptors")) - , selectOptimalPrimitiveDescriptor(openvino::itt::handle>("Node::selectOptimalPrimitiveDescriptor")) - , createPrimitive(openvino::itt::handle>("Node::createPrimitive")) - , initOptimalPrimitiveDescriptor(openvino::itt::handle>("Node::initOptimalPrimitiveDescriptor")) - {} - - template + : execute(openvino::itt::handle(name)), + getSupportedDescriptors(openvino::itt::handle>("Node::getSupportedDescriptors")), + initSupportedPrimitiveDescriptors( + openvino::itt::handle>("Node::initSupportedPrimitiveDescriptors")), + filterSupportedPrimitiveDescriptors( + openvino::itt::handle>("Node::filterSupportedPrimitiveDescriptors")), + selectOptimalPrimitiveDescriptor( + openvino::itt::handle>("Node::selectOptimalPrimitiveDescriptor")), + createPrimitive(openvino::itt::handle>("Node::createPrimitive")), + initOptimalPrimitiveDescriptor( + openvino::itt::handle>("Node::initOptimalPrimitiveDescriptor")) {} + + template void buildClassCounters(const std::string& type_name) { getSupportedDescriptors = openvino::itt::handle>(type_name + "::getSupportedDescriptors"); - initSupportedPrimitiveDescriptors = openvino::itt::handle>(type_name + "::initSupportedPrimitiveDescriptors"); - filterSupportedPrimitiveDescriptors = openvino::itt::handle>(type_name + "::filterSupportedPrimitiveDescriptors"); - selectOptimalPrimitiveDescriptor = openvino::itt::handle>(type_name + "::selectOptimalPrimitiveDescriptor"); + initSupportedPrimitiveDescriptors = + openvino::itt::handle>(type_name + "::initSupportedPrimitiveDescriptors"); + filterSupportedPrimitiveDescriptors = + openvino::itt::handle>(type_name + "::filterSupportedPrimitiveDescriptors"); + selectOptimalPrimitiveDescriptor = + openvino::itt::handle>(type_name + "::selectOptimalPrimitiveDescriptor"); createPrimitive = openvino::itt::handle>(type_name + "::createPrimitive"); - initOptimalPrimitiveDescriptor = openvino::itt::handle>(type_name + "::initOptimalPrimitiveDescriptor"); + initOptimalPrimitiveDescriptor = + openvino::itt::handle>(type_name + "::initOptimalPrimitiveDescriptor"); } openvino::itt::handle_t execute; @@ -159,7 +184,7 @@ class Node { }; class NodesFactory; - static NodesFactory & factory(); + static NodesFactory& factory(); virtual ~Node() = default; @@ -171,11 +196,12 @@ class Node { void remove(); void addParentEdge(const EdgePtr& edge) { - assert(std::none_of(parentEdges.begin(), parentEdges.end(), - [&edge](const EdgeWeakPtr& _edge){ - return _edge.lock()->getOutputNum() == edge->getOutputNum(); - })); - parentEdges.insert(std::upper_bound(parentEdges.begin(), parentEdges.end(), edge, + assert(std::none_of(parentEdges.begin(), parentEdges.end(), [&edge](const EdgeWeakPtr& _edge) { + return _edge.lock()->getOutputNum() == edge->getOutputNum(); + })); + parentEdges.insert(std::upper_bound(parentEdges.begin(), + parentEdges.end(), + edge, [](const EdgeWeakPtr& lhs, const EdgeWeakPtr& rhs) { return lhs.lock()->getOutputNum() < rhs.lock()->getOutputNum(); }), @@ -196,11 
+222,11 @@ class Node { removeEdge(edge, childEdges); } - const std::vector &getParentEdges() const noexcept { + const std::vector& getParentEdges() const noexcept { return parentEdges; } - const std::vector &getChildEdges() const noexcept { + const std::vector& getChildEdges() const noexcept { return childEdges; } @@ -238,7 +264,7 @@ class Node { return getSrcMemoryAtPort(idx)->getData(); } - template + template T* getSrcDataAtPortAs(size_t idx) const { return getSrcMemoryAtPort(idx)->getDataAs(); } @@ -247,7 +273,7 @@ class Node { return getDstMemoryAtPort(idx)->getData(); } - template + template T* getDstDataAtPortAs(size_t idx) const { return getDstMemoryAtPort(idx)->getDataAs(); } @@ -273,7 +299,8 @@ class Node { enum class ConstantType { Const, // Node is placed in a constant subgraph NoConst, // Node is placed in a non-constant subgraph - StrictNoConst, // Node produces non-constant subgraph: this type can't be changed and it does not depend on the parent nodes' ConstantType. + StrictNoConst, // Node produces non-constant subgraph: this type can't be changed and it does not depend on the + // parent nodes' ConstantType. }; ConstantType getConstantType() const; void updateConstantType(); @@ -290,10 +317,11 @@ class Node { bool isFusedWith(Type type) const; - virtual void addFusedNode(const NodePtr &fusingNode); + virtual void addFusedNode(const NodePtr& fusingNode); virtual void fuseInto(NodePtr& parentNode) { - // The graph supports fusing only of consecutive nodes and some graph logic requires to know through which input port a node was fused into parent one. + // The graph supports fusing only of consecutive nodes and some graph logic requires to know through which input + // port a node was fused into parent one. for (size_t i = 0; i < getParentEdges().size(); i++) { if (getParentEdgeAt(i)->getParent().get() == parentNode.get()) { setFusingPort(i); @@ -323,15 +351,15 @@ class Node { fusedWith.clear(); } - void mergeWith(const NodePtr &merge) { + void mergeWith(const NodePtr& merge) { mergedWith.push_back(merge); } - const std::vector &getMergeWith() { + const std::vector& getMergeWith() { return mergedWith; } - const std::vector &getFusedWith() { + const std::vector& getFusedWith() { return fusedWith; } @@ -343,17 +371,17 @@ class Node { this->fusingPort = fusingPort; } - const std::string &getName() const { + const std::string& getName() const { return name; } void addOriginalLayer(const std::string& layerName); - const std::string &getOriginalLayers() const { + const std::string& getOriginalLayers() const { return originalLayers; } - const std::string &getParallelDomain() const { + const std::string& getParallelDomain() const { return parallelDomain; } @@ -437,7 +465,9 @@ class Node { virtual std::string getPrimitiveDescriptorType() const; - PerfCount &PerfCounter() { return perfCounter; } + PerfCount& PerfCounter() { + return perfCounter; + } virtual void resolveInPlaceEdges(Edge::LOOK look = Edge::LOOK_BOTH); @@ -448,7 +478,7 @@ class Node { void updateShapes(); void updateDynamicParams(); void executeDynamic(dnnl::stream strm, int numaId = -1); - virtual void redefineOutputMemory(const std::vector &newShapes); + virtual void redefineOutputMemory(const std::vector& newShapes); void redefineOutputMemory(const size_t port, const VectorDims& new_output_shape); bool outputShapeDataDependency() const; @@ -475,7 +505,8 @@ class Node { /** * @brief Performs Node initialization based on graph context. 
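// Editorial sketch, not part of the patch: a standalone illustration of the
// ConstantType propagation that updateConstantType() above performs. The
// parent-based rule here is an assumption for illustration; what is taken from
// the plugin code is that StrictNoConst never changes, that an unchanged state
// stops the walk, and that a changed state is pushed to the children.
#include <vector>

enum class SketchConstantType { Const, NoConst, StrictNoConst };

struct SketchNode {
    SketchConstantType constant = SketchConstantType::NoConst;
    std::vector<SketchNode*> parents;
    std::vector<SketchNode*> children;

    void updateConstantType() {
        if (constant == SketchConstantType::StrictNoConst)
            return;  // this state is final and does not depend on the parents
        bool allParentsConst = !parents.empty();
        for (auto* p : parents)
            allParentsConst = allParentsConst && (p->constant == SketchConstantType::Const);
        const auto next = allParentsConst ? SketchConstantType::Const : SketchConstantType::NoConst;
        if (next == constant)
            return;  // state has not changed, no reason to revisit the children
        constant = next;
        for (auto* c : children)
            c->updateConstantType();
    }
};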
- * This is an auxiliary method that allows to use information not available in Node constructor (e.g. connection information with other nodes) + * This is an auxiliary method that allows to use information not available in Node constructor (e.g. connection + * information with other nodes) */ virtual void init() {} @@ -483,11 +514,11 @@ class Node { return execIndex; } - const std::string & getTypeStr() const { + const std::string& getTypeStr() const { return typeStr; } - void setTypeStr(const std::string &typeStr) { + void setTypeStr(const std::string& typeStr) { this->typeStr = typeStr; } @@ -499,11 +530,11 @@ class Node { return 1; } - const PerfCounters & perfCounters() const { + const PerfCounters& perfCounters() const { return profiling; } - PerfCounters & perfCounters() { + PerfCounters& perfCounters() { return profiling; } @@ -588,7 +619,7 @@ class Node { return false; } - bool canBePerformedAsScaleShift(const Node *parentNode = nullptr) const; + bool canBePerformedAsScaleShift(const Node* parentNode = nullptr) const; bool isDynamicNode() const { return isDynamic; @@ -613,14 +644,14 @@ class Node { } /** - * @brief Return scales and shift if nodes can be executed as ScaleShift, else raise exception - * If node has only scale or shift value, fill missing value with default values - * i.e. EltwiseAdd: fill shifts from constant, fill scales with default values = 1.0f - * @param parentNode - * node from which data comes - * @return pair of scales and shifts - */ - std::pair, std::vector> getScalesAndShifts(const Node *parentNode) const; + * @brief Return scales and shift if nodes can be executed as ScaleShift, else raise exception + * If node has only scale or shift value, fill missing value with default values + * i.e. EltwiseAdd: fill shifts from constant, fill scales with default values = 1.0f + * @param parentNode + * node from which data comes + * @return pair of scales and shifts + */ + std::pair, std::vector> getScalesAndShifts(const Node* parentNode) const; void fuseDQScales(const float* scaleData, const size_t scaleSize); const std::vector& getDQScales() const { @@ -631,8 +662,14 @@ class Node { * Seed node should call this routine and pass its post operations list as parameter. 
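// Editorial sketch, not part of the patch: the normalization that
// getScalesAndShifts() (declared above, defined earlier in this file) applies
// so that every supported eltwise can be expressed as y = x * scale + shift.
// Hypothetical standalone helper; only the four per-algorithm rules are taken
// from the plugin code.
#include <algorithm>
#include <utility>
#include <vector>

enum class SketchAlg { Add, Subtract, Multiply, Divide };

inline std::pair<std::vector<float>, std::vector<float>> normalizeScaleShift(SketchAlg alg,
                                                                             std::vector<float> scales,
                                                                             std::vector<float> shifts) {
    switch (alg) {
    case SketchAlg::Add:       // y = x + b  -> scale 1, shift b
        scales.assign(shifts.size(), 1.0f);
        break;
    case SketchAlg::Subtract:  // y = x - b  -> scale 1, shift -b
        scales.assign(shifts.size(), 1.0f);
        std::transform(shifts.begin(), shifts.end(), shifts.begin(), [](float s) { return -s; });
        break;
    case SketchAlg::Multiply:  // y = x * a  -> scale a, shift 0
        shifts.assign(scales.size(), 0.0f);
        break;
    case SketchAlg::Divide:    // y = x / a  -> scale 1/a, shift 0
        shifts.assign(scales.size(), 0.0f);
        std::transform(scales.begin(), scales.end(), scales.begin(), [](float s) { return 1.0f / s; });
        break;
    }
    return {scales, shifts};
}
// e.g. Subtract with shifts {2, 3} -> scales {1, 1}, shifts {-2, -3}
// e.g. Divide   with scales {2, 4} -> scales {0.5, 0.25}, shifts {0, 0}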
* @param ops List of fused post operations */ - virtual void appendPostOps(dnnl::post_ops& ops, const VectorDims& postOpDims, std::unordered_map& postOpsMem, const int channelAxis = 1); - virtual void appendPostOps(dnnl::post_ops& ops, const VectorDims& postOpDims, std::vector& postOpsMem, const int channelAxis = 1); + virtual void appendPostOps(dnnl::post_ops& ops, + const VectorDims& postOpDims, + std::unordered_map& postOpsMem, + const int channelAxis = 1); + virtual void appendPostOps(dnnl::post_ops& ops, + const VectorDims& postOpDims, + std::vector& postOpsMem, + const int channelAxis = 1); virtual bool canBeExecutedInInt8() const { OPENVINO_THROW_NOT_IMPLEMENTED("canBeExecutedInInt8 not implemented for node with type ", NameFromType(getType())); @@ -649,22 +686,24 @@ class Node { this->type = type; } - virtual PortDescBasePtr getConsistentInputDesc(const NodeConfig &config, size_t idx) const; - virtual PortDescBasePtr getConsistentOutputDesc(const NodeConfig &config, size_t idx) const; - virtual MemoryDescPtr getSrcMemDesc(const dnnl::primitive_desc &prim_desc, size_t idx) const; - virtual MemoryDescPtr getDstMemDesc(const dnnl::primitive_desc &prim_desc, size_t idx) const; + virtual PortDescBasePtr getConsistentInputDesc(const NodeConfig& config, size_t idx) const; + virtual PortDescBasePtr getConsistentOutputDesc(const NodeConfig& config, size_t idx) const; + virtual MemoryDescPtr getSrcMemDesc(const dnnl::primitive_desc& prim_desc, size_t idx) const; + virtual MemoryDescPtr getDstMemDesc(const dnnl::primitive_desc& prim_desc, size_t idx) const; - virtual AttrPtr initPrimitiveAttr() { return nullptr; } + virtual AttrPtr initPrimitiveAttr() { + return nullptr; + } - typedef std::function - GetPrimitiveMemoryFormatFunc; + typedef std::function + GetPrimitiveMemoryFormatFunc; std::vector internalBlobDesc; std::vector inputShapes; std::vector outputShapes; - std::vector fusedWith; - std::vector mergedWith; + std::vector fusedWith; + std::vector mergedWith; int curNumaNode = -1; @@ -672,11 +711,11 @@ class Node { virtual void toNumaNodeImpl(int numaID); std::string primitivesPriority; - std::vector customImplPriorities; - std::vector inputMemoryFormatsFilter; - std::vector outputMemoryFormatsFilter; + std::vector customImplPriorities; + std::vector inputMemoryFormatsFilter; + std::vector outputMemoryFormatsFilter; bool enforceBF16evenForGraphTail = false; - bool keepOriginalPrecision = false; + bool keepOriginalPrecision = false; std::string originalLayers; // contains names of the original layers separated by comma std::string parallelDomain; @@ -692,11 +731,7 @@ class Node { int selectedPrimitiveDescriptorIndex = -1; - enum class InPlaceType { - Unknown, - InPlace, - NoInPlace - }; + enum class InPlaceType { Unknown, InPlace, NoInPlace }; mutable InPlaceType inplace = InPlaceType::Unknown; ConstantType constant = ConstantType::NoConst; std::vector internalBlobs; @@ -718,7 +753,7 @@ class Node { void selectPreferPrimitiveDescriptorWithShape(const std::vector& priority, bool ignoreConstInputs); bool isOneDimShape(const ov::PartialShape& pshape); bool isReorderRequired(ov::intel_cpu::MemoryDescPtr desc1, ov::intel_cpu::MemoryDescPtr desc2); - bool isConfigDefined(const NodeConfig &config) const; + bool isConfigDefined(const NodeConfig& config) const; virtual bool canBeInPlace() const; /* returns default implementaion prioirity */ @@ -733,13 +768,15 @@ class Node { /** * @brief Auxiliary function to get node input precisions - * @return Vector of precisions based on information from node 
input edges. Return empty vector in case edges are not initialized yet. + * @return Vector of precisions based on information from node input edges. Return empty vector in case edges are + * not initialized yet. */ virtual std::vector getInputPrecisions() const; /** * @brief Auxiliary function to get node output precisions - * @return Vector of precisions based on information from node output edges. Return empty vector in case edges are not initialized yet. + * @return Vector of precisions based on information from node output edges. Return empty vector in case edges are + * not initialized yet. */ virtual std::vector getOutputPrecisions() const; @@ -788,7 +825,7 @@ class Node { MemoryPtr getScratchPadMem(const MemoryDescPtr& desc) { if (!scratchpadMem || !scratchpadMem->getDesc().isCompatible(*desc)) { - scratchpadMem = context->getScratchPad(curNumaNode)->createScratchPadMem(desc); + scratchpadMem = context->getScratchPad()->createScratchPadMem(desc); } return scratchpadMem; } @@ -803,13 +840,14 @@ class Node { // is still under control of strong references outside of cache. // privateWeightCache is for holding strong references to constant weight // copies of same content with different layouts. - std::shared_ptr> privateWeightCache - = std::make_shared>(); + std::shared_ptr> privateWeightCache = + std::make_shared>(); private: - static void removeEdge(const EdgePtr edge, std::vector &edges) { - edges.erase(std::remove_if(edges.begin(), edges.end(), - [&edge] (EdgeWeakPtr _edge) { + static void removeEdge(const EdgePtr edge, std::vector& edges) { + edges.erase(std::remove_if(edges.begin(), + edges.end(), + [&edge](EdgeWeakPtr _edge) { return _edge.lock() == edge; }), edges.end()); @@ -856,22 +894,20 @@ constexpr uint64_t PortMask(T... rest) { return util::bit::mask(rest...); } -class Node::NodesFactory : public openvino::cc::Factory& op, - const GraphContext::CPtr)> { +class Node::NodesFactory + : public openvino::cc::Factory& op, const GraphContext::CPtr)> { public: NodesFactory(); Node* create(const std::shared_ptr& op, const GraphContext::CPtr context); }; -template +template struct NodeImpl : public NodeType { - NodeImpl(const std::shared_ptr& op, const GraphContext::CPtr context) - : NodeType(op, context) { + NodeImpl(const std::shared_ptr& op, const GraphContext::CPtr context) : NodeType(op, context) { NodeType::perfCounters().template buildClassCounters(NameFromType(NodeType::getType())); } }; -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/adaptive_pooling.cpp b/src/plugins/intel_cpu/src/nodes/adaptive_pooling.cpp index f4e7f6217a8dec..f4af11b0f2362a 100644 --- a/src/plugins/intel_cpu/src/nodes/adaptive_pooling.cpp +++ b/src/plugins/intel_cpu/src/nodes/adaptive_pooling.cpp @@ -3,18 +3,21 @@ // #include "adaptive_pooling.h" -#include "openvino/core/parallel.hpp" -#include "cpu/x64/cpu_isa_traits.hpp" + #include -#include "onednn/dnnl.h" -#include "dnnl_extension_utils.h" -#include "selective_build.h" + #include #include #include -#include "utils/general_utils.h" #include + +#include "cpu/x64/cpu_isa_traits.hpp" +#include "dnnl_extension_utils.h" +#include "onednn/dnnl.h" +#include "openvino/core/parallel.hpp" +#include "selective_build.h" #include "shape_inference/custom/adaptive_pooling.hpp" +#include "utils/general_utils.h" using namespace dnnl; using namespace dnnl::impl::cpu::x64; @@ -23,7 +26,8 @@ namespace ov { namespace intel_cpu { namespace node { -bool 
AdaptivePooling::isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept { +bool AdaptivePooling::isSupportedOperation(const std::shared_ptr& op, + std::string& errorMessage) noexcept { try { if (one_of(op->get_type_info(), ov::op::v8::AdaptiveAvgPool::get_type_info_static())) { auto adaPool = std::dynamic_pointer_cast(op); @@ -51,9 +55,9 @@ AdaptivePooling::AdaptivePooling(const std::shared_ptr& op, const Grap : Node(op, context, AdaptivePoolingShapeInferFactory(op)) { std::string errorMessage; if (isSupportedOperation(op, errorMessage)) { - errorPrefix = "Adaptive Pooling layer with name '" + getName() + "' "; + errorPrefix = "Adaptive Pooling layer with name '" + getName() + "' "; } else { - OPENVINO_THROW_NOT_IMPLEMENTED(errorMessage); + OPENVINO_THROW_NOT_IMPLEMENTED(errorMessage); } if (one_of(op->get_type_info(), ov::op::v8::AdaptiveAvgPool::get_type_info_static())) { algorithm = Algorithm::AdaptivePoolingAvg; @@ -104,14 +108,14 @@ void AdaptivePooling::initSupportedPrimitiveDescriptors() { // we supports only fp32 currently precision = ov::element::f32; - std::vector dataFormats{ LayoutType::ncsp }; - const auto &inDims = getInputShapeAtPort(0).getDims(); + std::vector dataFormats{LayoutType::ncsp}; + const auto& inDims = getInputShapeAtPort(0).getDims(); if (inDims[1] != Shape::UNDEFINED_DIM && inDims[1] != 1) { dataFormats.push_back(LayoutType::nspc); dataFormats.push_back(LayoutType::nCsp16c); dataFormats.push_back(LayoutType::nCsp8c); } - for (const auto &df : dataFormats) { + for (const auto& df : dataFormats) { if (algorithm == Algorithm::AdaptivePoolingAvg) { addSupportedPrimDesc({{df, precision}, {LayoutType::ncsp, ov::element::i32}}, {{df, precision}}, @@ -134,9 +138,9 @@ void AdaptivePooling::execute(dnnl::stream strm) { if (!(inputPrec == dnnl_f32 && outputPrec == dnnl_f32)) OPENVINO_THROW(errorPrefix, "doesn't support demanded precisions"); - auto &srcMemory0 = getParentEdgeAt(0)->getMemory(); - auto &srcMemory1 = getParentEdgeAt(1)->getMemory(); - int *indexDst = nullptr; + auto& srcMemory0 = getParentEdgeAt(0)->getMemory(); + auto& srcMemory1 = getParentEdgeAt(1)->getMemory(); + int* indexDst = nullptr; if (algorithm == Algorithm::AdaptivePoolingMax) { indexDst = getDstDataAtPortAs(1); @@ -144,14 +148,15 @@ void AdaptivePooling::execute(dnnl::stream strm) { auto isPlainFmt = srcMemory0.getDesc().hasLayoutType(LayoutType::ncsp); auto isTailCFmt = srcMemory0.getDesc().hasLayoutType(LayoutType::nspc); - auto isBlkFmt = srcMemory0.getDesc().hasLayoutType(LayoutType::nCsp16c) || srcMemory0.getDesc().hasLayoutType(LayoutType::nCsp8c); + auto isBlkFmt = srcMemory0.getDesc().hasLayoutType(LayoutType::nCsp16c) || + srcMemory0.getDesc().hasLayoutType(LayoutType::nCsp8c); auto srcBlockDesc = srcMemory0.getDescWithType(); int blockSize = isBlkFmt ? srcBlockDesc->getBlockDims().back() : 1; - const auto *src = getSrcDataAtPortAs(0); - const auto *srcPooledSpatialShapes = getSrcDataAtPortAs(1); - auto *dst = getDstDataAtPortAs(0); + const auto* src = getSrcDataAtPortAs(0); + const auto* srcPooledSpatialShapes = getSrcDataAtPortAs(1); + auto* dst = getDstDataAtPortAs(0); if (static_cast(srcMemory1.getShape().getElementsCount()) != spatialDimsCount) OPENVINO_THROW(errorPrefix, @@ -175,8 +180,9 @@ void AdaptivePooling::execute(dnnl::stream strm) { const int iHW = IH * IW; const int oDHW = OD * OH * OW, oHW = OH * OW; - const int chPadding = blockSize * (isBlkFmt ? 
srcBlockDesc->getBlockDims()[1] : srcMemory0.getShape().getStaticDims()[1]); - const int blockCount = (isTailCFmt ? 1 : chPadding / blockSize); + const int chPadding = + blockSize * (isBlkFmt ? srcBlockDesc->getBlockDims()[1] : srcMemory0.getShape().getStaticDims()[1]); + const int blockCount = (isTailCFmt ? 1 : chPadding / blockSize); auto selectedPrimitiveDescriptor = getSelectedPrimitiveDescriptor(); if (!selectedPrimitiveDescriptor) OPENVINO_THROW(errorPrefix, "doesn't have primitive descriptors."); @@ -186,27 +192,26 @@ void AdaptivePooling::execute(dnnl::stream strm) { // unified strides array const size_t tailDimsOffset = (isTailCFmt ? -1 : 0); - const size_t inStrides[5] = { - srcStrides[0], - (isTailCFmt ? 1 : srcStrides[1]), - (spatialDimsCount == 3 ? srcStrides[2 + tailDimsOffset] : 0), - (spatialDimsCount >= 2 ? srcStrides[spatialDimsCount + tailDimsOffset] : 0), - srcStrides[spatialDimsCount + 1 + tailDimsOffset] }; - const size_t outStrides[5] = { - dstStrides[0], - (isTailCFmt ? 1 : dstStrides[1]), - (spatialDimsCount == 3 ? dstStrides[2 + tailDimsOffset] : 0), - (spatialDimsCount >= 2 ? dstStrides[spatialDimsCount + tailDimsOffset] : 0), - dstStrides[spatialDimsCount + 1 + tailDimsOffset] }; - - std::function pool; - auto poolMax = [&] (const float *srcData, float *dstData, int od, int oh, int ow, size_t spatIndOff) { + const size_t inStrides[5] = {srcStrides[0], + (isTailCFmt ? 1 : srcStrides[1]), + (spatialDimsCount == 3 ? srcStrides[2 + tailDimsOffset] : 0), + (spatialDimsCount >= 2 ? srcStrides[spatialDimsCount + tailDimsOffset] : 0), + srcStrides[spatialDimsCount + 1 + tailDimsOffset]}; + const size_t outStrides[5] = {dstStrides[0], + (isTailCFmt ? 1 : dstStrides[1]), + (spatialDimsCount == 3 ? dstStrides[2 + tailDimsOffset] : 0), + (spatialDimsCount >= 2 ? 
dstStrides[spatialDimsCount + tailDimsOffset] : 0), + dstStrides[spatialDimsCount + 1 + tailDimsOffset]}; + + std::function pool; + auto poolMax = [&](const float* srcData, float* dstData, int od, int oh, int ow, size_t spatIndOff) { size_t dStart, dEnd, hStart, hEnd, wStart, wEnd; setBinBorders(&dStart, &dEnd, od, ID, OD); setBinBorders(&hStart, &hEnd, oh, IH, OH); setBinBorders(&wStart, &wEnd, ow, IW, OW); - float res = srcData[dStart * inStrides[2] + hStart * inStrides[3] + wStart * inStrides[4]]; // initial max value - int resIndex = dStart * iHW + hStart * IW + wStart; // initial max index + float res = + srcData[dStart * inStrides[2] + hStart * inStrides[3] + wStart * inStrides[4]]; // initial max value + int resIndex = dStart * iHW + hStart * IW + wStart; // initial max index for (size_t pixD = dStart; pixD < dEnd; pixD++) { for (size_t pixH = hStart; pixH < hEnd; pixH++) { for (size_t pixW = wStart; pixW < wEnd; pixW++) { @@ -219,7 +224,7 @@ void AdaptivePooling::execute(dnnl::stream strm) { *dstData = res; indexDst[spatIndOff * oDHW + od * oHW + oh * OW + ow] = resIndex; }; - auto poolAvg = [&] (const float *srcData, float *dstData, int od, int oh, int ow, size_t spatIndOff) { + auto poolAvg = [&](const float* srcData, float* dstData, int od, int oh, int ow, size_t spatIndOff) { size_t dStart, dEnd, hStart, hEnd, wStart, wEnd; setBinBorders(&dStart, &dEnd, od, ID, OD); setBinBorders(&hStart, &hEnd, oh, IH, OH); @@ -245,11 +250,10 @@ void AdaptivePooling::execute(dnnl::stream strm) { pool = poolAvg; } - parallel_for5d(N, blockCount, OD, OH, OW, - [&](int n, int blkIdx, int od, int oh, int ow) { + parallel_for5d(N, blockCount, OD, OH, OW, [&](int n, int blkIdx, int od, int oh, int ow) { auto srcData = src + n * inStrides[0] + blkIdx * inStrides[1]; - auto dstData = dst + n * outStrides[0] + blkIdx * outStrides[1] + - od * outStrides[2] + oh * outStrides[3] + ow * outStrides[4]; + auto dstData = dst + n * outStrides[0] + blkIdx * outStrides[1] + od * outStrides[2] + oh * outStrides[3] + + ow * outStrides[4]; int cStart = 0, cEnd = C, inResidual = 0, outResidual = 0; if (!isTailCFmt) { cStart = blkIdx * blockSize; @@ -263,18 +267,23 @@ void AdaptivePooling::execute(dnnl::stream strm) { inResidual = outResidual = c % blockSize; } pool(srcData + inResidual, dstData + outResidual, od, oh, ow, n * C + c); - }}); + } + }); } bool AdaptivePooling::created() const { return getType() == Type::AdaptivePooling; } -inline void AdaptivePooling::setBinBorders(size_t *startPtr, size_t *endPtr, size_t idx, size_t inputLength, size_t outputLength) { +inline void AdaptivePooling::setBinBorders(size_t* startPtr, + size_t* endPtr, + size_t idx, + size_t inputLength, + size_t outputLength) { *(startPtr) = idx * inputLength / outputLength; *(endPtr) = ceil(static_cast((idx + 1) * inputLength) / outputLength); } -} // namespace node -} // namespace intel_cpu -} // namespace ov +} // namespace node +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/adaptive_pooling.h b/src/plugins/intel_cpu/src/nodes/adaptive_pooling.h index c88c9b5989aef9..04b628a5da5cee 100644 --- a/src/plugins/intel_cpu/src/nodes/adaptive_pooling.h +++ b/src/plugins/intel_cpu/src/nodes/adaptive_pooling.h @@ -5,9 +5,11 @@ #pragma once #include -#include + #include +#include #include + #include "dnnl_extension_utils.h" namespace ov { @@ -29,16 +31,18 @@ class AdaptivePooling : public Node { int spatialDimsCount; mutable std::vector spatialDimsValue = {}; ov::element::Type precision = ov::element::f32; 
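// Editorial sketch, not part of the patch: the bin-border arithmetic behind
// setBinBorders() above, start = floor(idx * inLen / outLen) and
// end = ceil((idx + 1) * inLen / outLen), as a standalone program with a
// worked example (the helper name and test values here are hypothetical).
#include <cstddef>
#include <cstdio>

static void sketchBinBorders(size_t* start, size_t* end, size_t idx, size_t inLen, size_t outLen) {
    *start = idx * inLen / outLen;                     // integer division acts as floor
    *end = ((idx + 1) * inLen + outLen - 1) / outLen;  // integer ceiling
}

int main() {
    // Pooling 5 input positions into 3 output bins yields [0,2), [1,4), [3,5):
    for (size_t idx = 0; idx < 3; ++idx) {
        size_t s = 0, e = 0;
        sketchBinBorders(&s, &e, idx, 5, 3);
        std::printf("bin %zu: [%zu, %zu)\n", idx, s, e);
    }
    return 0;
}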
- inline void setBinBorders(size_t *startPtr, size_t *endPtr, size_t idx, size_t inputLength, size_t outputLength); + inline void setBinBorders(size_t* startPtr, size_t* endPtr, size_t idx, size_t inputLength, size_t outputLength); std::string errorPrefix; protected: bool needShapeInfer() const override; - bool needPrepareParams() const override { return false; }; + bool needPrepareParams() const override { + return false; + }; void executeDynamicImpl(dnnl::stream strm) override; }; -} // namespace node -} // namespace intel_cpu -} // namespace ov +} // namespace node +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/batch_to_space.cpp b/src/plugins/intel_cpu/src/nodes/batch_to_space.cpp index 80713e90750e2d..50665c083ec930 100644 --- a/src/plugins/intel_cpu/src/nodes/batch_to_space.cpp +++ b/src/plugins/intel_cpu/src/nodes/batch_to_space.cpp @@ -2,14 +2,16 @@ // SPDX-License-Identifier: Apache-2.0 // -#include +#include "batch_to_space.h" + +#include #include +#include + #include "dnnl_types.h" +#include "nodes/common/blocked_desc_creator.h" #include "openvino/core/parallel.hpp" #include "selective_build.h" -#include "batch_to_space.h" -#include "nodes/common/blocked_desc_creator.h" -#include namespace ov { namespace intel_cpu { @@ -40,8 +42,8 @@ BatchToSpace::BatchToSpace(const std::shared_ptr& op, const GraphConte if (inputShapes.size() != 4 || outputShapes.size() != 1) OPENVINO_THROW(errorPrefix, " has incorrect number of input or output edges!"); - const auto &inDims = getInputShapeAtPort(0).getDims(); - const auto &outDims = getOutputShapeAtPort(0).getDims(); + const auto& inDims = getInputShapeAtPort(0).getDims(); + const auto& outDims = getOutputShapeAtPort(0).getDims(); if (inDims.size() < 4 || inDims.size() > 5) OPENVINO_THROW(errorPrefix, " has unsupported 'data' input rank: ", inDims.size()); if (inDims.size() != outDims.size()) @@ -52,7 +54,7 @@ void BatchToSpace::initSupportedPrimitiveDescriptors() { if (!supportedPrimitiveDescriptors.empty()) return; - const auto &inDims = getInputShapeAtPort(0).getDims(); + const auto& inDims = getInputShapeAtPort(0).getDims(); const auto precision = getOriginalInputPrecisionAtPort(0); const std::set supported_precision_sizes = {1, 2, 4, 8}; if (supported_precision_sizes.find(precision.size()) == supported_precision_sizes.end()) @@ -88,7 +90,7 @@ void BatchToSpace::initSupportedPrimitiveDescriptors() { } } -static std::vector getShape5D(const VectorDims &shape) { +static std::vector getShape5D(const VectorDims& shape) { std::vector shape5D(5, 1); for (int i = 0; i < 2; i++) { shape5D[i] = shape[i]; @@ -98,26 +100,26 @@ static std::vector getShape5D(const VectorDims &shape) { return shape5D; } -template +template void BatchToSpace::batchToSpaceKernel() { - const auto *srcData = getSrcDataAtPortAs(0); - const auto *blockShapesPtr = getSrcDataAtPortAs(1); + const auto* srcData = getSrcDataAtPortAs(0); + const auto* blockShapesPtr = getSrcDataAtPortAs(1); size_t dataRank = getSrcMemoryAtPort(0)->getShape().getRank(); blockShapeIn.clear(); for (size_t i = 0; i < dataRank; i++) { blockShapeIn.push_back(*(blockShapesPtr + i)); } - const auto *padsBeginPtr = getSrcDataAtPortAs(2); + const auto* padsBeginPtr = getSrcDataAtPortAs(2); cropsBeginIn.clear(); for (size_t i = 0; i < dataRank; i++) { cropsBeginIn.push_back(*(padsBeginPtr + i)); } - auto *dstData = getDstDataAtPortAs(0); + auto* dstData = getDstDataAtPortAs(0); - const auto &inDims = getParentEdgeAt(0)->getMemory().getStaticDims(); - const auto 
&outDims = getChildEdgeAt(0)->getMemory().getStaticDims(); + const auto& inDims = getParentEdgeAt(0)->getMemory().getStaticDims(); + const auto& outDims = getChildEdgeAt(0)->getMemory().getStaticDims(); auto srcDesc = getParentEdgeAt(0)->getMemory().getDescWithType(); @@ -193,8 +195,8 @@ void BatchToSpace::batchToSpaceKernel() { const int64_t addTmpOC = blocked ? 0lu : oAdd[1]; const int64_t addTmpOc = blocked ? oAdd[1] : 0lu; - const size_t firstI1 = i0 == 0 ? std::max(begin[1], indxStart[1]) : begin[1]; - const size_t lastI1 = i0 == indxEnd[0] ? std::min(indxEnd[1] + 1, finish[1]) : finish[1]; + const size_t firstI1 = i0 == 0 ? std::max(begin[1], indxStart[1]) : begin[1]; + const size_t lastI1 = i0 == indxEnd[0] ? std::min(indxEnd[1] + 1, finish[1]) : finish[1]; for (size_t i1 = firstI1; i1 < lastI1; ++i1) { const size_t block = i1 == finish[1] ? lastBlock : blockSize; @@ -216,12 +218,13 @@ void BatchToSpace::batchToSpaceKernel() { const size_t dstIdx4 = dstIdx3 + tmpOw * blockSize; for (size_t it = 0; it < itEnd + 1; ++it) { const size_t i5Begin = it == 0 ? 0 : (it * blockSize - 1 - oAdd[1]) / blockShape[1] + 1; - const size_t i5End = it == itEnd ? (block - 1) : ((it + 1) * blockSize - 1 - oAdd[1]) / blockShape[1]; + const size_t i5End = + it == itEnd ? (block - 1) : ((it + 1) * blockSize - 1 - oAdd[1]) / blockShape[1]; for (size_t i5 = i5Begin; i5 < i5End + 1; ++i5) { const int64_t tmpOc = i5 * blockShape[1] + addTmpOc; const size_t srcIdx5 = srcIdx4 + i5; const size_t dstIdx5 = - dstIdx4 + it * outSpatialStep * blockSize + (tmpOc - it * blockSize); + dstIdx4 + it * outSpatialStep * blockSize + (tmpOc - it * blockSize); dstData[dstIdx5] = srcData[srcIdx5]; } } @@ -239,13 +242,19 @@ void BatchToSpace::executeDynamicImpl(dnnl::stream strm) { void BatchToSpace::execute(dnnl::stream strm) { switch (getParentEdgeAt(0)->getMemory().getDesc().getPrecision().size()) { - case 1: batchToSpaceKernel::value_type>(); break; - case 2: batchToSpaceKernel::value_type>(); break; - case 4: batchToSpaceKernel::value_type>(); break; - default: - OPENVINO_THROW("BatchToSpace layer does not support precision '", - std::string(getParentEdgeAt(0)->getMemory().getDesc().getPrecision().get_type_name()), - "'"); + case 1: + batchToSpaceKernel::value_type>(); + break; + case 2: + batchToSpaceKernel::value_type>(); + break; + case 4: + batchToSpaceKernel::value_type>(); + break; + default: + OPENVINO_THROW("BatchToSpace layer does not support precision '", + std::string(getParentEdgeAt(0)->getMemory().getDesc().getPrecision().get_type_name()), + "'"); } } @@ -253,6 +262,6 @@ bool BatchToSpace::created() const { return getType() == Type::BatchToSpace; } -} // namespace node -} // namespace intel_cpu -} // namespace ov +} // namespace node +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/batch_to_space.h b/src/plugins/intel_cpu/src/nodes/batch_to_space.h index 1b583f74bd7905..5211e0c0b5dd10 100644 --- a/src/plugins/intel_cpu/src/nodes/batch_to_space.h +++ b/src/plugins/intel_cpu/src/nodes/batch_to_space.h @@ -14,7 +14,7 @@ class BatchToSpace : public Node { public: BatchToSpace(const std::shared_ptr& op, const GraphContext::CPtr context); - void getSupportedDescriptors() override {}; + void getSupportedDescriptors() override{}; void initSupportedPrimitiveDescriptors() override; // output shape can potentially be empty @@ -25,14 +25,18 @@ class BatchToSpace : public Node { void execute(dnnl::stream strm) override; bool created() const override; - bool needPrepareParams() const 
override { return false; }; - bool needShapeInfer() const override {return true;}; + bool needPrepareParams() const override { + return false; + }; + bool needShapeInfer() const override { + return true; + }; void executeDynamicImpl(dnnl::stream strm) override; static bool isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept; private: - template + template void batchToSpaceKernel(); private: @@ -42,6 +46,6 @@ class BatchToSpace : public Node { std::string errorPrefix; }; -} // namespace node -} // namespace intel_cpu -} // namespace ov +} // namespace node +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/bin_conv.cpp b/src/plugins/intel_cpu/src/nodes/bin_conv.cpp index d1e82235ba9bb1..336a370374a9f9 100644 --- a/src/plugins/intel_cpu/src/nodes/bin_conv.cpp +++ b/src/plugins/intel_cpu/src/nodes/bin_conv.cpp @@ -3,34 +3,35 @@ // #include "bin_conv.h" -#include "eltwise.h" -#include "fake_quantize.h" -#include "conv.h" + #include #include #include -#include "dnnl_types.h" + +#include "conv.h" +#include "cpu/x64/cpu_isa_traits.hpp" +#include "cpu/x64/injectors/jit_uni_depthwise_injector.hpp" +#include "cpu/x64/injectors/jit_uni_eltwise_injector.hpp" +#include "cpu/x64/jit_generator.hpp" #include "dnnl_extension_utils.h" +#include "dnnl_types.h" +#include "eltwise.h" +#include "fake_quantize.h" #include "openvino/core/parallel.hpp" -#include "cpu/x64/jit_generator.hpp" -#include "cpu/x64/injectors/jit_uni_eltwise_injector.hpp" -#include "cpu/x64/injectors/jit_uni_depthwise_injector.hpp" -#include "cpu/x64/cpu_isa_traits.hpp" -#include "utils/general_utils.h" #include "openvino/opsets/opset1.hpp" +#include "utils/general_utils.h" #include "utils/ngraph_utils.hpp" // WA for xbyak.h #ifdef _WIN32 -# ifndef _WINSOCKAPI_ -# define _WINSOCKAPI_ -# endif -# ifndef _WINSOCK2API_ -# define _WINSOCK2API_ -# endif +# ifndef _WINSOCKAPI_ +# define _WINSOCKAPI_ +# endif +# ifndef _WINSOCK2API_ +# define _WINSOCK2API_ +# endif #endif - using namespace dnnl; using namespace dnnl::impl; using namespace dnnl::impl::cpu; @@ -42,14 +43,17 @@ namespace ov { namespace intel_cpu { namespace node { #if defined(OPENVINO_ARCH_X86_64) -#define GET_OFF(field) offsetof(jit_bin_conv_call_args, field) +# define GET_OFF(field) offsetof(jit_bin_conv_call_args, field) template struct jit_uni_bin_conv_kernel_f32 : public jit_uni_bin_conv_kernel, public jit_generator { DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_uni_bin_conv_kernel_f32) - explicit jit_uni_bin_conv_kernel_f32(jit_bin_conv_params jcp, jit_dw_conv_params jcp_dw_conv, const dnnl_primitive_attr &attr) : - jit_uni_bin_conv_kernel(jcp, jcp_dw_conv, attr), jit_generator(jit_name()) {} + explicit jit_uni_bin_conv_kernel_f32(jit_bin_conv_params jcp, + jit_dw_conv_params jcp_dw_conv, + const dnnl_primitive_attr& attr) + : jit_uni_bin_conv_kernel(jcp, jcp_dw_conv, attr), + jit_generator(jit_name()) {} void create_ker() override { jit_generator::create_kernel(); @@ -57,16 +61,19 @@ struct jit_uni_bin_conv_kernel_f32 : public jit_uni_bin_conv_kernel, public jit_ } void generate() override { - const auto &p = attr_.post_ops_; + const auto& p = attr_.post_ops_; int end_idx = jcp_.with_dw_conv ? 
p.find(primitive_kind::convolution) : p.len(); for (int i = 0; i < end_idx; i++) { - auto &post_op = p.entry_[i]; + auto& post_op = p.entry_[i]; if (post_op.is_eltwise()) { - eltwise_injectors.push_back(std::make_shared>( - this, post_op.eltwise, true, eltwise_reserved, mask_post_op_reserved)); + eltwise_injectors.push_back(std::make_shared>(this, + post_op.eltwise, + true, + eltwise_reserved, + mask_post_op_reserved)); } else if (post_op.is_depthwise()) { - depthwise_injectors.push_back(std::make_shared>( - this, post_op, mask_post_op_reserved)); + depthwise_injectors.push_back( + std::make_shared>(this, post_op, mask_post_op_reserved)); } } @@ -80,7 +87,7 @@ struct jit_uni_bin_conv_kernel_f32 : public jit_uni_bin_conv_kernel, public jit_ mov(reg_oc_work, ptr[this->param1 + GET_OFF(oc_work)]); mov(reg_post_ops_data, ptr[this->param1 + GET_OFF(post_op_data)]); - mov(reg_oc_off, ptr[param1 + GET_OFF(oc_off)]); + mov(reg_oc_off, ptr[param1 + GET_OFF(oc_off)]); mov(reg_table, l_table); Label main_loop_label; @@ -98,14 +105,16 @@ struct jit_uni_bin_conv_kernel_f32 : public jit_uni_bin_conv_kernel, public jit_ int nbits = 8; - L(main_loop_label); { + L(main_loop_label); + { cmp(reg_oc_work, jcp_.oc_block); jl(tail_label, T_NEAR); solve_common(1, jcp_.oc_block); sub(reg_oc_work, jcp_.oc_block); - add(reg_kernel_base, jcp_.oc_block * jcp_.nb_ic * jcp_.kh * jcp_.kw * div_up(jcp_.ic_block, nbits) * jcp_.typesize_in); + add(reg_kernel_base, + jcp_.oc_block * jcp_.nb_ic * jcp_.kh * jcp_.kw * div_up(jcp_.ic_block, nbits) * jcp_.typesize_in); if (jcp_.with_dw_conv) { add(reg_output_base, jcp_.oc_block * jcp_dw_conv_.kh * jcp_.ow * jcp_.typesize_out); @@ -137,8 +146,7 @@ struct jit_uni_bin_conv_kernel_f32 : public jit_uni_bin_conv_kernel, public jit_ } private: - using Vmm = typename conditional3::type; + using Vmm = typename conditional3::type; using Ymm = const Xbyak::Ymm; using reg8_t = const Xbyak::Reg8; @@ -212,100 +220,108 @@ struct jit_uni_bin_conv_kernel_f32 : public jit_uni_bin_conv_kernel, public jit_ nstl::vector>> eltwise_injectors; nstl::vector>> depthwise_injectors; - void cvt2ps(dnnl::memory::data_type type_in, Vmm vmm_in, const Xbyak::Operand &op, bool scalar_load) { + void cvt2ps(dnnl::memory::data_type type_in, Vmm vmm_in, const Xbyak::Operand& op, bool scalar_load) { Xmm xmm_in = Xmm(vmm_in.getIdx()); switch (type_in) { - case memory::data_type::f32: - case memory::data_type::s32: - if (scalar_load) { - mov(reg_tmp_32, op); - uni_vmovq(xmm_in, reg_tmp_64); - } else { - uni_vmovups(vmm_in, op); - } - break; - case memory::data_type::s8: - if (scalar_load) { - movsx(reg_tmp_32, op); - uni_vmovq(xmm_in, reg_tmp_64); - } else { - uni_vpmovsxbd(vmm_in, op); - } - break; - case memory::data_type::u8: - if (scalar_load) { - movzx(reg_tmp_32, op); - uni_vmovq(xmm_in, reg_tmp_64); - } else { - uni_vpmovzxbd(vmm_in, op); - } - break; - default: assert(!"unsupported data type"); + case memory::data_type::f32: + case memory::data_type::s32: + if (scalar_load) { + mov(reg_tmp_32, op); + uni_vmovq(xmm_in, reg_tmp_64); + } else { + uni_vmovups(vmm_in, op); + } + break; + case memory::data_type::s8: + if (scalar_load) { + movsx(reg_tmp_32, op); + uni_vmovq(xmm_in, reg_tmp_64); + } else { + uni_vpmovsxbd(vmm_in, op); + } + break; + case memory::data_type::u8: + if (scalar_load) { + movzx(reg_tmp_32, op); + uni_vmovq(xmm_in, reg_tmp_64); + } else { + uni_vpmovzxbd(vmm_in, op); + } + break; + default: + assert(!"unsupported data type"); } if (type_in != data_type::f32) uni_vcvtdq2ps(vmm_in, vmm_in); } - 
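cvt2ps above loads one lane (or a full vector) of the source data type and widens it to packed single precision: f32/s32 values are loaded directly, s8 is sign-extended (movsx / uni_vpmovsxbd), u8 is zero-extended (movzx / uni_vpmovzxbd), and the integer cases finish with uni_vcvtdq2ps. A scalar analogue of that conversion, with a plain enum standing in for dnnl::memory::data_type:

#include <cassert>
#include <cstdint>
#include <cstring>

enum class DataType { f32, s32, s8, u8 };  // stand-in for dnnl::memory::data_type

// Scalar analogue of cvt2ps: read one element of the given type from raw
// memory and return it as float (sign-/zero-extending the 8-bit types,
// converting the 32-bit integer, passing f32 through unchanged).
static float loadAsFloat(const void* src, DataType type) {
    switch (type) {
    case DataType::f32: {
        float v;
        std::memcpy(&v, src, sizeof(v));
        return v;
    }
    case DataType::s32: {
        int32_t v;
        std::memcpy(&v, src, sizeof(v));
        return static_cast<float>(v);
    }
    case DataType::s8:
        return static_cast<float>(*static_cast<const int8_t*>(src));   // movsx path
    case DataType::u8:
        return static_cast<float>(*static_cast<const uint8_t*>(src));  // movzx path
    }
    assert(!"unsupported data type");
    return 0.f;
}

int main() {
    int8_t s = -5;
    return loadAsFloat(&s, DataType::s8) == -5.f ? 0 : 1;
}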
void store_dst(const Xbyak::Address &op, Vmm vmm_dst, bool scalar_store) { + void store_dst(const Xbyak::Address& op, Vmm vmm_dst, bool scalar_store) { Ymm ymm_dst = Ymm(vmm_dst.getIdx()); Xmm xmm_dst = Xmm(vmm_dst.getIdx()); switch (jcp_.dst_dt) { - case memory::data_type::f32: - case memory::data_type::s32: - if (scalar_store) { - movq(reg_tmp_64, xmm_dst); - mov(op, reg_tmp_32); - } else { - uni_vmovups(op, vmm_dst); - } - break; - case memory::data_type::s8: - uni_vpackssdw(vmm_dst, vmm_dst, vmm_dst); + case memory::data_type::f32: + case memory::data_type::s32: + if (scalar_store) { + movq(reg_tmp_64, xmm_dst); + mov(op, reg_tmp_32); + } else { + uni_vmovups(op, vmm_dst); + } + break; + case memory::data_type::s8: + uni_vpackssdw(vmm_dst, vmm_dst, vmm_dst); - if (isa != x64::sse41 && !scalar_store) - vpermq(ymm_dst, ymm_dst, 0x08); + if (isa != x64::sse41 && !scalar_store) + vpermq(ymm_dst, ymm_dst, 0x08); - uni_vpacksswb(xmm_dst, xmm_dst, xmm_dst); + uni_vpacksswb(xmm_dst, xmm_dst, xmm_dst); - if (scalar_store) { - movq(reg_tmp_64, xmm_dst); - mov(op, reg_tmp_8); - } else { - if (isa != x64::sse41) - vmovq(op, xmm_dst); - else - movd(op, xmm_dst); - } - break; - case memory::data_type::u8: - case memory::data_type::bin: - uni_vpackusdw(vmm_dst, vmm_dst, vmm_dst); + if (scalar_store) { + movq(reg_tmp_64, xmm_dst); + mov(op, reg_tmp_8); + } else { + if (isa != x64::sse41) + vmovq(op, xmm_dst); + else + movd(op, xmm_dst); + } + break; + case memory::data_type::u8: + case memory::data_type::bin: + uni_vpackusdw(vmm_dst, vmm_dst, vmm_dst); - if (isa != x64::sse41 && !scalar_store) - vpermq(ymm_dst, ymm_dst, 0x08); + if (isa != x64::sse41 && !scalar_store) + vpermq(ymm_dst, ymm_dst, 0x08); - uni_vpackuswb(xmm_dst, xmm_dst, xmm_dst); + uni_vpackuswb(xmm_dst, xmm_dst, xmm_dst); - if (scalar_store) { - movq(reg_tmp_64, xmm_dst); - mov(op, reg_tmp_8); - } else { - if (isa != x64::sse41) - vmovq(op, xmm_dst); - else - movd(op, xmm_dst); - } + if (scalar_store) { + movq(reg_tmp_64, xmm_dst); + mov(op, reg_tmp_8); + } else { + if (isa != x64::sse41) + vmovq(op, xmm_dst); + else + movd(op, xmm_dst); + } - break; - default: - assert(!"unknown dst_dt"); + break; + default: + assert(!"unknown dst_dt"); } } - void apply_filter(int ur_w, int pad_l, int pad_r, int oc_blocks, int oc_step, int ic_blocks, bool last_icb, bool h_padded) { + void apply_filter(int ur_w, + int pad_l, + int pad_r, + int oc_blocks, + int oc_step, + int ic_blocks, + bool last_icb, + bool h_padded) { int kw = jcp_.kw; int kh = jcp_.kh; int stride_w = jcp_.stride_w; @@ -318,15 +334,16 @@ struct jit_uni_bin_conv_kernel_f32 : public jit_uni_bin_conv_kernel, public jit_ for (int ki = 0; ki < kw; ki++) { int jj_start = nstl::max(0, div_up(pad_l - ki * dilate_w, stride_w)); - int jj_end = ur_w - nstl::max(0, div_up(ki*dilate_w+pad_r-(kw-1)*dilate_w, stride_w)); + int jj_end = ur_w - nstl::max(0, div_up(ki * dilate_w + pad_r - (kw - 1) * dilate_w, stride_w)); int _start = (!jcp_.exclude_pad) ? 0 : jj_start; int _end = (!jcp_.exclude_pad) ? 
ur_w : jj_end; for (int ifm2 = 0; ifm2 < ic_blocks; ifm2++) { for (int jj = _start; jj < _end; jj++) { - int inp_off = ((ki*dilate_w + jj*stride_w - pad_l)*div_up(jcp_.ic, nbits) + - ifm2 * div_up(ic_blk, nbits)) * jcp_.typesize_in; + int inp_off = ((ki * dilate_w + jj * stride_w - pad_l) * div_up(jcp_.ic, nbits) + + ifm2 * div_up(ic_blk, nbits)) * + jcp_.typesize_in; if (h_padded || jj < jj_start || jj >= jj_end) { uni_vmovups(vmm_src, ptr[reg_table + 8 * vlen]); @@ -336,10 +353,11 @@ struct jit_uni_bin_conv_kernel_f32 : public jit_uni_bin_conv_kernel, public jit_ for (int r = 0; r < repeats; r++) { for (int ii = 0; ii < oc_blocks; ii++) { - int ker_off = (ifm2 * kh * kw * div_up(ic_blk, nbits) * oc_blk - + ii * jcp_.nb_ic * div_up(ic_blk, nbits) * kh * kw * oc_blk - + ki * div_up(ic_blk, nbits) * oc_blk - + r * div_up(ic_blk, nbits) * (oc_blk / 2)) * jcp_.typesize_in; + int ker_off = + (ifm2 * kh * kw * div_up(ic_blk, nbits) * oc_blk + + ii * jcp_.nb_ic * div_up(ic_blk, nbits) * kh * kw * oc_blk + + ki * div_up(ic_blk, nbits) * oc_blk + r * div_up(ic_blk, nbits) * (oc_blk / 2)) * + jcp_.typesize_in; uni_vmovups(vmm_tmp, ptr[aux1_reg_kernel + ker_off]); @@ -350,7 +368,8 @@ struct jit_uni_bin_conv_kernel_f32 : public jit_uni_bin_conv_kernel, public jit_ if (mayiuse(x64::avx512_vpopcnt)) { vpopcntd(vmm_tmp, vmm_tmp); uni_vpaddd(Vmm(1 + r * jcp_.ur_w * jcp_.nb_oc_blocking + ur_w * ii + jj), - Vmm(1 + r * jcp_.ur_w * jcp_.nb_oc_blocking + ur_w * ii + jj), vmm_tmp); + Vmm(1 + r * jcp_.ur_w * jcp_.nb_oc_blocking + ur_w * ii + jj), + vmm_tmp); } else { if (isa == x64::sse41) { movups(vmm_tmp1, vmm_tmp); @@ -375,12 +394,15 @@ struct jit_uni_bin_conv_kernel_f32 : public jit_uni_bin_conv_kernel, public jit_ } if (mayiuse(avx512_core_vnni)) { - vpdpbusd(Vmm(1 + r * jcp_.ur_w * jcp_.nb_oc_blocking + ur_w * ii + jj), vmm_tmp, vmm_one_u8); + vpdpbusd(Vmm(1 + r * jcp_.ur_w * jcp_.nb_oc_blocking + ur_w * ii + jj), + vmm_tmp, + vmm_one_u8); } else { uni_vpmaddubsw(vmm_tmp, vmm_tmp, vmm_one_u8); uni_vpmaddwd(vmm_tmp, vmm_tmp, vmm_one_s16); uni_vpaddd(Vmm(1 + r * jcp_.ur_w * jcp_.nb_oc_blocking + ur_w * ii + jj), - Vmm(1 + r * jcp_.ur_w * jcp_.nb_oc_blocking + ur_w * ii + jj), vmm_tmp); + Vmm(1 + r * jcp_.ur_w * jcp_.nb_oc_blocking + ur_w * ii + jj), + vmm_tmp); } } } @@ -431,22 +453,22 @@ struct jit_uni_bin_conv_kernel_f32 : public jit_uni_bin_conv_kernel, public jit_ int nbits = 8; const int inp_mult = dilate_h * div_up(jcp_.ic, nbits); - Label t_overflow_label, no_t_overflow_label, - b_overflow_label, no_b_overflow_label; + Label t_overflow_label, no_t_overflow_label, b_overflow_label, no_b_overflow_label; mov(aux_reg_input, reg_input); mov(aux_reg_kernel, reg_kernel_base); - uni_vmovups(vmm_lookup, ptr[reg_table + 0 * vlen]); - uni_vmovups(vmm_mask, ptr[reg_table + 1 * vlen]); - uni_vmovups(vmm_one_u8, ptr[reg_table + 5 * vlen]); + uni_vmovups(vmm_lookup, ptr[reg_table + 0 * vlen]); + uni_vmovups(vmm_mask, ptr[reg_table + 1 * vlen]); + uni_vmovups(vmm_one_u8, ptr[reg_table + 5 * vlen]); uni_vmovups(vmm_one_s16, ptr[reg_table + 6 * vlen]); if (!jcp_.exclude_pad) { - mov(reg_overflow, ptr[param1 + GET_OFF(t_overflow)]); + mov(reg_overflow, ptr[param1 + GET_OFF(t_overflow)]); cmp(reg_overflow, 0); je(no_t_overflow_label, T_NEAR); - L(t_overflow_label); { + L(t_overflow_label); + { oh_step_unroll_kw(ur_w, pad_l, pad_r, oc_blocks, oc_step, true); add(aux_reg_kernel, jcp_.typesize_in * kw * jcp_.oc_block * div_up(jcp_.ic_block, nbits)); @@ -459,8 +481,8 @@ struct jit_uni_bin_conv_kernel_f32 : public 
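The vmm_lookup / vmm_mask constants used in this accumulation come from prepare_table() further down: the first four dwords (0x02010100, 0x03020201, 0x03020201, 0x04030302) form a 16-entry bit-count table for 4-bit nibbles (0 1 1 2, 1 2 2 3, 1 2 2 3, 2 3 3 4) and 0x0f0f0f0f is the nibble mask. When vpopcntd is not available, the kernel counts the set bits of each XORed byte with two such lookups; a scalar sketch of the same trick:

#include <cassert>
#include <cstdint>

// Per-nibble set-bit counts, the same values packed into the first four
// dwords of the kernel's constant table (0x02010100, 0x03020201, ...).
static const uint8_t kNibblePopcount[16] = {0, 1, 1, 2, 1, 2, 2, 3,
                                            1, 2, 2, 3, 2, 3, 3, 4};

// Count the set bits of one byte via two 4-bit table lookups; this is the
// scalar equivalent of the shuffle-based path used when vpopcntd is missing.
static int popcountByte(uint8_t b) {
    return kNibblePopcount[b & 0x0f] + kNibblePopcount[b >> 4];
}

int main() {
    assert(popcountByte(0xff) == 8);
    assert(popcountByte(0xa5) == 4);
    return 0;
}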
jit_uni_bin_conv_kernel, public jit_ Label skip_kh_loop; mov(reg_kh, ptr[this->param1 + GET_OFF(kh_padding)]); - if (!jcp_.exclude_pad || (jcp_.exclude_pad && - (jcp_.kh - 1) * (jcp_.dilate_h + 1) < nstl::max(jcp_.t_pad, jcp_.b_pad))) { + if (!jcp_.exclude_pad || + (jcp_.exclude_pad && (jcp_.kh - 1) * (jcp_.dilate_h + 1) < nstl::max(jcp_.t_pad, jcp_.b_pad))) { cmp(reg_kh, 0); je(skip_kh_loop, T_NEAR); } @@ -481,10 +503,11 @@ struct jit_uni_bin_conv_kernel_f32 : public jit_uni_bin_conv_kernel, public jit_ L(skip_kh_loop); if (!jcp_.exclude_pad) { - mov(reg_overflow, ptr[param1 + GET_OFF(b_overflow)]); + mov(reg_overflow, ptr[param1 + GET_OFF(b_overflow)]); cmp(reg_overflow, 0); je(no_b_overflow_label, T_NEAR); - L(b_overflow_label); { + L(b_overflow_label); + { oh_step_unroll_kw(ur_w, pad_l, pad_r, oc_blocks, oc_step, true); add(aux_reg_kernel, jcp_.typesize_in * kw * jcp_.oc_block * div_up(jcp_.ic_block, nbits)); @@ -515,7 +538,7 @@ struct jit_uni_bin_conv_kernel_f32 : public jit_uni_bin_conv_kernel, public jit_ kmovw(ktail_mask, reg_tmp_32); } - const auto &p = attr_.post_ops_; + const auto& p = attr_.post_ops_; for (int r = 0; r < repeats; r++) { int tail_size = isa == x64::sse41 ? nstl::min(jcp_.oc_block / 2, oc_step - r * jcp_.oc_block / 2) : oc_step; bool is_scalar_store = isa == x64::sse41 ? tail_size < jcp_.oc_block / 2 : tail_size < jcp_.oc_block; @@ -524,15 +547,17 @@ struct jit_uni_bin_conv_kernel_f32 : public jit_uni_bin_conv_kernel, public jit_ if (jcp_.exclude_pad) { mov(reg_tmp_32, jcp_.ic); - imul(reg_tmp_32, ptr[param1 + GET_OFF(kh_padding)]); + imul(reg_tmp_32, ptr[param1 + GET_OFF(kh_padding)]); for (int jj = 0; jj < ur_w; jj++) kw_padding[jj] = 0; for (int ki = 0; ki < jcp_.kw; ki++) { int jj_start = nstl::max(0, div_up(pad_l - ki * (jcp_.dilate_w + 1), jcp_.stride_w)); - int jj_end = ur_w - nstl::max(0, div_up(ki * (jcp_.dilate_w + 1) + pad_r - - (jcp_.kw - 1) * (jcp_.dilate_w + 1), jcp_.stride_w)); + int jj_end = + ur_w - nstl::max(0, + div_up(ki * (jcp_.dilate_w + 1) + pad_r - (jcp_.kw - 1) * (jcp_.dilate_w + 1), + jcp_.stride_w)); for (int jj = jj_start; jj < jj_end; jj++) { kw_padding[jj]++; } @@ -552,8 +577,11 @@ struct jit_uni_bin_conv_kernel_f32 : public jit_uni_bin_conv_kernel, public jit_ } for (int ii = 0; ii < oc_blocks; ii++) { - uni_vcvtdq2ps(Vmm(1 + r * jcp_.ur_w * jcp_.nb_oc_blocking + ur_w * ii + jj), Vmm(1 + r * jcp_.ur_w * jcp_.nb_oc_blocking + ur_w * ii + jj)); - uni_vfmadd213ps(Vmm(1 + r * jcp_.ur_w * jcp_.nb_oc_blocking + ur_w * ii + jj), vmm_scale, vmm_shift); + uni_vcvtdq2ps(Vmm(1 + r * jcp_.ur_w * jcp_.nb_oc_blocking + ur_w * ii + jj), + Vmm(1 + r * jcp_.ur_w * jcp_.nb_oc_blocking + ur_w * ii + jj)); + uni_vfmadd213ps(Vmm(1 + r * jcp_.ur_w * jcp_.nb_oc_blocking + ur_w * ii + jj), + vmm_scale, + vmm_shift); } } @@ -580,7 +608,9 @@ struct jit_uni_bin_conv_kernel_f32 : public jit_uni_bin_conv_kernel, public jit_ for (int ii = 0; ii < oc_blocks; ii++) { depthwise_injectors[depthwise_inj_idx]->compute_vector_range(start_idx + ur_w * ii, - start_idx + ur_w * ii + ur_w, reg_d_weights, reg_d_weights); + start_idx + ur_w * ii + ur_w, + reg_d_weights, + reg_d_weights); add(reg_d_weights, jcp_.oc_block * sizeof(float)); } @@ -596,7 +626,7 @@ struct jit_uni_bin_conv_kernel_f32 : public jit_uni_bin_conv_kernel, public jit_ if (is_scalar_store) { if (isa == x64::avx512_core) { - int o_off = jj * jcp_.oc * jcp_.ngroups; + int o_off = jj * jcp_.oc * jcp_.ngroups; Vmm vmm_in = vmm_sum | ktail_mask | T_z; @@ -604,7 +634,7 @@ struct jit_uni_bin_conv_kernel_f32 : 
public jit_uni_bin_conv_kernel, public jit_ uni_vaddps(vmm_dst, vmm_dst, vmm_sum); } else { for (int oc = 0; oc < tail_size; oc++) { - int o_off = jj * jcp_.oc * jcp_.ngroups + r * (jcp_.oc_block / 2) + oc; + int o_off = jj * jcp_.oc * jcp_.ngroups + r * (jcp_.oc_block / 2) + oc; uni_vpxor(vmm_sum, vmm_sum, vmm_sum); cvt2ps(jcp_.dst_dt, vmm_sum, ptr[reg_output + o_off * jcp_.typesize_out], true); @@ -621,7 +651,8 @@ struct jit_uni_bin_conv_kernel_f32 : public jit_uni_bin_conv_kernel, public jit_ } } } else { - size_t o_off = ii * jcp_.oc_block + jj * jcp_.oc * jcp_.ngroups + r * (jcp_.oc_block / 2); + size_t o_off = + ii * jcp_.oc_block + jj * jcp_.oc * jcp_.ngroups + r * (jcp_.oc_block / 2); cvt2ps(jcp_.dst_dt, vmm_sum, ptr[reg_output + o_off * jcp_.typesize_out], false); uni_vaddps(vmm_dst, vmm_dst, vmm_sum); @@ -649,10 +680,15 @@ struct jit_uni_bin_conv_kernel_f32 : public jit_uni_bin_conv_kernel, public jit_ for (int ii = 0; ii < oc_blocks; ii++) { for (int jj = 0; jj < ur_w; jj++) { for (int r = 0; r < repeats; r++) { - int tail_size = isa == x64::sse41 ? nstl::min(jcp_.oc_block / 2, oc_step - r * jcp_.oc_block / 2) : oc_step; + int tail_size = + isa == x64::sse41 ? nstl::min(jcp_.oc_block / 2, oc_step - r * jcp_.oc_block / 2) : oc_step; mov(reg_b_mask, (1 << tail_size) - 1); - uni_vmovups(vmm_thr, ptr[reg_b_weights + (ii * jcp_.oc_block + r * (jcp_.oc_block / 2)) * sizeof(float)]); - uni_vmovups(vmm_out_mask, ptr[reg_b_out_mask + (ii * jcp_.oc_block + r * (jcp_.oc_block / 2)) * sizeof(float)]); + uni_vmovups( + vmm_thr, + ptr[reg_b_weights + (ii * jcp_.oc_block + r * (jcp_.oc_block / 2)) * sizeof(float)]); + uni_vmovups( + vmm_out_mask, + ptr[reg_b_out_mask + (ii * jcp_.oc_block + r * (jcp_.oc_block / 2)) * sizeof(float)]); Vmm vmm_dst = Vmm(1 + r * jcp_.ur_w * jcp_.nb_oc_blocking + ur_w * ii + jj); @@ -693,7 +729,8 @@ struct jit_uni_bin_conv_kernel_f32 : public jit_uni_bin_conv_kernel, public jit_ } } else { for (int r = 0; r < repeats; r++) { - int tail_size = isa == x64::sse41 ? nstl::min(jcp_.oc_block / 2, oc_step - r * jcp_.oc_block / 2) : oc_step; + int tail_size = + isa == x64::sse41 ? nstl::min(jcp_.oc_block / 2, oc_step - r * jcp_.oc_block / 2) : oc_step; bool is_scalar_store = isa == x64::sse41 ? tail_size < jcp_.oc_block / 2 : tail_size < jcp_.oc_block; if (is_scalar_store) { for (int jj = 0; jj < ur_w; jj++) { @@ -735,7 +772,7 @@ struct jit_uni_bin_conv_kernel_f32 : public jit_uni_bin_conv_kernel, public jit_ size_t o_off; if (jcp_.with_dw_conv) - o_off = ((size_t) ii * jcp_dw_conv_.kh * jcp_.ow + jj) * jcp_.oc_block + + o_off = ((size_t)ii * jcp_dw_conv_.kh * jcp_.ow + jj) * jcp_.oc_block + r * (jcp_.oc_block / 2); else o_off = ii * jcp_.oc_block + jj * jcp_.oc * jcp_.ngroups + r * (jcp_.oc_block / 2); @@ -759,14 +796,15 @@ struct jit_uni_bin_conv_kernel_f32 : public jit_uni_bin_conv_kernel, public jit_ int nbits = 8; const int inp_mult = div_up(jcp_.ic, nbits); - const int out_mult = jcp_.with_dw_conv ? jcp_.oc_block : jcp_.with_binarization ? div_up(jcp_.oc, nbits) : jcp_.oc; + const int out_mult = jcp_.with_dw_conv ? jcp_.oc_block + : jcp_.with_binarization ? 
div_up(jcp_.oc, nbits) + : jcp_.oc; int l_pad = jcp_.l_pad; - int r_pad = nstl::max(0, (jcp_.ow - 1) * str_w + (kw - 1) * dilate_w - - (iw + l_pad - 1)); - int r_pad1 = (ur_w * n_oi - 1) * str_w + (kw - 1) * dilate_w - - (iw + l_pad - 1); - if (r_pad1 > 0) n_oi--; + int r_pad = nstl::max(0, (jcp_.ow - 1) * str_w + (kw - 1) * dilate_w - (iw + l_pad - 1)); + int r_pad1 = (ur_w * n_oi - 1) * str_w + (kw - 1) * dilate_w - (iw + l_pad - 1); + if (r_pad1 > 0) + n_oi--; mov(reg_input, reg_input_base); mov(reg_output, reg_output_base); @@ -779,9 +817,9 @@ struct jit_uni_bin_conv_kernel_f32 : public jit_uni_bin_conv_kernel, public jit_ if (l_pad > 0) { n_oi--; if (n_oi < 0 && r_pad1 > 0) - width_blk_step(ur_w, l_pad, r_pad1, oc_blocks, oc_step); // "lrpad" + width_blk_step(ur_w, l_pad, r_pad1, oc_blocks, oc_step); // "lrpad" else - width_blk_step(ur_w, l_pad, 0, oc_blocks, oc_step); // "lpad" + width_blk_step(ur_w, l_pad, 0, oc_blocks, oc_step); // "lpad" add(reg_input, jcp_.typesize_in * (ur_w * str_w - l_pad) * inp_mult); add(reg_output, jcp_.typesize_out * ur_w * out_mult); } @@ -792,7 +830,7 @@ struct jit_uni_bin_conv_kernel_f32 : public jit_uni_bin_conv_kernel, public jit_ if (n_oi > 0) { L(ow_loop_label); - width_blk_step(ur_w, 0, 0, oc_blocks, oc_step); // "middle" + width_blk_step(ur_w, 0, 0, oc_blocks, oc_step); // "middle" add(reg_input, jcp_.typesize_in * ur_w * str_w * inp_mult); add(reg_output, jcp_.typesize_out * ur_w * out_mult); @@ -801,14 +839,14 @@ struct jit_uni_bin_conv_kernel_f32 : public jit_uni_bin_conv_kernel, public jit_ jl(ow_loop_label, T_NEAR); } - if (r_pad1 > 0 && n_oi >=0) { - width_blk_step(ur_w, 0, r_pad1, oc_blocks, oc_step); // "rpad" + if (r_pad1 > 0 && n_oi >= 0) { + width_blk_step(ur_w, 0, r_pad1, oc_blocks, oc_step); // "rpad" add(reg_input, jcp_.typesize_in * ur_w * str_w * inp_mult); add(reg_output, jcp_.typesize_out * ur_w * out_mult); } if (ur_w_tail != 0) - width_blk_step(ur_w_tail, 0, r_pad, oc_blocks, oc_step); // "tail" + width_blk_step(ur_w_tail, 0, r_pad, oc_blocks, oc_step); // "tail" pop(reg_oc_off); pop(reg_oc_work); @@ -817,17 +855,15 @@ struct jit_uni_bin_conv_kernel_f32 : public jit_uni_bin_conv_kernel, public jit_ } void prepare_table() { - const unsigned int cvals[] = { - 0x02010100, // 0 1 1 2 - 0x03020201, // 1 2 2 3 - 0x03020201, // 1 2 2 3 - 0x04030302, // 2 3 3 4 - 0x0f0f0f0f, - 0x000000ff, - 0xc0000000, // -2.0f - 0x01010101, - 0x00010001 - }; + const unsigned int cvals[] = {0x02010100, // 0 1 1 2 + 0x03020201, // 1 2 2 3 + 0x03020201, // 1 2 2 3 + 0x04030302, // 2 3 3 4 + 0x0f0f0f0f, + 0x000000ff, + 0xc0000000, // -2.0f + 0x01010101, + 0x00010001}; size_t simd_w = vlen / sizeof(int32_t); @@ -876,7 +912,8 @@ struct jit_uni_bin_conv_kernel_f32 : public jit_uni_bin_conv_kernel, public jit_ } }; #endif -bool BinaryConvolution::isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept { +bool BinaryConvolution::isSupportedOperation(const std::shared_ptr& op, + std::string& errorMessage) noexcept { try { if (isDynamicNgraphNode(op)) { errorMessage = "Doesn't support op with dynamic shapes"; @@ -934,7 +971,7 @@ void BinaryConvolution::getSupportedDescriptors() { withSum = false; size_t expectedInputEdgesNum = 2; for (size_t i = 0; i < fusedWith.size(); i++) { - auto *eltwiseNode = dynamic_cast(fusedWith[i].get()); + auto* eltwiseNode = dynamic_cast(fusedWith[i].get()); if (eltwiseNode && eltwiseNode->isSpecialConvolutionAddFusing()) { withSum = true; expectedInputEdgesNum++; @@ -979,22 +1016,30 @@ void 
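The width_blk_step scheduling above splits an output row into an optional left-padded block, n_oi unpadded "middle" blocks, an optional right-padded block, and a tail of ur_w_tail pixels, based on how much right padding the last pixel of each block would consume. A small sketch of that arithmetic; here n_oi is assumed to be ow / ur_w (the excerpt does not show where it is computed) and dilate_w is taken as the effective step between kernel taps:

#include <algorithm>
#include <cstdio>

// Right-padding arithmetic mirroring the ow-loop scheduling: r_pad is the
// padding consumed by the very last output pixel, r_pad1 the padding consumed
// by the last pixel of the last full ur_w block.
struct RowSplit {
    int n_oi;    // number of "middle" blocks that need no padded variant
    int r_pad;   // right padding seen by the tail block
    int r_pad1;  // right padding seen by the last full block
};

static RowSplit splitOutputRow(int ow, int iw, int kw, int stride_w, int dilate_w, int l_pad, int ur_w) {
    RowSplit s{};
    s.n_oi = ow / ur_w;  // assumption: full blocks of ur_w output pixels
    s.r_pad = std::max(0, (ow - 1) * stride_w + (kw - 1) * dilate_w - (iw + l_pad - 1));
    s.r_pad1 = (ur_w * s.n_oi - 1) * stride_w + (kw - 1) * dilate_w - (iw + l_pad - 1);
    if (s.r_pad1 > 0)
        s.n_oi--;  // the last full block must use the padded ("rpad") variant
    return s;
}

int main() {
    // 3x3 kernel, stride 1, no dilation (effective step 1), 1 pixel of left padding.
    RowSplit s = splitOutputRow(/*ow=*/16, /*iw=*/16, /*kw=*/3, /*stride_w=*/1, /*dilate_w=*/1, /*l_pad=*/1, /*ur_w=*/4);
    std::printf("n_oi=%d r_pad=%d r_pad1=%d\n", s.n_oi, s.r_pad, s.r_pad1);
}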
BinaryConvolution::initSupportedPrimitiveDescriptors() { if (implType != impl_desc_type::ref) { // optimzed implementation -// auto weiFormat = implType == impl_desc_type::jit_avx512 ? memory::format_tag::OhIw16o32i : memory::format_tag::OhIw8o32i; + // auto weiFormat = implType == impl_desc_type::jit_avx512 ? memory::format_tag::OhIw16o32i : + // memory::format_tag::OhIw8o32i; - //activation + // activation auto nspcCreator = BlockedDescCreator::getCommonCreators().at(LayoutType::nspc); config.inConfs[0].setMemDesc(nspcCreator->createSharedDesc(ov::element::u1, getInputShapeAtPort(0))); - //weights - size_t weiFirstDimBlockSize = implType == impl_desc_type::jit_avx512 ? 16 : 8; //memory::format_tag::OIhw16o32i : memory::format_tag::OIhw8o32i; + // weights + size_t weiFirstDimBlockSize = implType == impl_desc_type::jit_avx512 + ? 16 + : 8; // memory::format_tag::OIhw16o32i : memory::format_tag::OIhw8o32i; auto weiDims = getInputShapeAtPort(1).getStaticDims(); - std::vector weiBlockDims = {div_up(weiDims[0], weiFirstDimBlockSize), div_up(weiDims[1], 32), - weiDims[2], weiDims[3], weiFirstDimBlockSize, 32}; + std::vector weiBlockDims = {div_up(weiDims[0], weiFirstDimBlockSize), + div_up(weiDims[1], 32), + weiDims[2], + weiDims[3], + weiFirstDimBlockSize, + 32}; std::vector weiOrder = {0, 1, 2, 3, 0, 1}; - config.inConfs[1].setMemDesc(std::make_shared(ov::element::u1, Shape(weiDims), weiBlockDims, weiOrder)); + config.inConfs[1].setMemDesc( + std::make_shared(ov::element::u1, Shape(weiDims), weiBlockDims, weiOrder)); - //result + // result auto outputPrecision = withBinarization ? ov::element::u1 : ov::element::f32; config.outConfs[0].setMemDesc(nspcCreator->createSharedDesc(outputPrecision, getOutputShapeAtPort(0))); if (withSum) { @@ -1056,14 +1101,15 @@ void BinaryConvolution::createPrimitive() { jcp.with_dw_conv = false; jcp.with_binarization = withBinarization; - const auto &p = (*attr.get()).post_ops_; + const auto& p = (*attr.get()).post_ops_; jcp.with_sum = p.find(primitive_kind::sum) != -1; jcp.with_binarization = p.find(primitive_kind::binarization) != -1; int simd_w = implType == impl_desc_type::jit_avx512 ? 16 : 8; jcp.ur_w = implType == impl_desc_type::jit_avx512 ? 4 : 2; - if (jcp.ow < jcp.ur_w) jcp.ur_w = jcp.ow; + if (jcp.ow < jcp.ur_w) + jcp.ur_w = jcp.ow; jcp.ur_w_tail = jcp.ow % jcp.ur_w; jcp.ic_block = 32; @@ -1073,7 +1119,10 @@ void BinaryConvolution::createPrimitive() { jcp.oc_block = simd_w; jcp.nb_oc = div_up(jcp.oc, jcp.oc_block); - jcp.nb_oc_blocking = nstl::min(implType == impl_desc_type::jit_sse42 ? 2 : implType == impl_desc_type::jit_avx2 ? 4 : 6, jcp.nb_oc); + jcp.nb_oc_blocking = nstl::min(implType == impl_desc_type::jit_sse42 ? 2 + : implType == impl_desc_type::jit_avx2 ? 4 + : 6, + jcp.nb_oc); auto srcPrecision = getParentEdgeAt(0)->getMemory().getDesc().getPrecision(); auto dstPrecision = getChildEdgeAt(0)->getMemory().getDesc().getPrecision(); @@ -1082,11 +1131,13 @@ void BinaryConvolution::createPrimitive() { jcp.typesize_in = srcPrecision == ov::element::u1 ? 1 : srcPrecision.size(); jcp.typesize_out = dstPrecision == ov::element::u1 ? 
1 : dstPrecision.size(); - int r_pad_no_tail = nstl::max(0, (jcp.ow - jcp.ur_w_tail - 1) * jcp.stride_w - + (jcp.kw - 1) * (jcp.dilate_w + 1) - (jcp.iw + jcp.l_pad - 1)); + int r_pad_no_tail = nstl::max( + 0, + (jcp.ow - jcp.ur_w_tail - 1) * jcp.stride_w + (jcp.kw - 1) * (jcp.dilate_w + 1) - (jcp.iw + jcp.l_pad - 1)); - bool args_ok = (jcp.l_pad <= jcp.ur_w) && (r_pad_no_tail <= jcp.ur_w) && - IMPLICATION(jcp.kw > 7, (jcp.t_pad == 0 && jcp.l_pad == 0) || (jcp.stride_w == 1 && jcp.stride_h == 1)); + bool args_ok = + (jcp.l_pad <= jcp.ur_w) && (r_pad_no_tail <= jcp.ur_w) && + IMPLICATION(jcp.kw > 7, (jcp.t_pad == 0 && jcp.l_pad == 0) || (jcp.stride_w == 1 && jcp.stride_h == 1)); if (!args_ok) OPENVINO_THROW("BinaryConvolution with name '", getName(), "' has unsupported parameters"); #if defined(OPENVINO_ARCH_X86_64) @@ -1122,12 +1173,12 @@ bool BinaryConvolution::canFuse(const NodePtr& node) const { } } -void BinaryConvolution::setPostOps(dnnl::primitive_attr &attr) { +void BinaryConvolution::setPostOps(dnnl::primitive_attr& attr) { dnnl::post_ops ops; postOpsDataPtrs.clear(); - for (auto &node : fusedWith) { - auto* eltwiseNode = dynamic_cast(node.get()); + for (auto& node : fusedWith) { + auto* eltwiseNode = dynamic_cast(node.get()); if (eltwiseNode) { if (eltwiseNode->isSpecialConvolutionAddFusing()) { ops.append_sum(1.0); @@ -1138,7 +1189,7 @@ void BinaryConvolution::setPostOps(dnnl::primitive_attr &attr) { continue; } - auto* fakeQuantizeNode = dynamic_cast(node.get()); + auto* fakeQuantizeNode = dynamic_cast(node.get()); if (fakeQuantizeNode) { fakeQuantizeNode->appendPostOps(ops, getOutputShapeAtPort(0).getStaticDims(), postOpsDataPtrs); continue; @@ -1154,9 +1205,13 @@ void BinaryConvolution::setPostOps(dnnl::primitive_attr &attr) { attr.set_post_ops(ops); } -void BinaryConvolution::executeOptimized(const uint8_t* src, const uint8_t* weights, uint8_t* dst, - const std::vector& s_str, const std::vector& w_str, const std::vector& d_str) { - auto dst_f32 = reinterpret_cast(dst); +void BinaryConvolution::executeOptimized(const uint8_t* src, + const uint8_t* weights, + uint8_t* dst, + const std::vector& s_str, + const std::vector& w_str, + const std::vector& d_str) { + auto dst_f32 = reinterpret_cast(dst); const int MB = jcp.mb; @@ -1170,26 +1225,28 @@ void BinaryConvolution::executeOptimized(const uint8_t* src, const uint8_t* weig auto par_conv = jit_bin_conv_call_args(); const int ij = oh * jcp.stride_h; - const int i_t_overflow = nstl::min(jcp.kh, div_up(nstl::max(0, jcp.t_pad - ij), (jcp.dilate_h+1))); - const int i_b_overflow = nstl::min(jcp.kh, div_up(nstl::max(jcp.ih, ij + (jcp.kh-1) * (jcp.dilate_h+1) - - jcp.t_pad+1) - jcp.ih, (jcp.dilate_h + 1))); + const int i_t_overflow = nstl::min(jcp.kh, div_up(nstl::max(0, jcp.t_pad - ij), (jcp.dilate_h + 1))); + const int i_b_overflow = + nstl::min(jcp.kh, + div_up(nstl::max(jcp.ih, ij + (jcp.kh - 1) * (jcp.dilate_h + 1) - jcp.t_pad + 1) - jcp.ih, + (jcp.dilate_h + 1))); const size_t _oc = g * jcp.nb_oc + ocb; const size_t _ic = g * jcp.nb_ic; const int ih = nstl::max(ij - jcp.t_pad + i_t_overflow * (jcp.dilate_h + 1), 0); - par_conv.src = &src[(n * s_str[0] + _ic*jcp.ic_block * s_str[1] + ih * s_str[2]) / nbits]; + par_conv.src = &src[(n * s_str[0] + _ic * jcp.ic_block * s_str[1] + ih * s_str[2]) / nbits]; if (jcp.with_binarization) { - par_conv.dst = &dst[(n * d_str[0] + _oc*jcp.oc_block * d_str[1] + oh * d_str[2]) / nbits]; + par_conv.dst = &dst[(n * d_str[0] + _oc * jcp.oc_block * d_str[1] + oh * d_str[2]) / nbits]; } else { - 
par_conv.dst = &dst_f32[n * d_str[0] + _oc*jcp.oc_block * d_str[1] + oh * d_str[2]]; + par_conv.dst = &dst_f32[n * d_str[0] + _oc * jcp.oc_block * d_str[1] + oh * d_str[2]]; } const int wh = jcp.exclude_pad ? i_t_overflow : 0; par_conv.filt = &weights[(ocb * w_str[0] + wh * w_str[2]) / nbits]; - par_conv.oc_work = nstl::min((ocb + ocb_num) * jcp.oc_block, jcp.oc) - ocb*jcp.oc_block; + par_conv.oc_work = nstl::min((ocb + ocb_num) * jcp.oc_block, jcp.oc) - ocb * jcp.oc_block; par_conv.kw_padding = 0; const int kh_padding = jcp.kh - i_t_overflow - i_b_overflow; @@ -1204,9 +1261,13 @@ void BinaryConvolution::executeOptimized(const uint8_t* src, const uint8_t* weig }); } -void BinaryConvolution::executeReference(const uint8_t* src, const uint8_t* weights, uint8_t* dst, - const std::vector& s_str, const std::vector& w_str, const std::vector& d_str) { - auto dst_fp = reinterpret_cast(dst); +void BinaryConvolution::executeReference(const uint8_t* src, + const uint8_t* weights, + uint8_t* dst, + const std::vector& s_str, + const std::vector& w_str, + const std::vector& d_str) { + auto dst_fp = reinterpret_cast(dst); const bool with_groups = jcp.ngroups > 1; @@ -1240,7 +1301,7 @@ void BinaryConvolution::executeReference(const uint8_t* src, const uint8_t* weig return (uint8_t)((val >> bit) & 0x0001); }; - auto ker = [=](int32_t &d, int g, int mb, int oc, int oh, int ow) { + auto ker = [=](int32_t& d, int g, int mb, int oc, int oh, int ow) { for (int ic = 0; ic < IC; ++ic) { for (int kh = 0; kh < KH; ++kh) { for (int kw = 0; kw < KW; ++kw) { @@ -1259,14 +1320,14 @@ void BinaryConvolution::executeReference(const uint8_t* src, const uint8_t* weig if (pad_value == 0) continue; else - s = pad_value == 1.0f ? (uint8_t) 1 : (uint8_t) 0; + s = pad_value == 1.0f ? (uint8_t)1 : (uint8_t)0; } else { - s = extract_bit(src[iidx / nbits], (uint8_t) (iidx % nbits)); + s = extract_bit(src[iidx / nbits], (uint8_t)(iidx % nbits)); } - uint8_t w = extract_bit(weights[widx / nbits], (uint8_t) (widx % nbits)); + uint8_t w = extract_bit(weights[widx / nbits], (uint8_t)(widx % nbits)); - d += (int32_t) (s ^ w); + d += (int32_t)(s ^ w); } } } @@ -1280,13 +1341,11 @@ void BinaryConvolution::executeReference(const uint8_t* src, const uint8_t* weig if (pad_value == 0.0f) { const int i_left_overflow = nstl::max(0, (padL - ow * KSW)); const int i_right_overflow = nstl::max(IW, (ow * KSW + (KW - 1) * (KDW + 1) - padL + 1)) - IW; - const int kw_padding = - KW - div_up(i_left_overflow, (KDW + 1)) - div_up(i_right_overflow, (KDW + 1)); + const int kw_padding = KW - div_up(i_left_overflow, (KDW + 1)) - div_up(i_right_overflow, (KDW + 1)); const int i_top_overflow = nstl::max(0, (padT - oh * KSH)); const int i_bottom_overflow = nstl::max(IH, (oh * KSH + (KH - 1) * (KDH + 1) - padT + 1)) - IH; - const int kh_padding = - KH - div_up(i_top_overflow, (KDH + 1)) - div_up(i_bottom_overflow, (KDH + 1)); + const int kh_padding = KH - div_up(i_top_overflow, (KDH + 1)) - div_up(i_bottom_overflow, (KDH + 1)); base_value = IC * kh_padding * kw_padding; } else { @@ -1295,7 +1354,7 @@ void BinaryConvolution::executeReference(const uint8_t* src, const uint8_t* weig float a_fp = base_value - static_cast(2 * a); - dst_fp[mb * d_str[0] + (g*OC + oc) * d_str[1] + oh * d_str[2] + ow * d_str[3]] = a_fp; + dst_fp[mb * d_str[0] + (g * OC + oc) * d_str[1] + oh * d_str[2] + ow * d_str[3]] = a_fp; }); } @@ -1342,6 +1401,6 @@ bool BinaryConvolution::created() const { return getType() == Type::BinaryConvolution; } -} // namespace node -} // namespace intel_cpu 
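executeReference() above works on bit-packed (u1) activations and weights where a set bit stands for +1 and a clear bit for -1: it XORs the two bit streams, accumulates the mismatch count (d += s ^ w) and maps that back to a real-valued dot product as base_value - 2*a. A scalar sketch of the identity for a window fully inside the input, so the base is simply the number of accumulated taps:

#include <cassert>
#include <cstdint>

// Read bit i of a bit-packed (u1) buffer, as in the extract_bit lambda.
static uint8_t extractBit(const uint8_t* packed, size_t i) {
    return (packed[i / 8] >> (i % 8)) & 0x01;
}

// Binary "dot product" of n bit-packed elements: a set bit stands for +1, a
// clear bit for -1, so sum_i x_i * w_i == n - 2 * (number of mismatching bits).
static int binaryDot(const uint8_t* x, const uint8_t* w, size_t n) {
    int mismatches = 0;
    for (size_t i = 0; i < n; ++i)
        mismatches += static_cast<int>(extractBit(x, i) ^ extractBit(w, i));  // d += (s ^ w)
    return static_cast<int>(n) - 2 * mismatches;  // a_fp = base_value - 2 * a
}

int main() {
    const uint8_t x[] = {0b10110010};
    const uint8_t w[] = {0b10100110};
    // The bits differ at positions 2 and 4 -> 8 - 2*2 = 4.
    assert(binaryDot(x, w, 8) == 4);
    return 0;
}

The padded cases in the reference code only change the base: with pad_value == 0 the padded taps are skipped and the base shrinks to IC * kh_padding * kw_padding, while a non-zero pad value keeps all taps in the count.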
-} // namespace ov +} // namespace node +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/bin_conv.h b/src/plugins/intel_cpu/src/nodes/bin_conv.h index 86b5cb41b2bf6d..661e075b680ec7 100644 --- a/src/plugins/intel_cpu/src/nodes/bin_conv.h +++ b/src/plugins/intel_cpu/src/nodes/bin_conv.h @@ -39,9 +39,9 @@ struct jit_dw_conv_params { }; struct jit_bin_conv_call_args { - const void *src; - const void *dst; - const void *filt; + const void* src; + const void* dst; + const void* filt; size_t kh_padding; size_t kw_padding; size_t oc_work; @@ -52,15 +52,20 @@ struct jit_bin_conv_call_args { }; struct jit_uni_bin_conv_kernel { - void (*ker_)(const jit_bin_conv_call_args *); + void (*ker_)(const jit_bin_conv_call_args*); - void operator()(const jit_bin_conv_call_args *args) { + void operator()(const jit_bin_conv_call_args* args) { assert(ker_); ker_(args); } - explicit jit_uni_bin_conv_kernel(jit_bin_conv_params jcp, jit_dw_conv_params jcp_dw_conv, const dnnl_primitive_attr &attr) : - ker_(nullptr), jcp_(jcp), jcp_dw_conv_(jcp_dw_conv), attr_(attr) {} + explicit jit_uni_bin_conv_kernel(jit_bin_conv_params jcp, + jit_dw_conv_params jcp_dw_conv, + const dnnl_primitive_attr& attr) + : ker_(nullptr), + jcp_(jcp), + jcp_dw_conv_(jcp_dw_conv), + attr_(attr) {} virtual ~jit_uni_bin_conv_kernel() {} virtual void create_ker() = 0; @@ -68,7 +73,7 @@ struct jit_uni_bin_conv_kernel { jit_bin_conv_params jcp_; jit_dw_conv_params jcp_dw_conv_; - const dnnl_primitive_attr &attr_; + const dnnl_primitive_attr& attr_; }; class BinaryConvolution : public Node { @@ -83,12 +88,14 @@ class BinaryConvolution : public Node { bool canBeInPlace() const override { return false; } - void setPostOps(dnnl::primitive_attr &attr); + void setPostOps(dnnl::primitive_attr& attr); bool canFuse(const NodePtr& node) const override; static bool isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept; - impl_desc_type getImplType() { return implType; } + impl_desc_type getImplType() { + return implType; + } private: bool withSum = false; @@ -110,14 +117,22 @@ class BinaryConvolution : public Node { impl_desc_type implType = impl_desc_type::ref; - void executeOptimized(const uint8_t* src, const uint8_t* weights, uint8_t* dst, - const std::vector& s_str, const std::vector& w_str, const std::vector& d_str); - void executeReference(const uint8_t* src, const uint8_t* weights, uint8_t* dst, - const std::vector& s_str, const std::vector& w_str, const std::vector& d_str); + void executeOptimized(const uint8_t* src, + const uint8_t* weights, + uint8_t* dst, + const std::vector& s_str, + const std::vector& w_str, + const std::vector& d_str); + void executeReference(const uint8_t* src, + const uint8_t* weights, + uint8_t* dst, + const std::vector& s_str, + const std::vector& w_str, + const std::vector& d_str); std::string errorPrefix; }; -} // namespace node -} // namespace intel_cpu -} // namespace ov +} // namespace node +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/broadcast.cpp b/src/plugins/intel_cpu/src/nodes/broadcast.cpp index c88803e07de601..646e186922b397 100644 --- a/src/plugins/intel_cpu/src/nodes/broadcast.cpp +++ b/src/plugins/intel_cpu/src/nodes/broadcast.cpp @@ -2,15 +2,18 @@ // SPDX-License-Identifier: Apache-2.0 // -#include +#include "broadcast.h" + +#include + #include +#include + +#include "common/cpu_memcpy.h" #include "dnnl_types.h" -#include "openvino/core/parallel.hpp" -#include -#include "broadcast.h" #include 
"nodes/common/blocked_desc_creator.h" +#include "openvino/core/parallel.hpp" #include "openvino/opsets/opset1.hpp" -#include "common/cpu_memcpy.h" #include "utils/ngraph_utils.hpp" namespace ov { @@ -24,19 +27,20 @@ bool Broadcast::isSupportedOperation(const std::shared_ptr& op, return false; } if (!one_of(ov::as_type_ptr(op)->get_broadcast_spec().m_type, - ov::op::AutoBroadcastType::NUMPY, ov::op::AutoBroadcastType::EXPLICIT)) { + ov::op::AutoBroadcastType::NUMPY, + ov::op::AutoBroadcastType::EXPLICIT)) { errorMessage = "Only NUMPY and EXPLICIT broadcast types are supported."; return false; } if (op->get_input_partial_shape(TARGET_SHAPE_IDX).is_dynamic() || - (op->get_input_size() > AXES_MAPPING_IDX && op->get_input_partial_shape(AXES_MAPPING_IDX).is_dynamic())) { + (op->get_input_size() > AXES_MAPPING_IDX && op->get_input_partial_shape(AXES_MAPPING_IDX).is_dynamic())) { errorMessage = "Only static shapes are supported for target shape and axes mapping inputs."; return false; } if (!isDynamicNgraphNode(op) && - (!ov::is_type(op->get_input_node_ptr(TARGET_SHAPE_IDX)) || - (op->get_input_size() > AXES_MAPPING_IDX && - !ov::is_type(op->get_input_node_ptr(AXES_MAPPING_IDX))))) { + (!ov::is_type(op->get_input_node_ptr(TARGET_SHAPE_IDX)) || + (op->get_input_size() > AXES_MAPPING_IDX && + !ov::is_type(op->get_input_node_ptr(AXES_MAPPING_IDX))))) { errorMessage = "Only constant target shapes and axis mapping inputs are supported for static shapes."; return false; } @@ -72,12 +76,13 @@ Broadcast::Broadcast(const std::shared_ptr& op, const GraphContext::CP if (ov::is_type(op->get_input_node_ptr(TARGET_SHAPE_IDX))) { constMap[TARGET_SHAPE_IDX] = true; - targetShape = (ov::as_type(op->get_input_node_ptr(TARGET_SHAPE_IDX)))->get_vector(); + targetShape = + (ov::as_type(op->get_input_node_ptr(TARGET_SHAPE_IDX)))->get_vector(); } - if (broadcastType == EXPLICIT && - ov::is_type(op->get_input_node_ptr(AXES_MAPPING_IDX))) { + if (broadcastType == EXPLICIT && ov::is_type(op->get_input_node_ptr(AXES_MAPPING_IDX))) { constMap[AXES_MAPPING_IDX] = true; - axesMapping = ov::as_type(op->get_input_node_ptr(AXES_MAPPING_IDX))->get_vector(); + axesMapping = + ov::as_type(op->get_input_node_ptr(AXES_MAPPING_IDX))->get_vector(); } } @@ -126,7 +131,8 @@ void Broadcast::prepareParams() { repeats.assign(targetShape.begin(), targetShape.end()); const auto ndims = repeats.size(); - auto srcBlockedDims = getParentEdgeAt(INPUT_DATA_IDX)->getMemory().getDescWithType()->getBlockDims(); + auto srcBlockedDims = + getParentEdgeAt(INPUT_DATA_IDX)->getMemory().getDescWithType()->getBlockDims(); auto dstBlockedDims = getChildEdgeAt(0)->getMemory().getDescWithType()->getBlockDims(); if (broadcastType == NUMPY) { @@ -227,8 +233,8 @@ void Broadcast::plainExecute(dnnl::stream strm) { } const size_t workAmountDst = dstStrides[0] * dstDims[0]; - const auto *srcData = getSrcDataAtPortAs(INPUT_DATA_IDX); - auto *dstData = getDstDataAtPortAs(0); + const auto* srcData = getSrcDataAtPortAs(INPUT_DATA_IDX); + auto* dstData = getDstDataAtPortAs(0); parallel_nt(0, [&](const int ithr, const int nthr) { size_t i = 0lu, srcIdx = 0lu, start = 0lu, end = 0lu; @@ -246,7 +252,8 @@ void Broadcast::plainExecute(dnnl::stream strm) { for (int j = dataDstRank - 1; j >= 0; j--) { counters[j] = (counters[j] + 1) % dstDims[j]; - if (counters[j] != 0) break; + if (counters[j] != 0) + break; } } }); @@ -256,6 +263,6 @@ bool Broadcast::created() const { return getType() == Type::Broadcast; } -} // namespace node -} // namespace intel_cpu -} // namespace ov +} // 
namespace node +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/broadcast.h b/src/plugins/intel_cpu/src/nodes/broadcast.h index 1435314ee08776..df9ad4614e311d 100644 --- a/src/plugins/intel_cpu/src/nodes/broadcast.h +++ b/src/plugins/intel_cpu/src/nodes/broadcast.h @@ -4,12 +4,12 @@ #pragma once -#include "common/tile_broadcast_utils.h" - #include #include #include +#include "common/tile_broadcast_utils.h" + namespace ov { namespace intel_cpu { namespace node { @@ -35,10 +35,7 @@ class Broadcast : public Node, public TileBroadcastCommon { private: void plainExecute(dnnl::stream strm); - enum AutoBroadcastType { - NUMPY, - EXPLICIT - }; + enum AutoBroadcastType { NUMPY, EXPLICIT }; AutoBroadcastType broadcastType = NUMPY; static constexpr size_t INPUT_DATA_IDX = 0; @@ -51,6 +48,6 @@ class Broadcast : public Node, public TileBroadcastCommon { std::string errorPrefix; }; -} // namespace node -} // namespace intel_cpu -} // namespace ov +} // namespace node +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/bucketize.cpp b/src/plugins/intel_cpu/src/nodes/bucketize.cpp index a71255c0d531e4..cfa4bb031501ef 100644 --- a/src/plugins/intel_cpu/src/nodes/bucketize.cpp +++ b/src/plugins/intel_cpu/src/nodes/bucketize.cpp @@ -2,14 +2,15 @@ // SPDX-License-Identifier: Apache-2.0 // +#include "bucketize.h" + +#include +#include #include #include -#include -#include "openvino/opsets/opset3.hpp" -#include #include "openvino/core/parallel.hpp" -#include "bucketize.h" +#include "openvino/opsets/opset3.hpp" namespace ov { namespace intel_cpu { @@ -70,16 +71,15 @@ void Bucketize::initSupportedPrimitiveDescriptors() { output_precision = ov::element::i32; } - addSupportedPrimDesc({{LayoutType::ncsp, input_precision}, - {LayoutType::ncsp, boundaries_precision}}, + addSupportedPrimDesc({{LayoutType::ncsp, input_precision}, {LayoutType::ncsp, boundaries_precision}}, {{LayoutType::ncsp, output_precision}}, impl_desc_type::ref_any); } inline constexpr uint32_t getElementsMask(ov::element::Type precision1, - ov::element::Type precision2, - ov::element::Type precision3 = ov::element::undefined, - ov::element::Type precision4 = ov::element::undefined) { + ov::element::Type precision2, + ov::element::Type precision3 = ov::element::undefined, + ov::element::Type precision4 = ov::element::undefined) { return static_cast(ov::element::Type_t(precision1)) | (static_cast(ov::element::Type_t(precision2)) << 8) | (static_cast(ov::element::Type_t(precision3)) << 16) | @@ -90,98 +90,98 @@ void Bucketize::execute(dnnl::stream strm) { auto precision_mask = getElementsMask(input_precision, boundaries_precision, output_precision); switch (precision_mask) { - case getElementsMask(ov::element::f32, ov::element::f32, ov::element::i32): - bucketize::value_type, - element_type_traits::value_type, - element_type_traits::value_type>(); - break; - case getElementsMask(ov::element::f32, ov::element::f32, ov::element::i64): - bucketize::value_type, - element_type_traits::value_type, - element_type_traits::value_type>(); - break; - case getElementsMask(ov::element::f32, ov::element::i32, ov::element::i32): - bucketize::value_type, - element_type_traits::value_type, - element_type_traits::value_type>(); - break; - case getElementsMask(ov::element::f32, ov::element::i32, ov::element::i64): - bucketize::value_type, - element_type_traits::value_type, - element_type_traits::value_type>(); - break; - case getElementsMask(ov::element::f32, ov::element::i64, 
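getElementsMask() above packs up to four element types into a single 32-bit key, eight bits per type, so that one switch in Bucketize::execute() can cover every input/boundaries/output precision combination. A compact illustration of the same packing with a stand-in enum:

#include <cstdint>
#include <cstdio>

// Stand-in for ov::element::Type_t; only the values used below.
enum class Elem : uint8_t { undefined = 0, f32 = 1, i32 = 2, i64 = 3 };

// Pack up to four element types into one 32-bit key (8 bits each); because the
// function is constexpr, the packed keys can be used directly as case labels.
constexpr uint32_t elementsMask(Elem p1, Elem p2, Elem p3 = Elem::undefined, Elem p4 = Elem::undefined) {
    return static_cast<uint32_t>(p1) | (static_cast<uint32_t>(p2) << 8) |
           (static_cast<uint32_t>(p3) << 16) | (static_cast<uint32_t>(p4) << 24);
}

static const char* describe(Elem in, Elem bounds, Elem out) {
    switch (elementsMask(in, bounds, out)) {
    case elementsMask(Elem::f32, Elem::f32, Elem::i32): return "f32 data, f32 boundaries -> i32 indices";
    case elementsMask(Elem::i64, Elem::i32, Elem::i64): return "i64 data, i32 boundaries -> i64 indices";
    default: return "unsupported combination";
    }
}

int main() { std::printf("%s\n", describe(Elem::f32, Elem::f32, Elem::i32)); }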
ov::element::i32): - bucketize::value_type, - element_type_traits::value_type, - element_type_traits::value_type>(); - break; - case getElementsMask(ov::element::f32, ov::element::i64, ov::element::i64): - bucketize::value_type, - element_type_traits::value_type, - element_type_traits::value_type>(); - break; - case getElementsMask(ov::element::i32, ov::element::f32, ov::element::i32): - bucketize::value_type, - element_type_traits::value_type, - element_type_traits::value_type>(); - break; - case getElementsMask(ov::element::i32, ov::element::f32, ov::element::i64): - bucketize::value_type, - element_type_traits::value_type, - element_type_traits::value_type>(); - break; - case getElementsMask(ov::element::i32, ov::element::i32, ov::element::i32): - bucketize::value_type, - element_type_traits::value_type, - element_type_traits::value_type>(); - break; - case getElementsMask(ov::element::i32, ov::element::i32, ov::element::i64): - bucketize::value_type, - element_type_traits::value_type, - element_type_traits::value_type>(); - break; - case getElementsMask(ov::element::i32, ov::element::i64, ov::element::i32): - bucketize::value_type, - element_type_traits::value_type, - element_type_traits::value_type>(); - break; - case getElementsMask(ov::element::i32, ov::element::i64, ov::element::i64): - bucketize::value_type, - element_type_traits::value_type, - element_type_traits::value_type>(); - break; - case getElementsMask(ov::element::i64, ov::element::f32, ov::element::i32): - bucketize::value_type, - element_type_traits::value_type, - element_type_traits::value_type>(); - break; - case getElementsMask(ov::element::i64, ov::element::f32, ov::element::i64): - bucketize::value_type, - element_type_traits::value_type, - element_type_traits::value_type>(); - break; - case getElementsMask(ov::element::i64, ov::element::i32, ov::element::i32): - bucketize::value_type, - element_type_traits::value_type, - element_type_traits::value_type>(); - break; - case getElementsMask(ov::element::i64, ov::element::i32, ov::element::i64): - bucketize::value_type, - element_type_traits::value_type, - element_type_traits::value_type>(); - break; - case getElementsMask(ov::element::i64, ov::element::i64, ov::element::i32): - bucketize::value_type, - element_type_traits::value_type, - element_type_traits::value_type>(); - break; - case getElementsMask(ov::element::i64, ov::element::i64, ov::element::i64): - bucketize::value_type, - element_type_traits::value_type, - element_type_traits::value_type>(); - break; - default: - OPENVINO_THROW(errorPrefix, " has unsupported precision: ", precision_mask); + case getElementsMask(ov::element::f32, ov::element::f32, ov::element::i32): + bucketize::value_type, + element_type_traits::value_type, + element_type_traits::value_type>(); + break; + case getElementsMask(ov::element::f32, ov::element::f32, ov::element::i64): + bucketize::value_type, + element_type_traits::value_type, + element_type_traits::value_type>(); + break; + case getElementsMask(ov::element::f32, ov::element::i32, ov::element::i32): + bucketize::value_type, + element_type_traits::value_type, + element_type_traits::value_type>(); + break; + case getElementsMask(ov::element::f32, ov::element::i32, ov::element::i64): + bucketize::value_type, + element_type_traits::value_type, + element_type_traits::value_type>(); + break; + case getElementsMask(ov::element::f32, ov::element::i64, ov::element::i32): + bucketize::value_type, + element_type_traits::value_type, + element_type_traits::value_type>(); + break; + case 
getElementsMask(ov::element::f32, ov::element::i64, ov::element::i64): + bucketize::value_type, + element_type_traits::value_type, + element_type_traits::value_type>(); + break; + case getElementsMask(ov::element::i32, ov::element::f32, ov::element::i32): + bucketize::value_type, + element_type_traits::value_type, + element_type_traits::value_type>(); + break; + case getElementsMask(ov::element::i32, ov::element::f32, ov::element::i64): + bucketize::value_type, + element_type_traits::value_type, + element_type_traits::value_type>(); + break; + case getElementsMask(ov::element::i32, ov::element::i32, ov::element::i32): + bucketize::value_type, + element_type_traits::value_type, + element_type_traits::value_type>(); + break; + case getElementsMask(ov::element::i32, ov::element::i32, ov::element::i64): + bucketize::value_type, + element_type_traits::value_type, + element_type_traits::value_type>(); + break; + case getElementsMask(ov::element::i32, ov::element::i64, ov::element::i32): + bucketize::value_type, + element_type_traits::value_type, + element_type_traits::value_type>(); + break; + case getElementsMask(ov::element::i32, ov::element::i64, ov::element::i64): + bucketize::value_type, + element_type_traits::value_type, + element_type_traits::value_type>(); + break; + case getElementsMask(ov::element::i64, ov::element::f32, ov::element::i32): + bucketize::value_type, + element_type_traits::value_type, + element_type_traits::value_type>(); + break; + case getElementsMask(ov::element::i64, ov::element::f32, ov::element::i64): + bucketize::value_type, + element_type_traits::value_type, + element_type_traits::value_type>(); + break; + case getElementsMask(ov::element::i64, ov::element::i32, ov::element::i32): + bucketize::value_type, + element_type_traits::value_type, + element_type_traits::value_type>(); + break; + case getElementsMask(ov::element::i64, ov::element::i32, ov::element::i64): + bucketize::value_type, + element_type_traits::value_type, + element_type_traits::value_type>(); + break; + case getElementsMask(ov::element::i64, ov::element::i64, ov::element::i32): + bucketize::value_type, + element_type_traits::value_type, + element_type_traits::value_type>(); + break; + case getElementsMask(ov::element::i64, ov::element::i64, ov::element::i64): + bucketize::value_type, + element_type_traits::value_type, + element_type_traits::value_type>(); + break; + default: + OPENVINO_THROW(errorPrefix, " has unsupported precision: ", precision_mask); } } @@ -222,9 +222,9 @@ bool Bucketize::isExecutable() const { template void Bucketize::bucketize() { - const auto *input_data = getSrcDataAtPortAs(0); - const auto *boundaries_data = getSrcDataAtPortAs(1); - auto *output_data = getDstDataAtPortAs(0); + const auto* input_data = getSrcDataAtPortAs(0); + const auto* boundaries_data = getSrcDataAtPortAs(1); + auto* output_data = getDstDataAtPortAs(0); if (!with_bins) { memset(output_data, 0, num_values * sizeof(T_IND)); @@ -248,6 +248,6 @@ bool Bucketize::created() const { return getType() == Type::Bucketize; } -} // namespace node -} // namespace intel_cpu -} // namespace ov +} // namespace node +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/bucketize.h b/src/plugins/intel_cpu/src/nodes/bucketize.h index c834921a38ce54..0ecdd633838950 100644 --- a/src/plugins/intel_cpu/src/nodes/bucketize.h +++ b/src/plugins/intel_cpu/src/nodes/bucketize.h @@ -14,7 +14,7 @@ class Bucketize : public Node { public: Bucketize(const std::shared_ptr& op, const GraphContext::CPtr 
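For reference, the Bucketize op itself maps every input value to the index of the bucket delimited by the sorted boundaries, with the with_right_bound attribute deciding whether a value equal to a boundary falls into the bucket on its left or on its right. A plausible scalar rendering of those semantics (a sketch of the op's behaviour, not the node's exact loop, which this excerpt only shows in part):

#include <algorithm>
#include <cassert>
#include <cstdint>
#include <vector>

// Scalar Bucketize: each value is assigned the index of its bucket. With
// with_right_bound the buckets include their right edge, otherwise the left.
static std::vector<int32_t> bucketize(const std::vector<float>& values,
                                      const std::vector<float>& boundaries,
                                      bool with_right_bound) {
    std::vector<int32_t> out(values.size());
    for (size_t i = 0; i < values.size(); ++i) {
        auto it = with_right_bound
                      ? std::lower_bound(boundaries.begin(), boundaries.end(), values[i])
                      : std::upper_bound(boundaries.begin(), boundaries.end(), values[i]);
        out[i] = static_cast<int32_t>(it - boundaries.begin());
    }
    return out;
}

int main() {
    std::vector<float> bounds = {1.f, 4.f, 10.f};
    auto right = bucketize({4.f}, bounds, true);   // 4 falls into (1, 4]  -> index 1
    auto left = bucketize({4.f}, bounds, false);   // 4 falls into [4, 10) -> index 2
    assert(right[0] == 1 && left[0] == 2);
    return 0;
}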
context); - void getSupportedDescriptors() override {}; + void getSupportedDescriptors() override{}; void initSupportedPrimitiveDescriptors() override; void execute(dnnl::stream strm) override; bool created() const override; @@ -46,6 +46,6 @@ class Bucketize : public Node { std::string errorPrefix; }; -} // namespace node -} // namespace intel_cpu -} // namespace ov +} // namespace node +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/causal_mask_preprocess.cpp b/src/plugins/intel_cpu/src/nodes/causal_mask_preprocess.cpp index 674d77265c9219..fd015a372ed1db 100644 --- a/src/plugins/intel_cpu/src/nodes/causal_mask_preprocess.cpp +++ b/src/plugins/intel_cpu/src/nodes/causal_mask_preprocess.cpp @@ -4,16 +4,16 @@ #include "causal_mask_preprocess.h" +#include +#include +#include + #include "common/bfloat16.hpp" #include "common/cpu_memcpy.h" #include "cpu/x64/cpu_isa_traits.hpp" #include "shape_inference/shape_inference_internal_dyn.hpp" #include "utils/plain_tensor.hpp" -#include -#include -#include - namespace ov { namespace intel_cpu { namespace node { @@ -48,7 +48,7 @@ The functionality is equivalent to following python code: template struct CausalMaskPreprocess::ExecutorCausalMaskPreprocess : public CausalMaskPreprocess::Executor { void execute(dnnl::stream strm, - intel_cpu::Node * pnode, + intel_cpu::Node* pnode, const intel_cpu::CausalMaskPreprocessNode::Config& config) override { ov::intel_cpu::PlainTensor t_attention_mask(pnode->getSrcMemoryAtPort(0)); ov::intel_cpu::PlainTensor t_batch_size(pnode->getSrcMemoryAtPort(1)); @@ -64,7 +64,14 @@ struct CausalMaskPreprocess::ExecutorCausalMaskPreprocess : public CausalMaskPre pnode->redefineOutputMemory({newDims}); ov::intel_cpu::PlainTensor t_dst(pnode->getDstMemoryAtPort(0)); - DEBUG_LOG("CausalMaskPreprocess::execute", config.type, " batch_size=", batch_size, " qLen=", qLen, " kvLen=", kvLen); + DEBUG_LOG("CausalMaskPreprocess::execute", + config.type, + " batch_size=", + batch_size, + " qLen=", + qLen, + " kvLen=", + kvLen); DEBUG_LOG("CausalMaskPreprocess::execute attention_mask=", t_attention_mask); DEBUG_LOG("CausalMaskPreprocess::execute cache_positions=", t_cache_positions); @@ -81,7 +88,7 @@ struct CausalMaskPreprocess::ExecutorCausalMaskPreprocess : public CausalMaskPre bool cmask_eq0 = (j <= row); bool amask_eq0 = (pamask[j] == 0); bool padding_mask = (cmask_eq0 && amask_eq0); - pdst[j] = (padding_mask | (!cmask_eq0))? min_dtype : T(0); + pdst[j] = (padding_mask | (!cmask_eq0)) ? 
min_dtype : T(0); } for (; j < kvLen; j++) { bool cmask_eq0 = (j <= row); @@ -103,7 +110,8 @@ CausalMaskPreprocess::CausalMaskPreprocess(const std::shared_ptr& op, m_config = node->get_config(); } -bool CausalMaskPreprocess::isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept { +bool CausalMaskPreprocess::isSupportedOperation(const std::shared_ptr& op, + std::string& errorMessage) noexcept { try { const auto node = std::dynamic_pointer_cast(op); if (!node) { @@ -133,7 +141,8 @@ void CausalMaskPreprocess::initSupportedPrimitiveDescriptors() { oprecs[0] = ov::element::f32; } // all input precisions must be int32 - for (auto& prec : iprecs) prec = ov::element::i32; + for (auto& prec : iprecs) + prec = ov::element::i32; } else { OPENVINO_THROW("CPU: CausalMaskPreprocess type not supported : " + m_config.type); } diff --git a/src/plugins/intel_cpu/src/nodes/causal_mask_preprocess.h b/src/plugins/intel_cpu/src/nodes/causal_mask_preprocess.h index eeb997c4cefb9f..444f242b0597a7 100644 --- a/src/plugins/intel_cpu/src/nodes/causal_mask_preprocess.h +++ b/src/plugins/intel_cpu/src/nodes/causal_mask_preprocess.h @@ -32,7 +32,7 @@ class CausalMaskPreprocess : public Node { private: struct Executor { virtual void execute(dnnl::stream strm, - intel_cpu::Node * pnode, + intel_cpu::Node* pnode, const intel_cpu::CausalMaskPreprocessNode::Config& config) = 0; virtual ~Executor() = default; }; diff --git a/src/plugins/intel_cpu/src/nodes/col2im.cpp b/src/plugins/intel_cpu/src/nodes/col2im.cpp index 4b83e78fd82505..409607ea6bb89c 100644 --- a/src/plugins/intel_cpu/src/nodes/col2im.cpp +++ b/src/plugins/intel_cpu/src/nodes/col2im.cpp @@ -3,8 +3,9 @@ // #include "col2im.h" -#include "openvino/reference/col2im.hpp" + #include "openvino/op/col2im.hpp" +#include "openvino/reference/col2im.hpp" namespace ov { namespace intel_cpu { @@ -62,42 +63,42 @@ void Col2Im::executeDynamicImpl(dnnl::stream strm) { template void Col2Im::executeImpl() { - ov::reference::col2im( - getSrcDataAtPortAs(0), - ov::Shape{getSrcMemoryAtPort(0)->getStaticDims()}, - getSrcDataAtPortAs(1), - getSrcDataAtPortAs(2), - getDstDataAtPortAs(0), - strides, - dilations, - padsBegin, - padsEnd); + ov::reference::col2im(getSrcDataAtPortAs(0), + ov::Shape{getSrcMemoryAtPort(0)->getStaticDims()}, + getSrcDataAtPortAs(1), + getSrcDataAtPortAs(2), + getDstDataAtPortAs(0), + strides, + dilations, + padsBegin, + padsEnd); } namespace { struct Col2ImContext { - Col2Im &node; + Col2Im& node; }; -} +} // namespace -template +template struct Col2Im::Col2ImExecute { using TData = typename std::tuple_element<0, T>::type; using TIndex = typename std::tuple_element<1, T>::type; - void operator()(Col2ImContext & ctx) { - ctx.node.executeImpl(); - } + void operator()(Col2ImContext& ctx) { + ctx.node.executeImpl(); + } }; void Col2Im::execute(dnnl::stream strm) { auto dataPrecision = getParentEdgeAt(0)->getMemory().getDesc().getPrecision(); auto indexPrecision = getParentEdgeAt(1)->getMemory().getDesc().getPrecision(); - Col2ImContext ctx = { - *this - }; + Col2ImContext ctx = {*this}; - OV_SWITCH(intel_cpu, Col2ImExecute, ctx, std::tie(dataPrecision, indexPrecision), + OV_SWITCH(intel_cpu, + Col2ImExecute, + ctx, + std::tie(dataPrecision, indexPrecision), OV_CASE2(ov::element::f32, ov::element::i32, float, int32_t), OV_CASE2(ov::element::f16, ov::element::i32, ov::float16, int32_t), OV_CASE2(ov::element::bf16, ov::element::i32, ov::bfloat16, int32_t), diff --git a/src/plugins/intel_cpu/src/nodes/col2im.h 
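The ExecutorCausalMaskPreprocess row loop above fills a position with a large negative value when it is either beyond the current cache position (a future token) or marked as padding by the attention mask, and with 0 otherwise. A scalar sketch of one output row; min_dtype is assumed here to be std::numeric_limits<T>::lowest(), since its exact definition sits outside this excerpt:

#include <cstdio>
#include <limits>
#include <vector>

// One row of the preprocessed causal mask: "future" positions (j > row) and
// padded positions inside the causal window (attention_mask[j] == 0) are
// masked out with the lowest representable value; the rest attend normally.
template <typename T>
static std::vector<T> causalMaskRow(size_t row, const std::vector<int32_t>& attention_mask, size_t kvLen) {
    const T min_dtype = std::numeric_limits<T>::lowest();
    std::vector<T> dst(kvLen, T(0));
    for (size_t j = 0; j < kvLen; ++j) {
        const bool causal_ok = (j <= row);                                            // cmask_eq0
        const bool padded = j < attention_mask.size() && attention_mask[j] == 0;      // amask_eq0
        if ((causal_ok && padded) || !causal_ok)                                      // padding_mask | !cmask_eq0
            dst[j] = min_dtype;
    }
    return dst;
}

int main() {
    // kvLen = 5, token 0 is padding, the query sits at cache position 2.
    auto row = causalMaskRow<float>(2, {0, 1, 1, 1, 1}, 5);
    for (float v : row) std::printf("%g ", v);  // lowest 0 0 lowest lowest
    std::printf("\n");
}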
b/src/plugins/intel_cpu/src/nodes/col2im.h index 9904689e53be0f..b56b4bb78469aa 100644 --- a/src/plugins/intel_cpu/src/nodes/col2im.h +++ b/src/plugins/intel_cpu/src/nodes/col2im.h @@ -26,7 +26,7 @@ class Col2Im : public Node { template void executeImpl(); - template + template struct Col2ImExecute; ov::Strides strides; diff --git a/src/plugins/intel_cpu/src/nodes/color_convert.cpp b/src/plugins/intel_cpu/src/nodes/color_convert.cpp index ea3c8e2c774944..a06214b768d6b4 100644 --- a/src/plugins/intel_cpu/src/nodes/color_convert.cpp +++ b/src/plugins/intel_cpu/src/nodes/color_convert.cpp @@ -3,14 +3,17 @@ // #include "color_convert.h" + #include -#include -#include -#include -#include + #include -#include "openvino/core/parallel.hpp" +#include +#include +#include +#include + #include "kernels/x64/jit_kernel.hpp" +#include "openvino/core/parallel.hpp" #include "shape_inference/custom/color_convert.hpp" using namespace dnnl::impl; @@ -39,7 +42,7 @@ class Converter : public ColorConvert::Converter { using Base = ColorConvert::Converter; public: - Converter(Node *node); + Converter(Node* node); bool singlePlane() const; @@ -47,12 +50,12 @@ class Converter : public ColorConvert::Converter { std::tuple yuv_to_rgb(float y, float u, float v); }; -Converter::Converter(Node *node) - : Base(node, node->getAlgorithm() == Algorithm::ColorConvertNV12toRGB - || node->getAlgorithm() == Algorithm::ColorConvertI420toRGB - ? ColorFormat { { 0, 1, 2 } } - : ColorFormat { { 2, 1, 0 } }) { -} +Converter::Converter(Node* node) + : Base(node, + node->getAlgorithm() == Algorithm::ColorConvertNV12toRGB || + node->getAlgorithm() == Algorithm::ColorConvertI420toRGB + ? ColorFormat{{0, 1, 2}} + : ColorFormat{{2, 1, 0}}) {} bool Converter::singlePlane() const { return _node->getOriginalInputsNumber() == 1; @@ -81,46 +84,43 @@ struct jit_uni_converter : public jit_kernel { DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_uni_converter) struct Params { - const void * y; - const void * u; - const void * v; - void * dst; + const void* y; + const void* u; + const void* v; + void* dst; size_t width; - uint8_t colorFormat; // RGB: 0, BGR: !=0 + uint8_t colorFormat; // RGB: 0, BGR: !=0 }; - typedef void (*function_t)(const Params *); + typedef void (*function_t)(const Params*); void init(); - void operator()(const Params & args) const { + void operator()(const Params& args) const { _fn(&args); } protected: jit_uni_converter(); - template - void yuv_to_rgb(const variable & y, - const variable & u, - const variable & v, - const variable & color_format, + template + void yuv_to_rgb(const variable& y, + const variable& u, + const variable& v, + const variable& color_format, bool round); - template - void store_tail(const variable & dst, - const variable & a, - const variable & b, - const variable & c, - const variable & size); + template + void store_tail(const variable& dst, + const variable& a, + const variable& b, + const variable& c, + const variable& size); function_t _fn; variable _consts; }; -jit_uni_converter::jit_uni_converter() - : jit_kernel(jit_name()), - _consts(*this) { -} +jit_uni_converter::jit_uni_converter() : jit_kernel(jit_name()), _consts(*this) {} void jit_uni_converter::init() { if (create_kernel() != status::success) @@ -128,15 +128,13 @@ void jit_uni_converter::init() { _fn = (function_t)jit_ker(); } -template -void jit_uni_converter::yuv_to_rgb(const variable & y, - const variable & u, - const variable & v, - const variable & color_format, +template +void jit_uni_converter::yuv_to_rgb(const variable& y, + const variable& 
u, + const variable& v, + const variable& color_format, bool round) { - auto clip = [&](const variable & op, - const variable & a, - const variable & b) { + auto clip = [&](const variable& op, const variable& a, const variable& b) { if (round) uni_vroundps(op, op, 0); uni_vmaxps(op, op, a); @@ -144,8 +142,12 @@ void jit_uni_converter::yuv_to_rgb(const variable & y, }; // blend r,g,b and put to r0,r1,r2 - auto blend = [&](const variable & r, const variable & g, const variable & b, - const variable & r0, const variable & r1, const variable & r2) { + auto blend = [&](const variable& r, + const variable& g, + const variable& b, + const variable& r0, + const variable& r1, + const variable& r2) { /* Input: r0,r1,r2,r3,r4,r5,r6,r7 @@ -174,7 +176,7 @@ void jit_uni_converter::yuv_to_rgb(const variable & y, */ auto genPermutationMask = [&](int offset) { - std::array mask {}; + std::array mask{}; for (uint8_t i = 0; i < mask.size(); ++i) mask[(i * 3 + offset) % mask.size()] = i; return mask; @@ -184,11 +186,8 @@ void jit_uni_converter::yuv_to_rgb(const variable & y, g.permute(genPermutationMask(1)); b.permute(genPermutationMask(2)); - auto blendWithMask = [&](int offset, const variable & result) { - static const uint32_t blendMasks[2] = { - 0x92492492, - 0x24924924 - }; + auto blendWithMask = [&](int offset, const variable& result) { + static const uint32_t blendMasks[2] = {0x92492492, 0x24924924}; const uint16_t mask0 = static_cast(blendMasks[0] >> ((offset * N) % 3)); const uint16_t mask1 = static_cast(blendMasks[1] >> ((offset * N) % 3)); @@ -208,29 +207,29 @@ void jit_uni_converter::yuv_to_rgb(const variable & y, auto b = var(); auto tmp = var(); - uni_vbroadcastss(tmp, ptr[_consts + 0 * sizeof(float)]); // tmp = [16.0f,16.0f,...] - uni_vsubps(y, y, tmp); // y = y - tmp - uni_vbroadcastss(tmp, ptr[_consts + 1 * sizeof(float)]); // tmp = [128.f,128.f,...] - uni_vsubps(u, u, tmp); // u = u - tmp - uni_vsubps(v, v, tmp); // v = v - tmp + uni_vbroadcastss(tmp, ptr[_consts + 0 * sizeof(float)]); // tmp = [16.0f,16.0f,...] + uni_vsubps(y, y, tmp); // y = y - tmp + uni_vbroadcastss(tmp, ptr[_consts + 1 * sizeof(float)]); // tmp = [128.f,128.f,...] + uni_vsubps(u, u, tmp); // u = u - tmp + uni_vsubps(v, v, tmp); // v = v - tmp - uni_vbroadcastss(tmp, ptr[_consts + 2 * sizeof(float)]); // tmp = [1.164f,1.164f,...] - uni_vmulps(y, y, tmp); // y = y * tmp + uni_vbroadcastss(tmp, ptr[_consts + 2 * sizeof(float)]); // tmp = [1.164f,1.164f,...] + uni_vmulps(y, y, tmp); // y = y * tmp - uni_vbroadcastss(r, ptr[_consts + 3 * sizeof(float)]); // r = [1.596f,1.596f,...] - uni_vmulps(r, r, v); // r = r * v - uni_vaddps(r, r, y); // r = r + y + uni_vbroadcastss(r, ptr[_consts + 3 * sizeof(float)]); // r = [1.596f,1.596f,...] + uni_vmulps(r, r, v); // r = r * v + uni_vaddps(r, r, y); // r = r + y - uni_vbroadcastss(g, ptr[_consts + 4 * sizeof(float)]); // g = [0.391f,0.391f,...] - uni_vmulps(g, g, u); // g = g * u - uni_vsubps(g, y, g); // g = y - g - uni_vbroadcastss(tmp, ptr[_consts + 6 * sizeof(float)]); // tmp = [0.813f,0.813f,...] - uni_vmulps(tmp, tmp, v); // tmp = tmp * v - uni_vsubps(g, g, tmp); // g = g - tmp + uni_vbroadcastss(g, ptr[_consts + 4 * sizeof(float)]); // g = [0.391f,0.391f,...] + uni_vmulps(g, g, u); // g = g * u + uni_vsubps(g, y, g); // g = y - g + uni_vbroadcastss(tmp, ptr[_consts + 6 * sizeof(float)]); // tmp = [0.813f,0.813f,...] 
+ uni_vmulps(tmp, tmp, v); // tmp = tmp * v + uni_vsubps(g, g, tmp); // g = g - tmp - uni_vbroadcastss(b, ptr[_consts + 5 * sizeof(float)]); // b = [2.018f,2.018f,...] - uni_vmulps(b, b, u); // b = b * u - uni_vaddps(b, b, y); // b = b + y + uni_vbroadcastss(b, ptr[_consts + 5 * sizeof(float)]); // b = [2.018f,2.018f,...] + uni_vmulps(b, b, u); // b = b * u + uni_vaddps(b, b, y); // b = b + y // clip uni_vxorps(y, y, y); @@ -241,24 +240,30 @@ void jit_uni_converter::yuv_to_rgb(const variable & y, clip(b, y, u); _if(color_format == 0) - ._then([&]{ blend(r, g, b, y, u, v); }) - ._else([&]{ blend(b, g, r, y, u, v); }); + ._then([&] { + blend(r, g, b, y, u, v); + }) + ._else([&] { + blend(b, g, r, y, u, v); + }); } -template -void jit_uni_converter::store_tail(const variable & dst, - const variable & a, - const variable & b, - const variable & c, - const variable & size) { +template +void jit_uni_converter::store_tail(const variable& dst, + const variable& a, + const variable& b, + const variable& c, + const variable& size) { const size_t step = N * sizeof(T); auto s = stack(3 * step); auto sptr = var(); sptr = s.pointer(); - store(sptr, a); sptr += step; - store(sptr, b); sptr += step; + store(sptr, a); + sptr += step; + store(sptr, b); + sptr += step; store(sptr, c); auto copy_size = size * size_t(3u); @@ -269,36 +274,33 @@ void jit_uni_converter::store_tail(const variable & dst, namespace nv12 { -ColorConvert::Converter::PrimitiveDescs supportedPrimitiveDescs(Node *node) { - const LayoutType layout = LayoutType::ncsp; // 0,1,2,3 +ColorConvert::Converter::PrimitiveDescs supportedPrimitiveDescs(Node* node) { + const LayoutType layout = LayoutType::ncsp; // 0,1,2,3 - const ov::element::Type precision = node->getOriginalInputPrecisionAtPort(0) == ov::element::u8 - ? ov::element::u8 - : ov::element::f32; + const ov::element::Type precision = + node->getOriginalInputPrecisionAtPort(0) == ov::element::u8 ? ov::element::u8 : ov::element::f32; ColorConvert::Converter::PrimitiveDescs descs; - descs.emplace_back(std::vector { node->getOriginalInputsNumber(), { layout, precision } }, - std::vector { { layout, precision } }, - mayiuse(cpu_isa_t::sse41) - ? impl_desc_type::jit_uni - : impl_desc_type::ref, - true); + descs.emplace_back(std::vector{node->getOriginalInputsNumber(), {layout, precision}}, + std::vector{{layout, precision}}, + mayiuse(cpu_isa_t::sse41) ? impl_desc_type::jit_uni : impl_desc_type::ref, + true); return descs; } -template +template class SinglePlaneConvert; -template +template class TwoPlaneConvert; class RefConverter : public Converter { public: - RefConverter(Node *node); + RefConverter(Node* node); protected: - template + template void convert(const T* y, const T* uv, T* dst, @@ -309,15 +311,14 @@ class RefConverter : public Converter { size_t stride_uv); }; -RefConverter::RefConverter(Node *node) - : Converter(node) { +RefConverter::RefConverter(Node* node) : Converter(node) { if (node->getOriginalInputsNumber() != (singlePlane() ? 
1 : 2)) OPENVINO_THROW("NV12Converter node has incorrect number of inputs"); if (!node->getOriginalOutputsNumber()) OPENVINO_THROW("NV12Converter node has incorrect number of outputs"); } -template +template void RefConverter::convert(const T* y, const T* uv, T* dst, @@ -346,13 +347,13 @@ void RefConverter::convert(const T* y, }); } -template +template class SinglePlaneConvert : public RefConverter { public: using RefConverter::RefConverter; void execute(dnnl::stream strm) override { - const auto & dims = inputDims(0); + const auto& dims = inputDims(0); const size_t batch_size = dims[N_DIM]; const size_t height = dims[H_DIM] * 2 / 3; @@ -362,22 +363,17 @@ class SinglePlaneConvert : public RefConverter { const T* uv = y + width * height; T* dst = static_cast(output(0)); - convert(y, uv, dst, - batch_size, - height, - width, - height * width * 3 / 2, - height * width * 3 / 2); + convert(y, uv, dst, batch_size, height, width, height * width * 3 / 2, height * width * 3 / 2); } }; -template +template class TwoPlaneConvert : public RefConverter { public: using RefConverter::RefConverter; void execute(dnnl::stream strm) override { - const auto & dims = inputDims(0); + const auto& dims = inputDims(0); const T* y = static_cast(input(0)); const T* uv = static_cast(input(1)); @@ -387,34 +383,24 @@ class TwoPlaneConvert : public RefConverter { const size_t height = dims[H_DIM]; const size_t width = dims[W_DIM]; - convert(y, uv, dst, - batch_size, - height, - width, - height * width, - height * width / 2); + convert(y, uv, dst, batch_size, height, width, height * width, height * width / 2); } }; #if defined(OPENVINO_ARCH_X86_64) -template +template class JitConverter; -template +template class JitConverter : public jit_uni_converter { private: void generate() override; - std::tuple, - variable, - variable> - load_yuv(const variable & src_y, - const variable & src_uv); - std::tuple, - variable> - unpack_uv(const variable & uv); + std::tuple, variable, variable> load_yuv(const variable& src_y, + const variable& src_uv); + std::tuple, variable> unpack_uv(const variable& uv); }; -template +template void JitConverter::generate() { preamble(); @@ -425,7 +411,7 @@ void JitConverter::generate() { auto width = arg(&Params::width); auto colorFormat = arg(&Params::colorFormat); - static const float data[8] = { 16.f, 128.f, 1.164f, 1.596f, 0.391f, 2.018f, 0.813f, 255.f }; + static const float data[8] = {16.f, 128.f, 1.164f, 1.596f, 0.391f, 2.018f, 0.813f, 255.f}; _consts = data; const size_t reg_capacity_log = static_cast(std::logb(N)); @@ -433,26 +419,29 @@ void JitConverter::generate() { width >>= reg_capacity_log; - foreach(0, width, [&](const Reg64 & idx) { + foreach (0, width, [&](const Reg64& idx) { auto yuv = load_yuv(src_y, src_uv); // Aliases - const auto & y = std::get<0>(yuv); - const auto & u = std::get<1>(yuv); - const auto & v = std::get<2>(yuv); + const auto& y = std::get<0>(yuv); + const auto& u = std::get<1>(yuv); + const auto& v = std::get<2>(yuv); yuv_to_rgb(y, u, v, colorFormat, std::is_integral::value); - store(dst, y); dst += step; - store(dst, u); dst += step; - store(dst, v); dst += step; - }); + store(dst, y); + dst += step; + store(dst, u); + dst += step; + store(dst, v); + dst += step; + }) + ; mov(width, argPtr(&Params::width)); width &= N - 1; - _if(width != 0) - ._then([&] { + _if(width != 0)._then([&] { auto y = var(); auto uv = var(); @@ -462,8 +451,8 @@ void JitConverter::generate() { auto uv_pair = unpack_uv(uv); // Aliases - const auto & u = std::get<0>(uv_pair); - const auto & v 
= std::get<1>(uv_pair); + const auto& u = std::get<0>(uv_pair); + const auto& v = std::get<1>(uv_pair); yuv_to_rgb(y, u, v, colorFormat, std::is_integral::value); @@ -473,12 +462,9 @@ void JitConverter::generate() { postamble(); } -template -std::tuple, - jit_kernel::variable, - jit_kernel::variable> -JitConverter::load_yuv(const variable & src_y, - const variable & src_uv) { +template +std::tuple, jit_kernel::variable, jit_kernel::variable> +JitConverter::load_yuv(const variable& src_y, const variable& src_uv) { auto y = var(); auto uv = var(); @@ -490,29 +476,26 @@ JitConverter::load_yuv(const variable & src_y, src_y += N * sizeof(T); src_uv += N * sizeof(T); - return std::make_tuple(std::move(y), - std::move(std::get<0>(uv_pair)), - std::move(std::get<1>(uv_pair))); + return std::make_tuple(std::move(y), std::move(std::get<0>(uv_pair)), std::move(std::get<1>(uv_pair))); } -template -std::tuple, - jit_kernel::variable> -JitConverter::unpack_uv(const variable & uv) { +template +std::tuple, jit_kernel::variable> JitConverter::unpack_uv( + const variable& uv) { auto u = var(); auto v = var(); - const uint8_t even_mask = 0xA0; // 0b10100000 - const uint8_t odd_mask = 0xF5; // 0b11110101 + const uint8_t even_mask = 0xA0; // 0b10100000 + const uint8_t odd_mask = 0xF5; // 0b11110101 - uni_vshufps(u, uv, uv, even_mask); // u = uv[0,0,2,2,4,4,6,6] - uni_vshufps(v, uv, uv, odd_mask); // v = uv[1,1,3,3,5,5,7,7] + uni_vshufps(u, uv, uv, even_mask); // u = uv[0,0,2,2,4,4,6,6] + uni_vshufps(v, uv, uv, odd_mask); // v = uv[1,1,3,3,5,5,7,7] return std::make_tuple(std::move(u), std::move(v)); } -template -const jit_uni_converter & jit_converter_create() { +template +const jit_uni_converter& jit_converter_create() { auto createKernel = []() { std::unique_ptr kernel; @@ -540,22 +523,21 @@ const jit_uni_converter & jit_converter_create() { return *kernel; } -template -const jit_uni_converter & jit_converter_get() { +template +const jit_uni_converter& jit_converter_get() { return jit_converter_create(); } -template +template class SinglePlaneConvert : public Converter { public: - SinglePlaneConvert(Node *node) - : Converter(node) { + SinglePlaneConvert(Node* node) : Converter(node) { jit_converter_create(); } void execute(dnnl::stream strm) override { - const auto & kernel = jit_converter_get(); - const auto & dims = inputDims(0); + const auto& kernel = jit_converter_get(); + const auto& dims = inputDims(0); const size_t batch_size = dims[N_DIM]; const size_t height = dims[H_DIM] * 2 / 3; @@ -574,23 +556,22 @@ class SinglePlaneConvert : public Converter { args.u = args.v = uv + batch * stride_uv + (h / 2) * width; args.dst = dst + (batch * width * height + h * width) * 3; args.width = width; - args.colorFormat = _colorFormat[0]; // The first byte is enough to determine the RGB or BGR format. + args.colorFormat = _colorFormat[0]; // The first byte is enough to determine the RGB or BGR format. 
kernel(args); }); } }; -template +template class TwoPlaneConvert : public Converter { public: - TwoPlaneConvert(Node *node) - : Converter(node) { + TwoPlaneConvert(Node* node) : Converter(node) { jit_converter_create(); } void execute(dnnl::stream strm) override { - const auto & kernel = jit_converter_get(); - const auto & dims = inputDims(0); + const auto& kernel = jit_converter_get(); + const auto& dims = inputDims(0); const size_t batch_size = dims[N_DIM]; const size_t height = dims[H_DIM]; @@ -609,46 +590,43 @@ class TwoPlaneConvert : public Converter { args.u = args.v = uv + batch * stride_uv + (h / 2) * width; args.dst = dst + (batch * width * height + h * width) * 3; args.width = width; - args.colorFormat = _colorFormat[0]; // The first byte is enough to determine the RGB or BGR format. + args.colorFormat = _colorFormat[0]; // The first byte is enough to determine the RGB or BGR format. kernel(args); }); } }; #endif -} // namespace nv12 +} // namespace nv12 namespace i420 { -ColorConvert::Converter::PrimitiveDescs supportedPrimitiveDescs(Node *node) { - const LayoutType layout = LayoutType::ncsp; // 0,1,2,3 +ColorConvert::Converter::PrimitiveDescs supportedPrimitiveDescs(Node* node) { + const LayoutType layout = LayoutType::ncsp; // 0,1,2,3 - const ov::element::Type precision = node->getOriginalInputPrecisionAtPort(0) == ov::element::u8 - ? ov::element::u8 - : ov::element::f32; + const ov::element::Type precision = + node->getOriginalInputPrecisionAtPort(0) == ov::element::u8 ? ov::element::u8 : ov::element::f32; ColorConvert::Converter::PrimitiveDescs descs; - descs.emplace_back(std::vector { node->getOriginalInputsNumber(), { layout, precision } }, - std::vector { { layout, precision } }, - mayiuse(cpu_isa_t::sse41) - ? impl_desc_type::jit_uni - : impl_desc_type::ref, - true); + descs.emplace_back(std::vector{node->getOriginalInputsNumber(), {layout, precision}}, + std::vector{{layout, precision}}, + mayiuse(cpu_isa_t::sse41) ? impl_desc_type::jit_uni : impl_desc_type::ref, + true); return descs; } -template +template class SinglePlaneConvert; -template +template class ThreePlaneConvert; class RefConverter : public Converter { public: - RefConverter(Node *node); + RefConverter(Node* node); protected: - template + template void convert(const T* y, const T* u, const T* v, @@ -660,15 +638,14 @@ class RefConverter : public Converter { size_t stride_uv); }; -RefConverter::RefConverter(Node *node) - : Converter(node) { - if (node->getOriginalInputsNumber() != (singlePlane() ? 1: 3)) +RefConverter::RefConverter(Node* node) : Converter(node) { + if (node->getOriginalInputsNumber() != (singlePlane() ? 
1 : 3)) OPENVINO_THROW("I420Converter node has incorrect number of inputs"); if (!node->getOriginalOutputsNumber()) OPENVINO_THROW("I420Converter node has incorrect number of outputs"); } -template +template void RefConverter::convert(const T* y, const T* u, const T* v, @@ -699,13 +676,13 @@ void RefConverter::convert(const T* y, }); } -template +template class SinglePlaneConvert : public RefConverter { public: using RefConverter::RefConverter; void execute(dnnl::stream strm) override { - const auto & dims = inputDims(0); + const auto& dims = inputDims(0); const size_t batch_size = dims[N_DIM]; const size_t height = dims[H_DIM] * 2 / 3; @@ -716,22 +693,17 @@ class SinglePlaneConvert : public RefConverter { const T* v = y + 5 * width * height / 4; T* dst = static_cast(output(0)); - convert(y, u, v, dst, - batch_size, - height, - width, - height * width * 3 / 2, - height * width * 3 / 2); + convert(y, u, v, dst, batch_size, height, width, height * width * 3 / 2, height * width * 3 / 2); } }; -template +template class ThreePlaneConvert : public RefConverter { public: using RefConverter::RefConverter; void execute(dnnl::stream strm) override { - const auto & dims = inputDims(0); + const auto& dims = inputDims(0); const T* y = static_cast(input(0)); const T* u = static_cast(input(1)); @@ -742,34 +714,25 @@ class ThreePlaneConvert : public RefConverter { const size_t height = dims[H_DIM]; const size_t width = dims[W_DIM]; - convert(y, u, v, dst, - batch_size, - height, - width, - height * width, - height * width / 4); + convert(y, u, v, dst, batch_size, height, width, height * width, height * width / 4); } }; #if defined(OPENVINO_ARCH_X86_64) -template +template class JitConverter; -template +template class JitConverter : public jit_uni_converter { private: void generate() override; - std::tuple, - variable, - variable> - load_yuv(const variable & src_y, - const variable & src_u, - const variable & src_v); - void unpack_uv(const variable & u, - const variable & v); + std::tuple, variable, variable> load_yuv(const variable& src_y, + const variable& src_u, + const variable& src_v); + void unpack_uv(const variable& u, const variable& v); }; -template +template void JitConverter::generate() { preamble(); @@ -781,7 +744,7 @@ void JitConverter::generate() { auto width = arg(&Params::width); auto colorFormat = arg(&Params::colorFormat); - static const float data[8] = { 16.f, 128.f, 1.164f, 1.596f, 0.391f, 2.018f, 0.813f, 255.f }; + static const float data[8] = {16.f, 128.f, 1.164f, 1.596f, 0.391f, 2.018f, 0.813f, 255.f}; _consts = data; const size_t reg_capacity_log = static_cast(std::logb(N)); @@ -789,26 +752,29 @@ void JitConverter::generate() { width >>= reg_capacity_log; - foreach(0, width, [&](const Reg64 & idx) { + foreach (0, width, [&](const Reg64& idx) { auto yuv = load_yuv(src_y, src_u, src_v); // Aliases - const auto & y = std::get<0>(yuv); - const auto & u = std::get<1>(yuv); - const auto & v = std::get<2>(yuv); + const auto& y = std::get<0>(yuv); + const auto& u = std::get<1>(yuv); + const auto& v = std::get<2>(yuv); yuv_to_rgb(y, u, v, colorFormat, std::is_integral::value); - store(dst, y); dst += step; - store(dst, u); dst += step; - store(dst, v); dst += step; - }); + store(dst, y); + dst += step; + store(dst, u); + dst += step; + store(dst, v); + dst += step; + }) + ; mov(width, argPtr(&Params::width)); width &= N - 1; - _if(width != 0) - ._then([&] { + _if(width != 0)._then([&] { auto y = var(); auto u = var(); auto v = var(); @@ -829,13 +795,11 @@ void JitConverter::generate() { 
postamble(); } -template -std::tuple, - jit_kernel::variable, - jit_kernel::variable> -JitConverter::load_yuv(const variable & src_y, - const variable & src_u, - const variable & src_v) { +template +std::tuple, jit_kernel::variable, jit_kernel::variable> +JitConverter::load_yuv(const variable& src_y, + const variable& src_u, + const variable& src_v) { auto y = var(); auto u = var(); auto v = var(); @@ -853,16 +817,15 @@ JitConverter::load_yuv(const variable & src_y, return std::make_tuple(std::move(y), std::move(u), std::move(v)); } -template -void JitConverter::unpack_uv(const variable & u, - const variable & v) { - static const uint8_t order[] = { 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7 }; +template +void JitConverter::unpack_uv(const variable& u, const variable& v) { + static const uint8_t order[] = {0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7}; u.permute(order); v.permute(order); } -template -const jit_uni_converter & jit_converter_create() { +template +const jit_uni_converter& jit_converter_create() { auto createKernel = []() { std::unique_ptr kernel; @@ -890,22 +853,21 @@ const jit_uni_converter & jit_converter_create() { return *kernel; } -template -const jit_uni_converter & jit_converter_get() { +template +const jit_uni_converter& jit_converter_get() { return jit_converter_create(); } -template +template class SinglePlaneConvert : public Converter { public: - SinglePlaneConvert(Node *node) - : Converter(node) { + SinglePlaneConvert(Node* node) : Converter(node) { jit_converter_create(); } void execute(dnnl::stream strm) override { - const auto & kernel = jit_converter_get(); - const auto & dims = inputDims(0); + const auto& kernel = jit_converter_get(); + const auto& dims = inputDims(0); const size_t batch_size = dims[N_DIM]; const size_t height = dims[H_DIM] * 2 / 3; @@ -926,23 +888,22 @@ class SinglePlaneConvert : public Converter { args.v = v + batch * stride_uv + (h / 2) * (width / 2); args.dst = dst + (batch * width * height + h * width) * 3; args.width = width; - args.colorFormat = _colorFormat[0]; // The first byte is enough to determine the RGB or BGR format. + args.colorFormat = _colorFormat[0]; // The first byte is enough to determine the RGB or BGR format. kernel(args); }); } }; -template +template class ThreePlaneConvert : public Converter { public: - ThreePlaneConvert(Node *node) - : Converter(node) { + ThreePlaneConvert(Node* node) : Converter(node) { jit_converter_create(); } void execute(dnnl::stream strm) override { - const auto & kernel = jit_converter_get(); - const auto & dims = inputDims(0); + const auto& kernel = jit_converter_get(); + const auto& dims = inputDims(0); const T* y = static_cast(input(0)); const T* u = static_cast(input(1)); @@ -963,20 +924,19 @@ class ThreePlaneConvert : public Converter { args.v = v + batch * stride_uv + (h / 2) * (width / 2); args.dst = dst + (batch * width * height + h * width) * 3; args.width = width; - args.colorFormat = _colorFormat[0]; // The first byte is enough to determine the RGB or BGR format. + args.colorFormat = _colorFormat[0]; // The first byte is enough to determine the RGB or BGR format. 
kernel(args); }); } }; #endif -} // namespace i420 +} // namespace i420 -} // namespace +} // namespace -ColorConvert::Converter::Converter(Node *node, const ColorFormat & colorFormat) - : _node(node) - , _colorFormat(colorFormat) { -} +ColorConvert::Converter::Converter(Node* node, const ColorFormat& colorFormat) + : _node(node), + _colorFormat(colorFormat) {} ov::element::Type ColorConvert::Converter::inputPrecision(size_t idx) const { return _node->getParentEdgeAt(idx)->getMemory().getDesc().getPrecision(); @@ -986,15 +946,15 @@ ov::element::Type ColorConvert::Converter::outputPrecision(size_t idx) const { return _node->getChildEdgeAt(idx)->getMemory().getDesc().getPrecision(); } -const void * ColorConvert::Converter::input(size_t idx) const { +const void* ColorConvert::Converter::input(size_t idx) const { return _node->getSrcDataAtPort(idx); } -void * ColorConvert::Converter::output(size_t idx) const { +void* ColorConvert::Converter::output(size_t idx) const { return _node->getDstDataAtPort(idx); } -const VectorDims & ColorConvert::Converter::inputDims(size_t idx) const { +const VectorDims& ColorConvert::Converter::inputDims(size_t idx) const { return _node->getParentEdgeAt(idx)->getMemory().getStaticDims(); } @@ -1019,42 +979,42 @@ void ColorConvert::initSupportedPrimitiveDescriptors() { return; switch (algorithm) { - case Algorithm::ColorConvertNV12toRGB: - case Algorithm::ColorConvertNV12toBGR: { - for (const auto &desc : nv12::supportedPrimitiveDescs(this)) { - const auto & inPortConfigs = std::get<0>(desc); - const auto & outPortConfigs = std::get<1>(desc); - const auto implType = std::get<2>(desc); - addSupportedPrimDesc(inPortConfigs, outPortConfigs, implType); - } - initSupportedNV12Impls(); - break; + case Algorithm::ColorConvertNV12toRGB: + case Algorithm::ColorConvertNV12toBGR: { + for (const auto& desc : nv12::supportedPrimitiveDescs(this)) { + const auto& inPortConfigs = std::get<0>(desc); + const auto& outPortConfigs = std::get<1>(desc); + const auto implType = std::get<2>(desc); + addSupportedPrimDesc(inPortConfigs, outPortConfigs, implType); } - case Algorithm::ColorConvertI420toRGB: - case Algorithm::ColorConvertI420toBGR: { - for (const auto &desc : i420::supportedPrimitiveDescs(this)) { - const auto & inPortConfigs = std::get<0>(desc); - const auto & outPortConfigs = std::get<1>(desc); - const auto implType = std::get<2>(desc); - addSupportedPrimDesc(inPortConfigs, outPortConfigs, implType); - } - initSupportedI420Impls(); - break; + initSupportedNV12Impls(); + break; + } + case Algorithm::ColorConvertI420toRGB: + case Algorithm::ColorConvertI420toBGR: { + for (const auto& desc : i420::supportedPrimitiveDescs(this)) { + const auto& inPortConfigs = std::get<0>(desc); + const auto& outPortConfigs = std::get<1>(desc); + const auto implType = std::get<2>(desc); + addSupportedPrimDesc(inPortConfigs, outPortConfigs, implType); } - default: - break; + initSupportedI420Impls(); + break; + } + default: + break; } } void ColorConvert::initSupportedNV12Impls() { - #define SUPPORTED_IMPL(Impl, type, desc_type) \ - [](Node *node) { \ - return new nv12::Impl(node); \ - }; +#define SUPPORTED_IMPL(Impl, type, desc_type) \ + [](Node* node) { \ + return new nv12::Impl(node); \ + }; // ref { - auto &impls = _supportedImpls[impl_desc_type::ref][algorithm]; + auto& impls = _supportedImpls[impl_desc_type::ref][algorithm]; impls[ov::element::Type_t::u8][true] = SUPPORTED_IMPL(SinglePlaneConvert, uint8_t, ref); impls[ov::element::Type_t::u8][false] = SUPPORTED_IMPL(TwoPlaneConvert, uint8_t, 
ref); impls[ov::element::Type_t::f32][true] = SUPPORTED_IMPL(SinglePlaneConvert, float, ref); @@ -1064,25 +1024,25 @@ void ColorConvert::initSupportedNV12Impls() { #if defined(OPENVINO_ARCH_X86_64) // jit_uni { - auto &impls = _supportedImpls[impl_desc_type::jit_uni][algorithm]; + auto& impls = _supportedImpls[impl_desc_type::jit_uni][algorithm]; impls[ov::element::Type_t::u8][true] = SUPPORTED_IMPL(SinglePlaneConvert, uint8_t, jit_uni); impls[ov::element::Type_t::u8][false] = SUPPORTED_IMPL(TwoPlaneConvert, uint8_t, jit_uni); impls[ov::element::Type_t::f32][true] = SUPPORTED_IMPL(SinglePlaneConvert, float, jit_uni); impls[ov::element::Type_t::f32][false] = SUPPORTED_IMPL(TwoPlaneConvert, float, jit_uni); } #endif - #undef SUPPORTED_IMPL +#undef SUPPORTED_IMPL } void ColorConvert::initSupportedI420Impls() { - #define SUPPORTED_IMPL(Impl, type, desc_type) \ - [](Node *node) { \ - return new i420::Impl(node); \ - }; +#define SUPPORTED_IMPL(Impl, type, desc_type) \ + [](Node* node) { \ + return new i420::Impl(node); \ + }; // ref { - auto &impls = _supportedImpls[impl_desc_type::ref][algorithm]; + auto& impls = _supportedImpls[impl_desc_type::ref][algorithm]; impls[ov::element::Type_t::u8][true] = SUPPORTED_IMPL(SinglePlaneConvert, uint8_t, ref); impls[ov::element::Type_t::u8][false] = SUPPORTED_IMPL(ThreePlaneConvert, uint8_t, ref); impls[ov::element::Type_t::f32][true] = SUPPORTED_IMPL(SinglePlaneConvert, float, ref); @@ -1092,32 +1052,29 @@ void ColorConvert::initSupportedI420Impls() { #if defined(OPENVINO_ARCH_X86_64) // jit_uni { - auto &impls = _supportedImpls[impl_desc_type::jit_uni][algorithm]; + auto& impls = _supportedImpls[impl_desc_type::jit_uni][algorithm]; impls[ov::element::Type_t::u8][true] = SUPPORTED_IMPL(SinglePlaneConvert, uint8_t, jit_uni); impls[ov::element::Type_t::u8][false] = SUPPORTED_IMPL(ThreePlaneConvert, uint8_t, jit_uni); impls[ov::element::Type_t::f32][true] = SUPPORTED_IMPL(SinglePlaneConvert, float, jit_uni); impls[ov::element::Type_t::f32][false] = SUPPORTED_IMPL(ThreePlaneConvert, float, jit_uni); } #endif - #undef SUPPORTED_IMPL +#undef SUPPORTED_IMPL } void ColorConvert::createPrimitive() { - const NodeDesc *desc = getSelectedPrimitiveDescriptor(); + const NodeDesc* desc = getSelectedPrimitiveDescriptor(); if (!desc) OPENVINO_THROW(getTypeStr() + " node with name '" + getName() + "' ", "no optimal primitive descriptor selected"); if (!_impl) { - const auto & cfg = desc->getConfig(); + const auto& cfg = desc->getConfig(); const auto precision = cfg.inConfs[0].getMemDesc()->getPrecision(); const bool isSinglePlane = cfg.inConfs.size() == 1; - _impl = std::unique_ptr(_supportedImpls - .at(desc->getImplementationType()) - .at(algorithm) - .at(precision) - .at(isSinglePlane)(this)); + _impl = std::unique_ptr( + _supportedImpls.at(desc->getImplementationType()).at(algorithm).at(precision).at(isSinglePlane)(this)); } } @@ -1139,6 +1096,6 @@ void ColorConvert::executeDynamicImpl(dnnl::stream strm) { execute(strm); } -} // namespace node -} // namespace intel_cpu -} // namespace ov +} // namespace node +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/color_convert.h b/src/plugins/intel_cpu/src/nodes/color_convert.h index 19df1209dd4bab..9bd27c7cf9dffa 100644 --- a/src/plugins/intel_cpu/src/nodes/color_convert.h +++ b/src/plugins/intel_cpu/src/nodes/color_convert.h @@ -5,10 +5,11 @@ #pragma once #include -#include + +#include #include #include -#include +#include namespace ov { namespace intel_cpu { @@ -35,11 +36,11 @@ class 
ColorConvert : public Node { void initSupportedI420Impls(); private: - using ConverterBuilder = std::function; - using SupportedImpls = multidim_map; + using SupportedImpls = multidim_map; std::unique_ptr _impl; @@ -48,10 +49,11 @@ class ColorConvert : public Node { class ColorConvert::Converter { public: - using PrimitiveDescs = std::vector, // Input port configurator - std::vector, // Output port configurator - impl_desc_type, // Implementation type - bool>>; // // true - SinglePlaneConvert, false - TwoPlaneConvert/ThreePlaneConvert + using PrimitiveDescs = + std::vector, // Input port configurator + std::vector, // Output port configurator + impl_desc_type, // Implementation type + bool>>; // // true - SinglePlaneConvert, false - TwoPlaneConvert/ThreePlaneConvert using Shapes = std::vector; static constexpr size_t N_DIM = 0; @@ -61,20 +63,20 @@ class ColorConvert::Converter { using ColorFormat = std::array; - Converter(Node *node, const ColorFormat & colorFormat); + Converter(Node* node, const ColorFormat& colorFormat); virtual ~Converter() = default; ov::element::Type inputPrecision(size_t idx) const; ov::element::Type outputPrecision(size_t idx) const; - const void * input(size_t idx) const; - void * output(size_t idx) const; - const VectorDims & inputDims(size_t idx) const; + const void* input(size_t idx) const; + void* output(size_t idx) const; + const VectorDims& inputDims(size_t idx) const; virtual void execute(dnnl::stream strm) = 0; protected: - Node *_node; - ColorFormat _colorFormat; // RGB: {0,1,2}, BGR: {2,1,0} + Node* _node; + ColorFormat _colorFormat; // RGB: {0,1,2}, BGR: {2,1,0} }; -} // namespace node -} // namespace intel_cpu -} // namespace ov +} // namespace node +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/common/arbitrary_order_desc_creator.cpp b/src/plugins/intel_cpu/src/nodes/common/arbitrary_order_desc_creator.cpp index a7d3adc50d62e3..5887900ce8fa9e 100644 --- a/src/plugins/intel_cpu/src/nodes/common/arbitrary_order_desc_creator.cpp +++ b/src/plugins/intel_cpu/src/nodes/common/arbitrary_order_desc_creator.cpp @@ -3,26 +3,26 @@ // #include "arbitrary_order_desc_creator.h" + #include "utils/general_utils.h" namespace ov { namespace intel_cpu { -ArbitraryOrderDescCreator::ArbitraryOrderDescCreator(VectorDims order) : - m_order(std::move(order)) { +ArbitraryOrderDescCreator::ArbitraryOrderDescCreator(VectorDims order) : m_order(std::move(order)) { OPENVINO_ASSERT(std::adjacent_find(m_order.begin(), m_order.end()) == m_order.end(), - "Can't construct ArbitraryOrderDescCreator, order vector contains repetitive elements", - vec2str(m_order)); + "Can't construct ArbitraryOrderDescCreator, order vector contains repetitive elements", + vec2str(m_order)); } -CpuBlockedMemoryDesc -ArbitraryOrderDescCreator::createDesc(const ov::element::Type& precision, const Shape& srcShape) const { +CpuBlockedMemoryDesc ArbitraryOrderDescCreator::createDesc(const ov::element::Type& precision, + const Shape& srcShape) const { auto&& dims = srcShape.getDims(); OPENVINO_ASSERT(dims.size() == m_order.size(), - "Couldn't create a tensor descriptor, shape and order size mismatch. Shape: ", - vec2str(dims), - " order: ", - vec2str(m_order)); + "Couldn't create a tensor descriptor, shape and order size mismatch. 
Shape: ", + vec2str(dims), + " order: ", + vec2str(m_order)); VectorDims blkDims(dims.size()); for (size_t i = 0; i < dims.size(); ++i) { @@ -36,5 +36,5 @@ size_t ArbitraryOrderDescCreator::getMinimalRank() const { return m_order.size(); } -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/common/arbitrary_order_desc_creator.h b/src/plugins/intel_cpu/src/nodes/common/arbitrary_order_desc_creator.h index aaf5a7d5560799..c7341169fd9187 100644 --- a/src/plugins/intel_cpu/src/nodes/common/arbitrary_order_desc_creator.h +++ b/src/plugins/intel_cpu/src/nodes/common/arbitrary_order_desc_creator.h @@ -20,5 +20,5 @@ class ArbitraryOrderDescCreator : public BlockedDescCreator { VectorDims m_order; }; -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/common/blocked_desc_creator.cpp b/src/plugins/intel_cpu/src/nodes/common/blocked_desc_creator.cpp index 88c351ecafbdc1..a7398cac1e9940 100644 --- a/src/plugins/intel_cpu/src/nodes/common/blocked_desc_creator.cpp +++ b/src/plugins/intel_cpu/src/nodes/common/blocked_desc_creator.cpp @@ -3,9 +3,8 @@ // #include "blocked_desc_creator.h" -#include - +#include namespace ov { namespace intel_cpu { @@ -15,17 +14,19 @@ constexpr size_t channelsPos = 1lu; class PlainFormatCreator : public BlockedDescCreator { public: - CpuBlockedMemoryDesc createDesc(const ov::element::Type &precision, const Shape& srcShape) const override { + CpuBlockedMemoryDesc createDesc(const ov::element::Type& precision, const Shape& srcShape) const override { VectorDims order(srcShape.getRank()); std::iota(order.begin(), order.end(), 0); return CpuBlockedMemoryDesc(precision, srcShape, srcShape.getDims(), order); } - size_t getMinimalRank() const override { return 0lu; } + size_t getMinimalRank() const override { + return 0lu; + } }; class PerChannelCreator : public BlockedDescCreator { public: - CpuBlockedMemoryDesc createDesc(const ov::element::Type &precision, const Shape& srcShape) const override { + CpuBlockedMemoryDesc createDesc(const ov::element::Type& precision, const Shape& srcShape) const override { VectorDims order(srcShape.getRank()); std::iota(order.begin(), order.end(), 0); VectorDims blkDims = srcShape.getDims(); @@ -41,7 +42,9 @@ class PerChannelCreator : public BlockedDescCreator { return CpuBlockedMemoryDesc(precision, srcShape, blkDims, order); } - size_t getMinimalRank() const override { return 3lu; } + size_t getMinimalRank() const override { + return 3lu; + } }; class ChannelBlockedCreator : public BlockedDescCreator { @@ -64,24 +67,27 @@ class ChannelBlockedCreator : public BlockedDescCreator { return CpuBlockedMemoryDesc(precision, srcShape, blkDims, order); } - size_t getMinimalRank() const override { return 3lu; } + size_t getMinimalRank() const override { + return 3lu; + } private: size_t _blockSize; }; -} // namespace +} // namespace const BlockedDescCreator::CreatorsMap& BlockedDescCreator::getCommonCreators() { - static const CreatorsMap map{ { LayoutType::nspc, CreatorConstPtr(new PerChannelCreator) }, - { LayoutType::nCsp8c, CreatorConstPtr(new ChannelBlockedCreator(8)) }, - { LayoutType::nCsp16c, CreatorConstPtr(new ChannelBlockedCreator(16)) }, - { LayoutType::ncsp, CreatorConstPtr(new PlainFormatCreator) } }; + static const CreatorsMap map{{LayoutType::nspc, CreatorConstPtr(new PerChannelCreator)}, + {LayoutType::nCsp8c, CreatorConstPtr(new ChannelBlockedCreator(8))}, + 
{LayoutType::nCsp16c, CreatorConstPtr(new ChannelBlockedCreator(16))}, + {LayoutType::ncsp, CreatorConstPtr(new PlainFormatCreator)}}; return map; } -std::pair -BlockedDescCreator::makeFilteredRange(const CreatorsMap &map, unsigned int rank) { +std::pair BlockedDescCreator::makeFilteredRange( + const CreatorsMap& map, + unsigned int rank) { auto rankFilter = [rank](const CreatorsMap::value_type& item) { if (item.second->getMinimalRank() > rank) { return false; @@ -94,8 +100,10 @@ BlockedDescCreator::makeFilteredRange(const CreatorsMap &map, unsigned int rank) return std::make_pair(first, last); } -std::pair -BlockedDescCreator::makeFilteredRange(const CreatorsMap& map, unsigned rank, const std::vector& supportedTypes) { +std::pair BlockedDescCreator::makeFilteredRange( + const CreatorsMap& map, + unsigned rank, + const std::vector& supportedTypes) { unsigned bitMask = 0ul; for (auto& item : supportedTypes) { bitMask |= 1 << static_cast(item); @@ -116,12 +124,13 @@ BlockedDescCreator::makeFilteredRange(const CreatorsMap& map, unsigned rank, con return std::make_pair(first, last); } -std::pair -BlockedDescCreator::makeFilteredRange(const CreatorsMap &map, BlockedDescCreator::Predicate predicate) { +std::pair BlockedDescCreator::makeFilteredRange( + const CreatorsMap& map, + BlockedDescCreator::Predicate predicate) { auto first = CreatorsMapFilterConstIterator(std::move(predicate), map.begin(), map.end()); auto last = first.end(); return std::make_pair(first, last); } -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/common/blocked_desc_creator.h b/src/plugins/intel_cpu/src/nodes/common/blocked_desc_creator.h index 1fd7a02dff984b..9f8b15b430c727 100644 --- a/src/plugins/intel_cpu/src/nodes/common/blocked_desc_creator.h +++ b/src/plugins/intel_cpu/src/nodes/common/blocked_desc_creator.h @@ -5,6 +5,7 @@ #pragma once #include + #include "cpu_shape.h" #include "memory_desc/cpu_blocked_memory_desc.h" @@ -22,15 +23,18 @@ class BlockedDescCreator { public: static const CreatorsMap& getCommonCreators(); - static std::pair - makeFilteredRange(const CreatorsMap &map, unsigned rank); + static std::pair makeFilteredRange( + const CreatorsMap& map, + unsigned rank); static std::pair makeFilteredRange(const CreatorsMap& map, unsigned rank, const std::vector& supportedTypes); - static std::pair - makeFilteredRange(const CreatorsMap& map, Predicate predicate); + static std::pair makeFilteredRange( + const CreatorsMap& map, + Predicate predicate); virtual CpuBlockedMemoryDesc createDesc(const ov::element::Type& precision, const Shape& srcShape) const = 0; - std::shared_ptr createSharedDesc(const ov::element::Type& precision, const Shape& srcShape) const { + std::shared_ptr createSharedDesc(const ov::element::Type& precision, + const Shape& srcShape) const { return std::make_shared(createDesc(precision, srcShape)); } @@ -49,7 +53,10 @@ class CreatorsMapFilterConstIterator { typedef std::function predicate_type; public: - CreatorsMapFilterConstIterator(predicate_type filter, Iterator begin, Iterator end) : _iter(begin), _end(end), _filter(std::move(filter)) { + CreatorsMapFilterConstIterator(predicate_type filter, Iterator begin, Iterator end) + : _iter(begin), + _end(end), + _filter(std::move(filter)) { while (_iter != _end && !_filter(*_iter)) { ++_iter; } @@ -93,5 +100,5 @@ class CreatorsMapFilterConstIterator { predicate_type _filter; }; -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace 
ov diff --git a/src/plugins/intel_cpu/src/nodes/common/cpu_convert.cpp b/src/plugins/intel_cpu/src/nodes/common/cpu_convert.cpp index ad0738e9d57558..a0590827006eb4 100644 --- a/src/plugins/intel_cpu/src/nodes/common/cpu_convert.cpp +++ b/src/plugins/intel_cpu/src/nodes/common/cpu_convert.cpp @@ -5,16 +5,16 @@ #include "cpu_convert.h" #include "cpu_memcpy.h" -#include "utils/bfloat16.hpp" #include "openvino/core/type/nf4.hpp" +#include "utils/bfloat16.hpp" #if defined(OPENVINO_ARCH_X86_64) -#include "nodes/kernels/x64/jit_kernel.hpp" +# include "nodes/kernels/x64/jit_kernel.hpp" #else -#include "cpu_memory.h" -#include "openvino/core/type/element_type_traits.hpp" -#include "selective_build.h" -#include "utils/general_utils.h" +# include "cpu_memory.h" +# include "openvino/core/type/element_type_traits.hpp" +# include "selective_build.h" +# include "utils/general_utils.h" #endif namespace ov { @@ -28,16 +28,12 @@ using namespace dnnl::impl::cpu::x64; using namespace Xbyak; template -void convert_vec(jit_generator & gen, - const RegExp & src, - const RegExp & dst); +void convert_vec(jit_generator& gen, const RegExp& src, const RegExp& dst); template <> -void convert_vec(jit_generator & gen, - const RegExp & src, - const RegExp & dst) { - auto const & f16vec = gen.xmm3; - auto const & f32vec = gen.ymm4; +void convert_vec(jit_generator& gen, const RegExp& src, const RegExp& dst) { + auto const& f16vec = gen.xmm3; + auto const& f32vec = gen.ymm4; gen.movdqu(f16vec, gen.xword[src]); gen.vcvtph2ps(f32vec, f16vec); @@ -45,11 +41,9 @@ void convert_vec(jit_generator & gen, } template <> -void convert_vec(jit_generator & gen, - const RegExp & src, - const RegExp & dst) { - auto const & f16vec = gen.xmm3; - auto const & f32vec = gen.ymm4; +void convert_vec(jit_generator& gen, const RegExp& src, const RegExp& dst) { + auto const& f16vec = gen.xmm3; + auto const& f32vec = gen.ymm4; gen.vmovups(f32vec, gen.yword[src]); gen.vcvtps2ph(f16vec, f32vec, 0); @@ -72,18 +66,18 @@ class jit_convert_array : public jit_kernel { size >>= vlen_log2; - foreach(0, size, [&, this](const Xbyak::Reg64& idx) { + foreach (0, size, [&, this](const Xbyak::Reg64& idx) { _convert_vec(*this, src, dst); src += _src_size * vlen; dst += _dst_size * vlen; - }); + }) + ; mov(size, argPtr(&args_t::count)); size &= vlen - 1; // Tail conversion - _if(size != 0) - ._then([&] { + _if(size != 0)._then([&] { auto tmp = stack(vlen * sizeof(float)); tmp.clear(); @@ -112,24 +106,19 @@ class jit_convert_array : public jit_kernel { typedef void (*fn_t)(const args_t*); - typedef void (*convert_vec_t)(jit_generator &, - const RegExp &, - const RegExp &); + typedef void (*convert_vec_t)(jit_generator&, const RegExp&, const RegExp&); - jit_convert_array(convert_vec_t convert_vec, - size_t src_size, - size_t dst_size) - : jit_kernel(jit_name()) - , _convert_vec(convert_vec) - , _src_size(src_size) - , _dst_size(dst_size) {} + jit_convert_array(convert_vec_t convert_vec, size_t src_size, size_t dst_size) + : jit_kernel(jit_name()), + _convert_vec(convert_vec), + _src_size(src_size), + _dst_size(dst_size) {} - template + template static fn_t get() { - if (mayiuse(cpu_isa_t::avx2) - && dnnl::impl::cpu::x64::cpu().has(Xbyak::util::Cpu::tF16C)) { + if (mayiuse(cpu_isa_t::avx2) && dnnl::impl::cpu::x64::cpu().has(Xbyak::util::Cpu::tF16C)) { static jit_convert_array converter(convert_vec, sizeof(src_t), sizeof(dst_t)); - auto & generator = static_cast(converter); + auto& generator = static_cast(converter); generator.create_kernel(); return 
(fn_t)generator.jit_ker(); } @@ -148,7 +137,7 @@ void jit_convert(const TI* arg, TO* out, size_t count) { static auto converter = jit_impl::get(); if (converter) { - typename jit_impl::args_t args = { arg, out, count }; + typename jit_impl::args_t args = {arg, out, count}; converter(&args); } else { for (size_t i = 0; i < count; ++i) { @@ -179,44 +168,41 @@ struct PrecisionInfo { using value_type = uint8_t; }; -template::value - || std::is_same::value, - float, T>::type> +template ::value || + std::is_same::value, + float, + T>::type> struct Range { - const std::tuple & fit(const ov::element::Type & prec); + const std::tuple& fit(const ov::element::Type& prec); private: - std::tuple _range { - std::numeric_limits::lowest(), - std::numeric_limits::max() - }; + std::tuple _range{std::numeric_limits::lowest(), std::numeric_limits::max()}; }; -template -const std::tuple & Range::fit(const ov::element::Type & prec) { +template +const std::tuple& Range::fit(const ov::element::Type& prec) { if (prec.is_real()) { double lbound, ubound; switch (prec) { - case ov::element::bf16: - lbound = static_cast(std::numeric_limits::lowest()); - ubound = static_cast(std::numeric_limits::max()); - break; - case ov::element::f16: - lbound = static_cast(std::numeric_limits::lowest()); - ubound = static_cast(std::numeric_limits::max()); - break; - case ov::element::f32: - lbound = static_cast(std::numeric_limits::lowest()); - ubound = static_cast(std::numeric_limits::max()); - break; - case ov::element::f64: - lbound = std::numeric_limits::lowest(); - ubound = std::numeric_limits::max(); - break; - default: - OPENVINO_THROW("Unsupported precision"); + case ov::element::bf16: + lbound = static_cast(std::numeric_limits::lowest()); + ubound = static_cast(std::numeric_limits::max()); + break; + case ov::element::f16: + lbound = static_cast(std::numeric_limits::lowest()); + ubound = static_cast(std::numeric_limits::max()); + break; + case ov::element::f32: + lbound = static_cast(std::numeric_limits::lowest()); + ubound = static_cast(std::numeric_limits::max()); + break; + case ov::element::f64: + lbound = std::numeric_limits::lowest(); + ubound = std::numeric_limits::max(); + break; + default: + OPENVINO_THROW("Unsupported precision"); } // If U is integral, its range always less than float, so not need update _range // Else it will be overflow, for example static_cast double to int64_t: @@ -224,73 +210,71 @@ const std::tuple & Range::fit(const ov::element::Type & prec) { // double dd_ubound = static_cast(ubbound) // static_cast(dd_ubound) will return -9223372036854775808 if (!std::is_integral::value) { - std::get<0>(_range) = static_cast(std::max(static_cast(std::get<0>(_range)), lbound)); - std::get<1>(_range) = static_cast(std::min(static_cast(std::get<1>(_range)), ubound)); + std::get<0>(_range) = static_cast(std::max(static_cast(std::get<0>(_range)), lbound)); + std::get<1>(_range) = static_cast(std::min(static_cast(std::get<1>(_range)), ubound)); } } else { int64_t lbound; uint64_t ubound; switch (prec) { - case ov::element::boolean: - lbound = static_cast(std::numeric_limits::lowest()); - ubound = static_cast(std::numeric_limits::max()); - break; - case ov::element::u8: - lbound = static_cast(std::numeric_limits::lowest()); - ubound = static_cast(std::numeric_limits::max()); - break; - case ov::element::i8: - lbound = static_cast(std::numeric_limits::lowest()); - ubound = static_cast(std::numeric_limits::max()); - break; - case ov::element::u16: - lbound = static_cast(std::numeric_limits::lowest()); - ubound = 
static_cast(std::numeric_limits::max()); - break; - case ov::element::i16: - lbound = static_cast(std::numeric_limits::lowest()); - ubound = static_cast(std::numeric_limits::max()); - break; - case ov::element::u32: - lbound = static_cast(std::numeric_limits::lowest()); - ubound = static_cast(std::numeric_limits::max()); - break; - case ov::element::i32: - lbound = static_cast(std::numeric_limits::lowest()); - ubound = static_cast(std::numeric_limits::max()); - break; - case ov::element::u64: - lbound = static_cast(std::numeric_limits::lowest()); - ubound = static_cast(std::numeric_limits::max()); - break; - case ov::element::i64: - lbound = static_cast(std::numeric_limits::lowest()); - ubound = static_cast(std::numeric_limits::max()); - break; - default: - OPENVINO_THROW("Unsupported precision"); + case ov::element::boolean: + lbound = static_cast(std::numeric_limits::lowest()); + ubound = static_cast(std::numeric_limits::max()); + break; + case ov::element::u8: + lbound = static_cast(std::numeric_limits::lowest()); + ubound = static_cast(std::numeric_limits::max()); + break; + case ov::element::i8: + lbound = static_cast(std::numeric_limits::lowest()); + ubound = static_cast(std::numeric_limits::max()); + break; + case ov::element::u16: + lbound = static_cast(std::numeric_limits::lowest()); + ubound = static_cast(std::numeric_limits::max()); + break; + case ov::element::i16: + lbound = static_cast(std::numeric_limits::lowest()); + ubound = static_cast(std::numeric_limits::max()); + break; + case ov::element::u32: + lbound = static_cast(std::numeric_limits::lowest()); + ubound = static_cast(std::numeric_limits::max()); + break; + case ov::element::i32: + lbound = static_cast(std::numeric_limits::lowest()); + ubound = static_cast(std::numeric_limits::max()); + break; + case ov::element::u64: + lbound = static_cast(std::numeric_limits::lowest()); + ubound = static_cast(std::numeric_limits::max()); + break; + case ov::element::i64: + lbound = static_cast(std::numeric_limits::lowest()); + ubound = static_cast(std::numeric_limits::max()); + break; + default: + OPENVINO_THROW("Unsupported precision"); } - using ltype = typename std::conditional< - std::is_floating_point::value, - double, int64_t>::type; - using utype = typename std::conditional< - std::is_floating_point::value, - double, uint64_t>::type; - std::get<0>(_range) = static_cast(std::max(static_cast(std::get<0>(_range)), static_cast(lbound))); - std::get<1>(_range) = static_cast(std::min(static_cast(std::get<1>(_range)), static_cast(ubound))); + using ltype = typename std::conditional::value, double, int64_t>::type; + using utype = typename std::conditional::value, double, uint64_t>::type; + std::get<0>(_range) = + static_cast(std::max(static_cast(std::get<0>(_range)), static_cast(lbound))); + std::get<1>(_range) = + static_cast(std::min(static_cast(std::get<1>(_range)), static_cast(ubound))); } return _range; } struct ConvertContext { - const void *srcPtr; - void *dstPtr; + const void* srcPtr; + void* dstPtr; size_t size; ov::element::Type interimPrc; ov::element::Type dstPrc; bool converted; - template + template std::tuple range() const { Range r; r.fit(interimPrc); @@ -298,20 +282,18 @@ struct ConvertContext { } }; -template +template struct ConvertPrecision; -template +template struct ConvertPrecision> { - void operator()(ConvertContext & ctx) { - auto src = static_cast(ctx.srcPtr); - auto dst = static_cast(ctx.dstPtr); + void operator()(ConvertContext& ctx) { + auto src = static_cast(ctx.srcPtr); + auto dst = 
static_cast(ctx.dstPtr); src_t lbound, ubound; std::tie(lbound, ubound) = ctx.range(); - if (std::is_integral::value - || ctx.interimPrc.is_real() - || std::is_integral::value) { + if (std::is_integral::value || ctx.interimPrc.is_real() || std::is_integral::value) { parallel_for(ctx.size, [&](size_t i) { dst[i] = static_cast(std::max(std::min(src[i], ubound), lbound)); }); @@ -325,11 +307,11 @@ struct ConvertPrecision> { } }; -template<> +template <> struct ConvertPrecision> { - void operator()(ConvertContext & ctx) { - auto src = static_cast(ctx.srcPtr); - auto dst = static_cast(ctx.dstPtr); + void operator()(ConvertContext& ctx) { + auto src = static_cast(ctx.srcPtr); + auto dst = static_cast(ctx.dstPtr); if (ctx.interimPrc.is_real()) { parallel_for(ctx.size, [&](size_t i) { @@ -347,11 +329,11 @@ struct ConvertPrecision> { } }; -template<> +template <> struct ConvertPrecision> { - void operator()(ConvertContext & ctx) { - auto src = static_cast(ctx.srcPtr); - auto dst = static_cast(ctx.dstPtr); + void operator()(ConvertContext& ctx) { + auto src = static_cast(ctx.srcPtr); + auto dst = static_cast(ctx.dstPtr); if (ctx.interimPrc.is_real()) { parallel_for(ctx.size, [&](size_t i) { @@ -370,11 +352,11 @@ struct ConvertPrecision> { }; #if defined(OPENVINO_ARCH_X86_64) -template +template struct ConvertPrecision> { - void operator()(ConvertContext & ctx) { - auto src = static_cast(ctx.srcPtr); - auto dst = static_cast(ctx.dstPtr); + void operator()(ConvertContext& ctx) { + auto src = static_cast(ctx.srcPtr); + auto dst = static_cast(ctx.dstPtr); constexpr size_t batch = 64; const size_t iterations = ov::intel_cpu::div_up(ctx.size, batch); @@ -388,16 +370,16 @@ struct ConvertPrecision> { batch_type tmp; const size_t offset = i * batch; const size_t current_batch_size = std::min(ctx.size - offset, batch); - for (size_t j = 0; j < current_batch_size; ++j) // src_t -> fp32 + for (size_t j = 0; j < current_batch_size; ++j) // src_t -> fp32 tmp[j] = static_cast(std::max(std::min(src[offset + j], ubound), lbound)); - jit_convert(tmp, dst + offset, current_batch_size); // fp32 -> fp16 + jit_convert(tmp, dst + offset, current_batch_size); // fp32 -> fp16 }); } else if (ctx.interimPrc.is_real()) { parallel_for(iterations, [&](size_t i) { const size_t offset = i * batch; const size_t current_batch_size = std::min(ctx.size - offset, batch); if (std::is_same::type, float>::value) { // fp32 -> fp16 - jit_convert(reinterpret_cast(src) + offset, dst + offset, current_batch_size); + jit_convert(reinterpret_cast(src) + offset, dst + offset, current_batch_size); } else { batch_type tmp; for (size_t j = 0; j < current_batch_size; ++j) // src_t -> fp32 @@ -410,9 +392,9 @@ struct ConvertPrecision> { batch_type tmp; const size_t offset = i * batch; const size_t current_batch_size = std::min(ctx.size - offset, batch); - for (size_t j = 0; j < current_batch_size; ++j) // src_t -> fp32 + for (size_t j = 0; j < current_batch_size; ++j) // src_t -> fp32 tmp[j] = static_cast(std::trunc(std::max(std::min(src[offset + j], ubound), lbound))); - jit_convert(tmp, dst + offset, current_batch_size); // fp32 -> fp16 + jit_convert(tmp, dst + offset, current_batch_size); // fp32 -> fp16 }); } @@ -420,11 +402,11 @@ struct ConvertPrecision> { } }; -template +template struct ConvertPrecision> { - void operator()(ConvertContext & ctx) { - auto src = static_cast(ctx.srcPtr); - auto dst = static_cast(ctx.dstPtr); + void operator()(ConvertContext& ctx) { + auto src = static_cast(ctx.srcPtr); + auto dst = static_cast(ctx.dstPtr); constexpr 
size_t batch = 64; const size_t iterations = ov::intel_cpu::div_up(ctx.size, batch); @@ -438,8 +420,8 @@ struct ConvertPrecision> { batch_type tmp; const size_t offset = i * batch; const size_t current_batch_size = std::min(ctx.size - offset, batch); - jit_convert(src + offset, tmp, current_batch_size); // fp16 -> fp32 - for (size_t j = 0; j < current_batch_size; ++j) // fp32 -> dst_t + jit_convert(src + offset, tmp, current_batch_size); // fp16 -> fp32 + for (size_t j = 0; j < current_batch_size; ++j) // fp32 -> dst_t dst[offset + j] = static_cast(std::max(std::min(tmp[j], ubound), lbound)); }); } else if (ctx.interimPrc.is_real()) { @@ -447,7 +429,7 @@ struct ConvertPrecision> { const size_t offset = i * batch; const size_t current_batch_size = std::min(ctx.size - offset, batch); if (std::is_same::type, float>::value) { // fp16 -> fp32 - jit_convert(src + offset, reinterpret_cast(dst) + offset, current_batch_size); + jit_convert(src + offset, reinterpret_cast(dst) + offset, current_batch_size); } else { batch_type tmp; jit_convert(src + offset, tmp, current_batch_size); // fp16 -> fp32 @@ -460,8 +442,8 @@ struct ConvertPrecision> { batch_type tmp; const size_t offset = i * batch; const size_t current_batch_size = std::min(ctx.size - offset, batch); - jit_convert(src + offset, tmp, current_batch_size); // fp16 -> fp32 - for (size_t j = 0; j < current_batch_size; ++j) // fp32 -> dst_t + jit_convert(src + offset, tmp, current_batch_size); // fp16 -> fp32 + for (size_t j = 0; j < current_batch_size; ++j) // fp32 -> dst_t dst[offset + j] = static_cast(std::trunc(std::max(std::min(tmp[j], ubound), lbound))); }); } @@ -470,11 +452,11 @@ struct ConvertPrecision> { } }; -template<> +template <> struct ConvertPrecision> { - void operator()(ConvertContext & ctx) { - auto src = static_cast(ctx.srcPtr); - auto dst = static_cast(ctx.dstPtr); + void operator()(ConvertContext& ctx) { + auto src = static_cast(ctx.srcPtr); + auto dst = static_cast(ctx.dstPtr); constexpr size_t batch = 64; const size_t iterations = ov::intel_cpu::div_up(ctx.size, batch); @@ -490,10 +472,10 @@ struct ConvertPrecision> { batch_type tmp; const size_t offset = i * batch; const size_t current_batch_size = std::min(ctx.size - offset, batch); - jit_convert(src + offset, tmp, current_batch_size); // fp16 -> fp32 - for (size_t j = 0; j < current_batch_size; ++j) // truncate fp32 + jit_convert(src + offset, tmp, current_batch_size); // fp16 -> fp32 + for (size_t j = 0; j < current_batch_size; ++j) // truncate fp32 tmp[j] = std::trunc(std::max(std::min(tmp[j], ubound), lbound)); - jit_convert(tmp, dst + offset, current_batch_size); // fp32 -> fp16 + jit_convert(tmp, dst + offset, current_batch_size); // fp32 -> fp16 }); } @@ -502,7 +484,7 @@ struct ConvertPrecision> { }; #endif -} // namespace +} // namespace #define INTEL_CPU_CVT(ST, DT) \ OV_CASE2(ov::element::ST, \ @@ -510,74 +492,72 @@ struct ConvertPrecision> { PrecisionInfo::value_type, \ PrecisionInfo::value_type) -#define INTEL_CPU_CVT_LIST \ - INTEL_CPU_CVT(u8, i8), INTEL_CPU_CVT(u8, u16), INTEL_CPU_CVT(u8, i16), INTEL_CPU_CVT(u8, u32), \ - INTEL_CPU_CVT(u8, i32), INTEL_CPU_CVT(u8, u64), INTEL_CPU_CVT(u8, i64), INTEL_CPU_CVT(u8, f32), \ - INTEL_CPU_CVT(u8, f16), INTEL_CPU_CVT(u8, bf16), INTEL_CPU_CVT(u8, f64), INTEL_CPU_CVT(u8, boolean), \ - INTEL_CPU_CVT(i8, u8), INTEL_CPU_CVT(i8, u16), INTEL_CPU_CVT(i8, i16), INTEL_CPU_CVT(i8, u32), \ - INTEL_CPU_CVT(i8, i32), INTEL_CPU_CVT(i8, u64), INTEL_CPU_CVT(i8, i64), INTEL_CPU_CVT(i8, f32), \ - INTEL_CPU_CVT(i8, f16), 
INTEL_CPU_CVT(i8, bf16), INTEL_CPU_CVT(i8, f64), INTEL_CPU_CVT(i8, boolean), \ - INTEL_CPU_CVT(u16, u8), INTEL_CPU_CVT(u16, i8), INTEL_CPU_CVT(u16, i16), INTEL_CPU_CVT(u16, u32), \ - INTEL_CPU_CVT(u16, i32), INTEL_CPU_CVT(u16, u64), INTEL_CPU_CVT(u16, i64), INTEL_CPU_CVT(u16, f32), \ - INTEL_CPU_CVT(u16, f16), INTEL_CPU_CVT(u16, bf16), INTEL_CPU_CVT(u16, f64), INTEL_CPU_CVT(u16, boolean), \ - INTEL_CPU_CVT(i16, u8), INTEL_CPU_CVT(i16, i8), INTEL_CPU_CVT(i16, u16), INTEL_CPU_CVT(i16, u32), \ - INTEL_CPU_CVT(i16, i32), INTEL_CPU_CVT(i16, u64), INTEL_CPU_CVT(i16, i64), INTEL_CPU_CVT(i16, f32), \ - INTEL_CPU_CVT(i16, f16), INTEL_CPU_CVT(i16, bf16), INTEL_CPU_CVT(i16, f64), INTEL_CPU_CVT(i16, boolean), \ - INTEL_CPU_CVT(u32, u8), INTEL_CPU_CVT(u32, i8), INTEL_CPU_CVT(u32, u16), INTEL_CPU_CVT(u32, i16), \ - INTEL_CPU_CVT(u32, i32), INTEL_CPU_CVT(u32, u64), INTEL_CPU_CVT(u32, i64), INTEL_CPU_CVT(u32, f32), \ - INTEL_CPU_CVT(u32, f16), INTEL_CPU_CVT(u32, bf16), INTEL_CPU_CVT(u32, f64), INTEL_CPU_CVT(u32, boolean), \ - INTEL_CPU_CVT(i32, u8), INTEL_CPU_CVT(i32, i8), INTEL_CPU_CVT(i32, u16), INTEL_CPU_CVT(i32, i16), \ - INTEL_CPU_CVT(i32, u32), INTEL_CPU_CVT(i32, u64), INTEL_CPU_CVT(i32, i64), INTEL_CPU_CVT(i32, f32), \ - INTEL_CPU_CVT(i32, f16), INTEL_CPU_CVT(i32, bf16), INTEL_CPU_CVT(i32, f64), INTEL_CPU_CVT(i32, boolean), \ - INTEL_CPU_CVT(u64, u8), INTEL_CPU_CVT(u64, i8), INTEL_CPU_CVT(u64, u16), INTEL_CPU_CVT(u64, i16), \ - INTEL_CPU_CVT(u64, u32), INTEL_CPU_CVT(u64, i32), INTEL_CPU_CVT(u64, i64), INTEL_CPU_CVT(u64, f32), \ - INTEL_CPU_CVT(u64, f16), INTEL_CPU_CVT(u64, bf16), INTEL_CPU_CVT(u64, f64), INTEL_CPU_CVT(u64, boolean), \ - INTEL_CPU_CVT(i64, u8), INTEL_CPU_CVT(i64, i8), INTEL_CPU_CVT(i64, u16), INTEL_CPU_CVT(i64, i16), \ - INTEL_CPU_CVT(i64, u32), INTEL_CPU_CVT(i64, i32), INTEL_CPU_CVT(i64, u64), INTEL_CPU_CVT(i64, f32), \ - INTEL_CPU_CVT(i64, f16), INTEL_CPU_CVT(i64, bf16), INTEL_CPU_CVT(i64, f64), INTEL_CPU_CVT(i64, boolean), \ - INTEL_CPU_CVT(f32, u8), INTEL_CPU_CVT(f32, i8), INTEL_CPU_CVT(f32, u16), INTEL_CPU_CVT(f32, i16), \ - INTEL_CPU_CVT(f32, u32), INTEL_CPU_CVT(f32, i32), INTEL_CPU_CVT(f32, u64), INTEL_CPU_CVT(f32, i64), \ - INTEL_CPU_CVT(f32, f16), INTEL_CPU_CVT(f32, bf16), INTEL_CPU_CVT(f32, f64), INTEL_CPU_CVT(f32, boolean), \ - INTEL_CPU_CVT(f16, u8), INTEL_CPU_CVT(f16, i8), INTEL_CPU_CVT(f16, u16), INTEL_CPU_CVT(f16, i16), \ - INTEL_CPU_CVT(f16, u32), INTEL_CPU_CVT(f16, i32), INTEL_CPU_CVT(f16, u64), INTEL_CPU_CVT(f16, i64), \ - INTEL_CPU_CVT(f16, f32), INTEL_CPU_CVT(f16, bf16), INTEL_CPU_CVT(f16, f64), INTEL_CPU_CVT(f16, boolean), \ - INTEL_CPU_CVT(bf16, u8), INTEL_CPU_CVT(bf16, i8), INTEL_CPU_CVT(bf16, u16), INTEL_CPU_CVT(bf16, i16), \ - INTEL_CPU_CVT(bf16, u32), INTEL_CPU_CVT(bf16, i32), INTEL_CPU_CVT(bf16, u64), INTEL_CPU_CVT(bf16, i64), \ - INTEL_CPU_CVT(bf16, f32), INTEL_CPU_CVT(bf16, f16), INTEL_CPU_CVT(bf16, f64), INTEL_CPU_CVT(bf16, boolean), \ - INTEL_CPU_CVT(f64, u8), INTEL_CPU_CVT(f64, i8), INTEL_CPU_CVT(f64, u16), INTEL_CPU_CVT(f64, i16), \ - INTEL_CPU_CVT(f64, u32), INTEL_CPU_CVT(f64, i32), INTEL_CPU_CVT(f64, u64), INTEL_CPU_CVT(f64, i64), \ - INTEL_CPU_CVT(f64, f32), INTEL_CPU_CVT(f64, f16), INTEL_CPU_CVT(f64, bf16), INTEL_CPU_CVT(f64, boolean), \ - INTEL_CPU_CVT(boolean, u8), INTEL_CPU_CVT(boolean, i8), INTEL_CPU_CVT(boolean, u16), \ - INTEL_CPU_CVT(boolean, i16), INTEL_CPU_CVT(boolean, u32), INTEL_CPU_CVT(boolean, i32), \ - INTEL_CPU_CVT(boolean, u64), INTEL_CPU_CVT(boolean, i64), INTEL_CPU_CVT(boolean, f32), \ - INTEL_CPU_CVT(boolean, f16), 
INTEL_CPU_CVT(boolean, bf16), INTEL_CPU_CVT(boolean, f64), INTEL_CPU_CVT(u8, u8), \ - INTEL_CPU_CVT(i8, i8), INTEL_CPU_CVT(u16, u16), INTEL_CPU_CVT(i16, i16), INTEL_CPU_CVT(u32, u32), \ - INTEL_CPU_CVT(i32, i32), INTEL_CPU_CVT(u64, u64), INTEL_CPU_CVT(i64, i64), INTEL_CPU_CVT(f32, f32), \ - INTEL_CPU_CVT(f16, f16), INTEL_CPU_CVT(bf16, bf16), INTEL_CPU_CVT(f64, f64), INTEL_CPU_CVT(boolean, boolean) - - -#define INTEL_CPU_CVT_FROM_BIN_LIST \ - INTEL_CPU_CVT(u1, f32), INTEL_CPU_CVT(u1, f16), INTEL_CPU_CVT(u1, bf16), \ - INTEL_CPU_CVT(u1, f64), INTEL_CPU_CVT(u1, i16), INTEL_CPU_CVT(u1, u8), \ - INTEL_CPU_CVT(u1, i8), INTEL_CPU_CVT(u1, u16), INTEL_CPU_CVT(u1, i32), \ - INTEL_CPU_CVT(u1, u32), INTEL_CPU_CVT(u1, i64), INTEL_CPU_CVT(u1, u64), \ - INTEL_CPU_CVT(u1, boolean) +#define INTEL_CPU_CVT_LIST \ + INTEL_CPU_CVT(u8, i8), INTEL_CPU_CVT(u8, u16), INTEL_CPU_CVT(u8, i16), INTEL_CPU_CVT(u8, u32), \ + INTEL_CPU_CVT(u8, i32), INTEL_CPU_CVT(u8, u64), INTEL_CPU_CVT(u8, i64), INTEL_CPU_CVT(u8, f32), \ + INTEL_CPU_CVT(u8, f16), INTEL_CPU_CVT(u8, bf16), INTEL_CPU_CVT(u8, f64), INTEL_CPU_CVT(u8, boolean), \ + INTEL_CPU_CVT(i8, u8), INTEL_CPU_CVT(i8, u16), INTEL_CPU_CVT(i8, i16), INTEL_CPU_CVT(i8, u32), \ + INTEL_CPU_CVT(i8, i32), INTEL_CPU_CVT(i8, u64), INTEL_CPU_CVT(i8, i64), INTEL_CPU_CVT(i8, f32), \ + INTEL_CPU_CVT(i8, f16), INTEL_CPU_CVT(i8, bf16), INTEL_CPU_CVT(i8, f64), INTEL_CPU_CVT(i8, boolean), \ + INTEL_CPU_CVT(u16, u8), INTEL_CPU_CVT(u16, i8), INTEL_CPU_CVT(u16, i16), INTEL_CPU_CVT(u16, u32), \ + INTEL_CPU_CVT(u16, i32), INTEL_CPU_CVT(u16, u64), INTEL_CPU_CVT(u16, i64), INTEL_CPU_CVT(u16, f32), \ + INTEL_CPU_CVT(u16, f16), INTEL_CPU_CVT(u16, bf16), INTEL_CPU_CVT(u16, f64), INTEL_CPU_CVT(u16, boolean), \ + INTEL_CPU_CVT(i16, u8), INTEL_CPU_CVT(i16, i8), INTEL_CPU_CVT(i16, u16), INTEL_CPU_CVT(i16, u32), \ + INTEL_CPU_CVT(i16, i32), INTEL_CPU_CVT(i16, u64), INTEL_CPU_CVT(i16, i64), INTEL_CPU_CVT(i16, f32), \ + INTEL_CPU_CVT(i16, f16), INTEL_CPU_CVT(i16, bf16), INTEL_CPU_CVT(i16, f64), INTEL_CPU_CVT(i16, boolean), \ + INTEL_CPU_CVT(u32, u8), INTEL_CPU_CVT(u32, i8), INTEL_CPU_CVT(u32, u16), INTEL_CPU_CVT(u32, i16), \ + INTEL_CPU_CVT(u32, i32), INTEL_CPU_CVT(u32, u64), INTEL_CPU_CVT(u32, i64), INTEL_CPU_CVT(u32, f32), \ + INTEL_CPU_CVT(u32, f16), INTEL_CPU_CVT(u32, bf16), INTEL_CPU_CVT(u32, f64), INTEL_CPU_CVT(u32, boolean), \ + INTEL_CPU_CVT(i32, u8), INTEL_CPU_CVT(i32, i8), INTEL_CPU_CVT(i32, u16), INTEL_CPU_CVT(i32, i16), \ + INTEL_CPU_CVT(i32, u32), INTEL_CPU_CVT(i32, u64), INTEL_CPU_CVT(i32, i64), INTEL_CPU_CVT(i32, f32), \ + INTEL_CPU_CVT(i32, f16), INTEL_CPU_CVT(i32, bf16), INTEL_CPU_CVT(i32, f64), INTEL_CPU_CVT(i32, boolean), \ + INTEL_CPU_CVT(u64, u8), INTEL_CPU_CVT(u64, i8), INTEL_CPU_CVT(u64, u16), INTEL_CPU_CVT(u64, i16), \ + INTEL_CPU_CVT(u64, u32), INTEL_CPU_CVT(u64, i32), INTEL_CPU_CVT(u64, i64), INTEL_CPU_CVT(u64, f32), \ + INTEL_CPU_CVT(u64, f16), INTEL_CPU_CVT(u64, bf16), INTEL_CPU_CVT(u64, f64), INTEL_CPU_CVT(u64, boolean), \ + INTEL_CPU_CVT(i64, u8), INTEL_CPU_CVT(i64, i8), INTEL_CPU_CVT(i64, u16), INTEL_CPU_CVT(i64, i16), \ + INTEL_CPU_CVT(i64, u32), INTEL_CPU_CVT(i64, i32), INTEL_CPU_CVT(i64, u64), INTEL_CPU_CVT(i64, f32), \ + INTEL_CPU_CVT(i64, f16), INTEL_CPU_CVT(i64, bf16), INTEL_CPU_CVT(i64, f64), INTEL_CPU_CVT(i64, boolean), \ + INTEL_CPU_CVT(f32, u8), INTEL_CPU_CVT(f32, i8), INTEL_CPU_CVT(f32, u16), INTEL_CPU_CVT(f32, i16), \ + INTEL_CPU_CVT(f32, u32), INTEL_CPU_CVT(f32, i32), INTEL_CPU_CVT(f32, u64), INTEL_CPU_CVT(f32, i64), \ + INTEL_CPU_CVT(f32, f16), 
INTEL_CPU_CVT(f32, bf16), INTEL_CPU_CVT(f32, f64), INTEL_CPU_CVT(f32, boolean), \ + INTEL_CPU_CVT(f16, u8), INTEL_CPU_CVT(f16, i8), INTEL_CPU_CVT(f16, u16), INTEL_CPU_CVT(f16, i16), \ + INTEL_CPU_CVT(f16, u32), INTEL_CPU_CVT(f16, i32), INTEL_CPU_CVT(f16, u64), INTEL_CPU_CVT(f16, i64), \ + INTEL_CPU_CVT(f16, f32), INTEL_CPU_CVT(f16, bf16), INTEL_CPU_CVT(f16, f64), INTEL_CPU_CVT(f16, boolean), \ + INTEL_CPU_CVT(bf16, u8), INTEL_CPU_CVT(bf16, i8), INTEL_CPU_CVT(bf16, u16), INTEL_CPU_CVT(bf16, i16), \ + INTEL_CPU_CVT(bf16, u32), INTEL_CPU_CVT(bf16, i32), INTEL_CPU_CVT(bf16, u64), INTEL_CPU_CVT(bf16, i64), \ + INTEL_CPU_CVT(bf16, f32), INTEL_CPU_CVT(bf16, f16), INTEL_CPU_CVT(bf16, f64), INTEL_CPU_CVT(bf16, boolean), \ + INTEL_CPU_CVT(f64, u8), INTEL_CPU_CVT(f64, i8), INTEL_CPU_CVT(f64, u16), INTEL_CPU_CVT(f64, i16), \ + INTEL_CPU_CVT(f64, u32), INTEL_CPU_CVT(f64, i32), INTEL_CPU_CVT(f64, u64), INTEL_CPU_CVT(f64, i64), \ + INTEL_CPU_CVT(f64, f32), INTEL_CPU_CVT(f64, f16), INTEL_CPU_CVT(f64, bf16), INTEL_CPU_CVT(f64, boolean), \ + INTEL_CPU_CVT(boolean, u8), INTEL_CPU_CVT(boolean, i8), INTEL_CPU_CVT(boolean, u16), \ + INTEL_CPU_CVT(boolean, i16), INTEL_CPU_CVT(boolean, u32), INTEL_CPU_CVT(boolean, i32), \ + INTEL_CPU_CVT(boolean, u64), INTEL_CPU_CVT(boolean, i64), INTEL_CPU_CVT(boolean, f32), \ + INTEL_CPU_CVT(boolean, f16), INTEL_CPU_CVT(boolean, bf16), INTEL_CPU_CVT(boolean, f64), INTEL_CPU_CVT(u8, u8), \ + INTEL_CPU_CVT(i8, i8), INTEL_CPU_CVT(u16, u16), INTEL_CPU_CVT(i16, i16), INTEL_CPU_CVT(u32, u32), \ + INTEL_CPU_CVT(i32, i32), INTEL_CPU_CVT(u64, u64), INTEL_CPU_CVT(i64, i64), INTEL_CPU_CVT(f32, f32), \ + INTEL_CPU_CVT(f16, f16), INTEL_CPU_CVT(bf16, bf16), INTEL_CPU_CVT(f64, f64), INTEL_CPU_CVT(boolean, boolean) + +#define INTEL_CPU_CVT_FROM_BIN_LIST \ + INTEL_CPU_CVT(u1, f32), INTEL_CPU_CVT(u1, f16), INTEL_CPU_CVT(u1, bf16), INTEL_CPU_CVT(u1, f64), \ + INTEL_CPU_CVT(u1, i16), INTEL_CPU_CVT(u1, u8), INTEL_CPU_CVT(u1, i8), INTEL_CPU_CVT(u1, u16), \ + INTEL_CPU_CVT(u1, i32), INTEL_CPU_CVT(u1, u32), INTEL_CPU_CVT(u1, i64), INTEL_CPU_CVT(u1, u64), \ + INTEL_CPU_CVT(u1, boolean) struct ConvertFromBinContext { - const void *srcPtr; - void *dstPtr; + const void* srcPtr; + void* dstPtr; size_t size; bool converted; }; -template +template struct ConvertFromBinPrecision; -template +template struct ConvertFromBinPrecision> { - void operator()(ConvertFromBinContext &ctx) { - auto src = static_cast(ctx.srcPtr); - auto dst = static_cast(ctx.dstPtr); + void operator()(ConvertFromBinContext& ctx) { + auto src = static_cast(ctx.srcPtr); + auto dst = static_cast(ctx.dstPtr); const size_t nBits = 8; const size_t nBytes = rnd_up(ctx.size, nBits); parallel_for(nBytes, [&](size_t byteIndex) { @@ -590,16 +570,17 @@ struct ConvertFromBinPrecision> { } }; -#define INTEL_CPU_CVT_FROM_4BIT_LIST \ - INTEL_CPU_CVT(u4, f32), INTEL_CPU_CVT(u4, bf16), INTEL_CPU_CVT(u4, f16), INTEL_CPU_CVT(u4, i8), INTEL_CPU_CVT(u4, u8), \ - INTEL_CPU_CVT(i4, f32), INTEL_CPU_CVT(i4, bf16), INTEL_CPU_CVT(i4, f16), INTEL_CPU_CVT(i4, i8), INTEL_CPU_CVT(i4, u8), \ - INTEL_CPU_CVT(nf4, f32), INTEL_CPU_CVT(nf4, bf16), INTEL_CPU_CVT(nf4, f16), INTEL_CPU_CVT(nf4, i8), INTEL_CPU_CVT(nf4, u8), \ - INTEL_CPU_CVT(f4e2m1, f32), INTEL_CPU_CVT(f4e2m1, bf16), INTEL_CPU_CVT(f4e2m1, f16), INTEL_CPU_CVT(f4e2m1, i8), INTEL_CPU_CVT(f4e2m1, u8) +#define INTEL_CPU_CVT_FROM_4BIT_LIST \ + INTEL_CPU_CVT(u4, f32), INTEL_CPU_CVT(u4, bf16), INTEL_CPU_CVT(u4, f16), INTEL_CPU_CVT(u4, i8), \ + INTEL_CPU_CVT(u4, u8), INTEL_CPU_CVT(i4, f32), INTEL_CPU_CVT(i4, bf16), 
INTEL_CPU_CVT(i4, f16), \ + INTEL_CPU_CVT(i4, i8), INTEL_CPU_CVT(i4, u8), INTEL_CPU_CVT(nf4, f32), INTEL_CPU_CVT(nf4, bf16), \ + INTEL_CPU_CVT(nf4, f16), INTEL_CPU_CVT(nf4, i8), INTEL_CPU_CVT(nf4, u8), INTEL_CPU_CVT(f4e2m1, f32), \ + INTEL_CPU_CVT(f4e2m1, bf16), INTEL_CPU_CVT(f4e2m1, f16), INTEL_CPU_CVT(f4e2m1, i8), INTEL_CPU_CVT(f4e2m1, u8) struct ConvertFrom4BitContext { ov::element::Type_t inType; - const void *srcPtr; - void *dstPtr; + const void* srcPtr; + void* dstPtr; size_t size; bool converted; }; @@ -624,12 +605,12 @@ static int8_t get_u4(const uint8_t& val, bool high) { return high ? (val >> 4) : (val & 0xF); } -template +template struct ConvertFrom4BitPrecision; -template +template struct ConvertFrom4BitPrecision> { - void operator()(ConvertFrom4BitContext &ctx) { + void operator()(ConvertFrom4BitContext& ctx) { auto src = static_cast(ctx.srcPtr); auto dst = static_cast(ctx.dstPtr); if (ctx.inType == ov::element::nf4) { @@ -655,23 +636,23 @@ struct ConvertFrom4BitPrecision> { } }; -#define INTEL_CPU_CVT_FROM_BYTE_FP_LIST \ +#define INTEL_CPU_CVT_FROM_BYTE_FP_LIST \ INTEL_CPU_CVT(f8e8m0, f32), INTEL_CPU_CVT(f8e8m0, bf16), INTEL_CPU_CVT(f8e8m0, f16) struct ConvertFromByteFPContext { ov::element::Type_t inType; - const void *srcPtr; - void *dstPtr; + const void* srcPtr; + void* dstPtr; size_t size; bool converted; }; -template +template struct ConvertFromByteFPPrecision; -template +template struct ConvertFromByteFPPrecision> { - void operator()(ConvertFromByteFPContext &ctx) { + void operator()(ConvertFromByteFPContext& ctx) { auto src = static_cast(ctx.srcPtr); auto dst = static_cast(ctx.dstPtr); if (ctx.inType == ov::element::f8e8m0) { @@ -685,12 +666,16 @@ struct ConvertFromByteFPPrecision> { } }; -void cpu_convert(const void *srcPtr, void *dstPtr, ov::element::Type srcPrc, ov::element::Type dstPrc, const size_t size) { +void cpu_convert(const void* srcPtr, + void* dstPtr, + ov::element::Type srcPrc, + ov::element::Type dstPrc, + const size_t size) { cpu_convert(srcPtr, dstPtr, srcPrc, dstPrc, dstPrc, size); } -void cpu_convert(const void *srcPtr, - void *dstPtr, +void cpu_convert(const void* srcPtr, + void* dstPtr, ov::element::Type srcPrc, ov::element::Type interimPrc, ov::element::Type dstPrc, @@ -705,12 +690,12 @@ void cpu_convert(const void *srcPtr, const size_t L2_cache_size = dnnl::utils::get_cache_size(2, true); const size_t totalSize = size * dstPrc.size(); if (srcPrc == element::string) { - auto str_src = reinterpret_cast(srcPtr); - auto str_dst = reinterpret_cast(dstPtr); + auto str_src = reinterpret_cast(srcPtr); + auto str_dst = reinterpret_cast(dstPtr); std::copy(str_src, str_src + size, str_dst); } else if (totalSize >= L2_cache_size) { - auto src = static_cast(srcPtr); - auto dst = static_cast(dstPtr); + auto src = static_cast(srcPtr); + auto dst = static_cast(dstPtr); parallel_nt(0, [&](const size_t ithr, const size_t nthr) { size_t start = 0, end = 0; splitter(totalSize, nthr, ithr, start, end); @@ -728,12 +713,7 @@ void cpu_convert(const void *srcPtr, "> precision to: ", dstPrc, ". 
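The 4-bit path above relies on the get_u4() helper to pull one nibble out of each packed byte. A standalone sketch of the unsigned case (illustrative only; whether even element indices map to the low or the high nibble is an assumption here and is fixed by the plugin's packing convention):

#include <cstddef>
#include <cstdint>
#include <vector>

inline uint8_t get_u4(uint8_t val, bool high) {
    return high ? static_cast<uint8_t>(val >> 4) : static_cast<uint8_t>(val & 0xF);
}

std::vector<float> unpack_u4(const uint8_t* src, size_t elem_count) {
    std::vector<float> dst(elem_count);
    for (size_t i = 0; i < elem_count; ++i) {
        // two elements per byte; assume odd indices take the high nibble
        dst[i] = static_cast<float>(get_u4(src[i / 2], (i % 2) != 0));
    }
    return dst;
}
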
Not implemented."); - ConvertFromBinContext ctx { - srcPtr, - dstPtr, - size, - false - }; + ConvertFromBinContext ctx{srcPtr, dstPtr, size, false}; OV_SWITCH(intel_cpu, ConvertFromBinPrecision, ctx, std::tie(srcPrc, dstPrc), INTEL_CPU_CVT_FROM_BIN_LIST); if (!ctx.converted) OPENVINO_THROW("cpu_convert can't convert from: ", @@ -749,18 +729,15 @@ void cpu_convert(const void *srcPtr, OPENVINO_THROW("cpu_convert can't convert from: ", srcPrc, " precision to: ", dstPrc); } else if (srcPrc.bitwidth() == 8u && srcPrc.is_real()) { ConvertFromByteFPContext ctx{srcPrc, srcPtr, dstPtr, size, false}; - OV_SWITCH(intel_cpu, ConvertFromByteFPPrecision, ctx, std::tie(srcPrc, dstPrc), INTEL_CPU_CVT_FROM_BYTE_FP_LIST); + OV_SWITCH(intel_cpu, + ConvertFromByteFPPrecision, + ctx, + std::tie(srcPrc, dstPrc), + INTEL_CPU_CVT_FROM_BYTE_FP_LIST); if (!ctx.converted) OPENVINO_THROW("cpu_convert can't convert from: ", srcPrc, " precision to: ", dstPrc); } else { - ConvertContext ctx { - srcPtr, - dstPtr, - size, - interimPrc, - dstPrc, - false - }; + ConvertContext ctx{srcPtr, dstPtr, size, interimPrc, dstPrc, false}; OV_SWITCH(intel_cpu, ConvertPrecision, ctx, std::tie(srcPrc, dstPrc), INTEL_CPU_CVT_LIST); if (!ctx.converted) OPENVINO_THROW("cpu_convert can't convert from: ", srcPrc, " precision to: ", dstPrc); @@ -773,7 +750,7 @@ struct isSupportedContext { template struct isSupported { - void operator()(isSupportedContext &ctx) { + void operator()(isSupportedContext& ctx) { ctx.isSupported = true; } }; @@ -790,5 +767,5 @@ bool is_supported_convert(ov::element::Type srcPrc, ov::element::Type dstPrc) { #undef INTEL_CPU_CVT #undef INTEL_CPU_CVT_LIST -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/common/cpu_convert.h b/src/plugins/intel_cpu/src/nodes/common/cpu_convert.h index 8390849ff8adc7..11228dbd1dcfdb 100644 --- a/src/plugins/intel_cpu/src/nodes/common/cpu_convert.h +++ b/src/plugins/intel_cpu/src/nodes/common/cpu_convert.h @@ -22,8 +22,8 @@ namespace intel_cpu { * number of elements in buffers to be converted * @return none. */ -void cpu_convert(const void *srcPtr, - void *dstPtr, +void cpu_convert(const void* srcPtr, + void* dstPtr, ov::element::Type srcPrc, ov::element::Type dstPrc, const size_t size); @@ -45,14 +45,14 @@ void cpu_convert(const void *srcPtr, * number of elements in buffers to be converted * @return none. 
*/ -void cpu_convert(const void *srcPtr, - void *dstPtr, +void cpu_convert(const void* srcPtr, + void* dstPtr, ov::element::Type srcPrc, ov::element::Type interimPrc, ov::element::Type dstPrc, const size_t size); - bool is_supported_convert(ov::element::Type srcPrc, ov::element::Type dstPrc); +bool is_supported_convert(ov::element::Type srcPrc, ov::element::Type dstPrc); -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/common/cpu_memcpy.h b/src/plugins/intel_cpu/src/nodes/common/cpu_memcpy.h old mode 100755 new mode 100644 index 95b0267bd4757c..e827d35a11c2ad --- a/src/plugins/intel_cpu/src/nodes/common/cpu_memcpy.h +++ b/src/plugins/intel_cpu/src/nodes/common/cpu_memcpy.h @@ -5,8 +5,9 @@ #pragma once #include -#include "openvino/core/parallel.hpp" + #include "onednn/dnnl.h" +#include "openvino/core/parallel.hpp" namespace ov { namespace intel_cpu { @@ -36,8 +37,7 @@ inline void cpu_memcpy(void* dst, const void* src, size_t count) { } inline int cpu_memcpy_s(void* dst, size_t dst_size, const void* src, size_t count) { - if (!src || - count > dst_size || + if (!src || count > dst_size || count > (dst > src ? ((uintptr_t)dst - (uintptr_t)src) : ((uintptr_t)src - (uintptr_t)dst))) { // zero out dest if error detected std::memset(dst, 0, dst_size); @@ -55,8 +55,8 @@ inline int cpu_memcpy_s(void* dst, size_t dst_size, const void* src, size_t coun inline void cpu_parallel_memcpy(void* dst, const void* src, size_t count) { const size_t l2_cache_size = dnnl::utils::get_cache_size(2, true); if (count >= l2_cache_size) { - auto src_int8 = static_cast(src); - auto dst_int8 = static_cast(dst); + auto src_int8 = static_cast(src); + auto dst_int8 = static_cast(dst); parallel_nt(0, [&](const size_t ithr, const size_t nthr) { size_t start = 0, end = 0; splitter(count, nthr, ithr, start, end); @@ -67,5 +67,5 @@ inline void cpu_parallel_memcpy(void* dst, const void* src, size_t count) { } } -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/common/defs.h b/src/plugins/intel_cpu/src/nodes/common/defs.h index 6d8574de0939a4..a8a07a2cc8942a 100644 --- a/src/plugins/intel_cpu/src/nodes/common/defs.h +++ b/src/plugins/intel_cpu/src/nodes/common/defs.h @@ -4,10 +4,10 @@ #pragma once -#if defined (HAVE_SSE) || defined (HAVE_AVX2) -# if defined (_WIN32) -# include -# else -# include -# endif +#if defined(HAVE_SSE) || defined(HAVE_AVX2) +# if defined(_WIN32) +# include +# else +# include +# endif #endif diff --git a/src/plugins/intel_cpu/src/nodes/common/dnnl_executor.cpp b/src/plugins/intel_cpu/src/nodes/common/dnnl_executor.cpp index 51aa54c2f50463..695fdbe823ea15 100644 --- a/src/plugins/intel_cpu/src/nodes/common/dnnl_executor.cpp +++ b/src/plugins/intel_cpu/src/nodes/common/dnnl_executor.cpp @@ -18,7 +18,9 @@ DnnlExecutor::DnnlExecutor(const dnnl::primitive_desc& pd) { DnnlExecutor::IntermReorder::IntermReorder(const dnnl::memory::desc& descSrc, const dnnl::memory::desc& descDst, - const dnnl::engine& engine) : m_descSrc(descSrc), m_descDst(descDst) { + const dnnl::engine& engine) + : m_descSrc(descSrc), + m_descDst(descDst) { auto reorderPd = dnnl::reorder::primitive_desc(engine, descSrc, engine, descDst); m_reorder = dnnl::reorder(reorderPd); } @@ -36,7 +38,7 @@ void DnnlExecutor::exec(const std::unordered_map& primArgs, d } void DnnlExecutor::reorder_exec(std::unordered_map primArgs, dnnl::stream strm) { - for (auto &inReorder : 
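cpu_memcpy_s above refuses the copy and zeroes the destination when the source is null, the count exceeds the destination size, or the two buffers overlap. A standalone equivalent of that guard (illustrative only):

#include <cstddef>
#include <cstdint>
#include <cstring>

int memcpy_checked(void* dst, size_t dst_size, const void* src, size_t count) {
    const uintptr_t d = reinterpret_cast<uintptr_t>(dst);
    const uintptr_t s = reinterpret_cast<uintptr_t>(src);
    const size_t distance = d > s ? d - s : s - d;
    if (src == nullptr || count > dst_size || count > distance) {
        std::memset(dst, 0, dst_size);   // zero the destination when the request is invalid
        return -1;
    }
    std::memcpy(dst, src, count);
    return 0;
}
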
inputReorders) { + for (auto& inReorder : inputReorders) { if (primArgs.count(inReorder.first)) { dnnl::memory memDst(inReorder.second.getDstDesc(), strm.get_engine()); inReorder.second.exec(primArgs[inReorder.first], memDst, strm); @@ -46,17 +48,19 @@ void DnnlExecutor::reorder_exec(std::unordered_map primArgs, } } std::unordered_map outputMem; - for (auto &outReorder : outputReorders) { + for (auto& outReorder : outputReorders) { if (primArgs.count(outReorder.first)) { dnnl::memory memSrc(outReorder.second.getSrcDesc(), strm.get_engine()); outputMem[outReorder.first] = primArgs[outReorder.first]; primArgs[outReorder.first] = memSrc; } else { - OPENVINO_THROW("DnnlExecutor has reorder for output ", outReorder.first, ", but doesn't have destination memory"); + OPENVINO_THROW("DnnlExecutor has reorder for output ", + outReorder.first, + ", but doesn't have destination memory"); } } execPrim.execute(strm, primArgs); - for (auto &outReorder : outputReorders) { + for (auto& outReorder : outputReorders) { outReorder.second.exec(primArgs[outReorder.first], outputMem[outReorder.first], strm); } } @@ -79,4 +83,4 @@ impl_desc_type DnnlExecutor::getImplementationType() const { } } // namespace intel_cpu -} // namespace ov +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/common/dnnl_executor.h b/src/plugins/intel_cpu/src/nodes/common/dnnl_executor.h index 3cc6749857816c..32739a38d37028 100644 --- a/src/plugins/intel_cpu/src/nodes/common/dnnl_executor.h +++ b/src/plugins/intel_cpu/src/nodes/common/dnnl_executor.h @@ -6,74 +6,79 @@ #include #include + #include "memory_desc/dnnl_memory_desc.h" namespace ov { namespace intel_cpu { class DnnlExecutor { - protected: - class IntermReorder { - public: - IntermReorder(const dnnl::memory::desc& descSrc, const dnnl::memory::desc& descDst, const dnnl::engine& engine); - void exec(dnnl::memory& memSrc, dnnl::memory& memDst, dnnl::stream strm); - const dnnl::memory::desc& getSrcDesc() const { return m_descSrc; } - const dnnl::memory::desc& getDstDesc() const { return m_descDst; } - - private: - dnnl::reorder m_reorder; - dnnl::memory::desc m_descSrc; - dnnl::memory::desc m_descDst; - }; - +protected: + class IntermReorder { public: - explicit DnnlExecutor(const dnnl::primitive_desc& pd); - void exec(const std::unordered_map& primArgs, dnnl::stream strm); - bool needReordering() const; - virtual ~DnnlExecutor() = default; - dnnl::primitive getExecPrim() const; - const_dnnl_primitive_desc_t getPrimitiveDesc() const; - impl_desc_type getImplementationType() const; - - DnnlMemoryDescPtr getSrcDesc() const { - return src_md; + IntermReorder(const dnnl::memory::desc& descSrc, const dnnl::memory::desc& descDst, const dnnl::engine& engine); + void exec(dnnl::memory& memSrc, dnnl::memory& memDst, dnnl::stream strm); + const dnnl::memory::desc& getSrcDesc() const { + return m_descSrc; } - DnnlMemoryDescPtr getWeightDesc() const { - return wghts_md; - } - DnnlMemoryDescPtr getDstDesc() const { - return dst_md; - } - DnnlMemoryDescPtr getScratchPadDesc() const { - return scrch_md; + const dnnl::memory::desc& getDstDesc() const { + return m_descDst; } - const dnnl::memory::desc& getDnnlSrcDesc() const { - return src_md->getDnnlDesc(); - } - const dnnl::memory::desc& getDnnlWeightDesc() const { - return wghts_md->getDnnlDesc(); - } - const dnnl::memory::desc& getDnnlDstDesc() const { - return dst_md->getDnnlDesc(); - } - const dnnl::memory::desc& getDnnlScratchPadDesc() const { - return scrch_md->getDnnlDesc(); - } + private: + dnnl::reorder m_reorder; + 
dnnl::memory::desc m_descSrc; + dnnl::memory::desc m_descDst; + }; + +public: + explicit DnnlExecutor(const dnnl::primitive_desc& pd); + void exec(const std::unordered_map& primArgs, dnnl::stream strm); + bool needReordering() const; + virtual ~DnnlExecutor() = default; + dnnl::primitive getExecPrim() const; + const_dnnl_primitive_desc_t getPrimitiveDesc() const; + impl_desc_type getImplementationType() const; + + DnnlMemoryDescPtr getSrcDesc() const { + return src_md; + } + DnnlMemoryDescPtr getWeightDesc() const { + return wghts_md; + } + DnnlMemoryDescPtr getDstDesc() const { + return dst_md; + } + DnnlMemoryDescPtr getScratchPadDesc() const { + return scrch_md; + } + + const dnnl::memory::desc& getDnnlSrcDesc() const { + return src_md->getDnnlDesc(); + } + const dnnl::memory::desc& getDnnlWeightDesc() const { + return wghts_md->getDnnlDesc(); + } + const dnnl::memory::desc& getDnnlDstDesc() const { + return dst_md->getDnnlDesc(); + } + const dnnl::memory::desc& getDnnlScratchPadDesc() const { + return scrch_md->getDnnlDesc(); + } - protected: - virtual void reorder_exec(std::unordered_map primArgs, dnnl::stream strm); +protected: + virtual void reorder_exec(std::unordered_map primArgs, dnnl::stream strm); - protected: - dnnl::primitive execPrim; - // key is the port number for the primitive that needs memory reordering - std::unordered_map inputReorders; - std::unordered_map outputReorders; - DnnlMemoryDescPtr src_md; - DnnlMemoryDescPtr wghts_md; - DnnlMemoryDescPtr dst_md; - DnnlMemoryDescPtr scrch_md; +protected: + dnnl::primitive execPrim; + // key is the port number for the primitive that needs memory reordering + std::unordered_map inputReorders; + std::unordered_map outputReorders; + DnnlMemoryDescPtr src_md; + DnnlMemoryDescPtr wghts_md; + DnnlMemoryDescPtr dst_md; + DnnlMemoryDescPtr scrch_md; }; -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/common/fp16_utils.h b/src/plugins/intel_cpu/src/nodes/common/fp16_utils.h index daedcc4bf23ca4..b6622f7ae54d0b 100644 --- a/src/plugins/intel_cpu/src/nodes/common/fp16_utils.h +++ b/src/plugins/intel_cpu/src/nodes/common/fp16_utils.h @@ -13,7 +13,7 @@ typedef short ie_fp16; // F32: exp_bias:127 SEEEEEEE EMMMMMMM MMMMMMMM MMMMMMMM. 
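The layout comments here and the EXP_MASK_* constants below describe the two bit formats these helpers translate between: fp32 with an 8-bit exponent biased by 127, fp16 with a 5-bit exponent biased by 15. A minimal sketch of the normalized-value case of that widening (illustrative only; the real f16tof32() additionally handles zeros, subnormals, infinities and NaNs):

#include <cstdint>
#include <cstring>

inline float f16_to_f32_normal(uint16_t h) {
    uint32_t sign = static_cast<uint32_t>(h & 0x8000u) << 16;          // move sign to bit 31
    uint32_t exp  = (h >> 10) & 0x1Fu;                                 // 5-bit exponent, bias 15
    uint32_t mant = h & 0x3FFu;                                        // 10-bit mantissa
    uint32_t bits = sign | ((exp - 15u + 127u) << 23) | (mant << 13);  // rebias to 127, widen mantissa
    float out;
    std::memcpy(&out, &bits, sizeof(out));                             // reinterpret as float32
    return out;
}
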
// F16: exp_bias:15 SEEEEEMM MMMMMMMM #define EXP_MASK_F32 0x7F800000U -#define EXP_MASK_F16 0x7C00U +#define EXP_MASK_F16 0x7C00U // small helper function to represent uint32_t value as float32 inline float asfloat(uint32_t v) { @@ -83,5 +83,5 @@ inline float f16tof32(ie_fp16 x) { return asfloat(u); } -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/common/permute_kernel.cpp b/src/plugins/intel_cpu/src/nodes/common/permute_kernel.cpp index 396cebc1ba82e1..60bd675d726e4a 100644 --- a/src/plugins/intel_cpu/src/nodes/common/permute_kernel.cpp +++ b/src/plugins/intel_cpu/src/nodes/common/permute_kernel.cpp @@ -6,15 +6,14 @@ #include -#include "dnnl_types.h" -#include "dnnl_extension_utils.h" -#include "cpu_memcpy.h" -#include "utils/bfloat16.hpp" - -#include "cpu/x64/jit_generator.hpp" #include "common/primitive_hashing_utils.hpp" -#include "nodes/executors/transpose.hpp" +#include "cpu/x64/jit_generator.hpp" +#include "cpu_memcpy.h" +#include "dnnl_extension_utils.h" +#include "dnnl_types.h" #include "nodes/executors/common/ref_transpose.hpp" +#include "nodes/executors/transpose.hpp" +#include "utils/bfloat16.hpp" using namespace dnnl; using namespace dnnl::impl; @@ -33,7 +32,9 @@ template struct jit_uni_permute_kernel_f32 : public jit_uni_permute_kernel, public jit_generator { DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_uni_permute_kernel_f32) - explicit jit_uni_permute_kernel_f32(jit_permute_config_params jcp_) : jit_uni_permute_kernel(jcp_), jit_generator(jit_name()) {} + explicit jit_uni_permute_kernel_f32(jit_permute_config_params jcp_) + : jit_uni_permute_kernel(jcp_), + jit_generator(jit_name()) {} void create_ker() override { jit_generator::create_kernel(); @@ -51,23 +52,43 @@ struct jit_uni_permute_kernel_f32 : public jit_uni_permute_kernel, public jit_ge this->postamble(); } - void load(const Xbyak::Xmm &xmm, const Xbyak::Address &addr) { + void load(const Xbyak::Xmm& xmm, const Xbyak::Address& addr) { switch (jcp.data_size) { - case 16: uni_vmovups(xmm, addr); break; - case 8: uni_vmovsd(xmm, addr); break; - case 4: uni_vmovss(xmm, addr); break; - case 2: uni_vpinsrw(xmm, xmm, addr, 0x0); break; - case 1: uni_vpinsrb(xmm, xmm, addr, 0x0); break; + case 16: + uni_vmovups(xmm, addr); + break; + case 8: + uni_vmovsd(xmm, addr); + break; + case 4: + uni_vmovss(xmm, addr); + break; + case 2: + uni_vpinsrw(xmm, xmm, addr, 0x0); + break; + case 1: + uni_vpinsrb(xmm, xmm, addr, 0x0); + break; } } - void store(const Xbyak::Address &addr, const Xbyak::Xmm &xmm) { + void store(const Xbyak::Address& addr, const Xbyak::Xmm& xmm) { switch (jcp.data_size) { - case 16: uni_vmovups(addr, xmm); break; - case 8: uni_vmovsd(addr, xmm); break; - case 4: uni_vmovss(addr, xmm); break; - case 2: uni_vpextrw(addr, xmm, 0x0); break; - case 1: uni_vpextrb(addr, xmm, 0x0); break; + case 16: + uni_vmovups(addr, xmm); + break; + case 8: + uni_vmovsd(addr, xmm); + break; + case 4: + uni_vmovss(addr, xmm); + break; + case 2: + uni_vpextrw(addr, xmm, 0x0); + break; + case 1: + uni_vpextrb(addr, xmm, 0x0); + break; } } @@ -99,7 +120,8 @@ struct jit_uni_permute_kernel_f32 : public jit_uni_permute_kernel, public jit_ge } } - L(tail_loop_label); { + L(tail_loop_label); + { cmp(reg_work_amount, 0); je(exit_label, T_NEAR); @@ -129,7 +151,8 @@ struct jit_uni_permute_kernel_f32 : public jit_uni_permute_kernel, public jit_ge } private: - using Vmm = typename conditional3::type; + using Vmm = + typename conditional3::type; uint32_t vlen = 
cpu_isa_traits::vlen; Xbyak::Reg64 reg_src = r8; @@ -144,7 +167,7 @@ struct jit_uni_permute_kernel_f32 : public jit_uni_permute_kernel, public jit_ge Xbyak::Xmm xmm = Xbyak::Xmm(1); }; -#endif // OPENVINO_ARCH_X86_64 +#endif // OPENVINO_ARCH_X86_64 PermuteKernel::PermuteKernel(const PermuteParams& params) : params(params) { jcp = TransposeExecutor::prepareParams(params); @@ -156,7 +179,7 @@ PermuteKernel::PermuteKernel(const PermuteParams& params) : params(params) { } else if (mayiuse(cpu::x64::sse41)) { permute_kernel.reset(new jit_uni_permute_kernel_f32(jcp)); } -#endif // OPENVINO_ARCH_X86_64 +#endif // OPENVINO_ARCH_X86_64 if (permute_kernel) permute_kernel->create_ker(); @@ -178,7 +201,7 @@ void PermuteKernel::execute(const uint8_t* src_data, uint8_t* dst_data) { return; } - RefTransposeExecutor::referenceExecute(src_data, dst_data, jcp, dst_dims[0]); + RefTransposeExecutor::referenceExecute(src_data, dst_data, jcp, dst_dims[0]); } void PermuteKernel::optimizedExecute(const uint8_t* src_data, uint8_t* dst_data, const int mb) { @@ -190,42 +213,42 @@ void PermuteKernel::optimizedExecute(const uint8_t* src_data, uint8_t* dst_data, dst_dims[0] = mb; switch (jcp.n) { - case 1: - parallel_for(dst_dims[0], [&](int i0) { - auto arg = jit_args_permute(); - - size_t dst_off = i0 * dst_strides[0]; - size_t src_off = i0 * src_strides[0]; - arg.src = &src_data[src_off * jcp.data_size]; - arg.dst = &dst_data[dst_off * jcp.data_size]; - - (*permute_kernel)(&arg); - }); - break; - case 2: - parallel_for2d(dst_dims[0], dst_dims[1], [&](int i0, int i1) { - auto arg = jit_args_permute(); - - size_t dst_off = i0 * dst_strides[0] + i1 * dst_strides[1]; - size_t src_off = i0 * src_strides[0] + i1 * src_strides[1]; - arg.src = &src_data[src_off * jcp.data_size]; - arg.dst = &dst_data[dst_off * jcp.data_size]; - - (*permute_kernel)(&arg); - }); - break; - case 3: - parallel_for3d(dst_dims[0], dst_dims[1], dst_dims[2], [&](int i0, int i1, int i2) { - auto arg = jit_args_permute(); - - size_t dst_off = i0 * dst_strides[0] + i1 * dst_strides[1] + i2 * dst_strides[2]; - size_t src_off = i0 * src_strides[0] + i1 * src_strides[1] + i2 * src_strides[2]; - arg.src = &src_data[src_off * jcp.data_size]; - arg.dst = &dst_data[dst_off * jcp.data_size]; - - (*permute_kernel)(&arg); - }); - break; + case 1: + parallel_for(dst_dims[0], [&](int i0) { + auto arg = jit_args_permute(); + + size_t dst_off = i0 * dst_strides[0]; + size_t src_off = i0 * src_strides[0]; + arg.src = &src_data[src_off * jcp.data_size]; + arg.dst = &dst_data[dst_off * jcp.data_size]; + + (*permute_kernel)(&arg); + }); + break; + case 2: + parallel_for2d(dst_dims[0], dst_dims[1], [&](int i0, int i1) { + auto arg = jit_args_permute(); + + size_t dst_off = i0 * dst_strides[0] + i1 * dst_strides[1]; + size_t src_off = i0 * src_strides[0] + i1 * src_strides[1]; + arg.src = &src_data[src_off * jcp.data_size]; + arg.dst = &dst_data[dst_off * jcp.data_size]; + + (*permute_kernel)(&arg); + }); + break; + case 3: + parallel_for3d(dst_dims[0], dst_dims[1], dst_dims[2], [&](int i0, int i1, int i2) { + auto arg = jit_args_permute(); + + size_t dst_off = i0 * dst_strides[0] + i1 * dst_strides[1] + i2 * dst_strides[2]; + size_t src_off = i0 * src_strides[0] + i1 * src_strides[1] + i2 * src_strides[2]; + arg.src = &src_data[src_off * jcp.data_size]; + arg.dst = &dst_data[dst_off * jcp.data_size]; + + (*permute_kernel)(&arg); + }); + break; } return; } @@ -245,12 +268,10 @@ size_t PermuteParams::hash() const { } bool PermuteParams::operator==(const PermuteParams& 
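The optimizedExecute() loops above linearize each output coordinate with the destination strides and re-linearize it with the already permuted source strides before handing a contiguous chunk to the JIT kernel. A scalar reference version of that stride arithmetic (illustrative only; strides are in elements and data_size is the element size in bytes):

#include <cstddef>
#include <cstdint>
#include <cstring>
#include <vector>

void reference_permute(const uint8_t* src, uint8_t* dst,
                       const std::vector<size_t>& dst_dims,
                       const std::vector<size_t>& src_strides,   // permuted source strides
                       const std::vector<size_t>& dst_strides,   // dense strides of dst_dims
                       size_t data_size) {
    size_t total = 1;
    for (size_t d : dst_dims) total *= d;
    for (size_t linear = 0; linear < total; ++linear) {
        size_t rem = linear, src_off = 0, dst_off = 0;
        for (size_t d = 0; d < dst_dims.size(); ++d) {
            const size_t idx = rem / dst_strides[d];      // coordinate along dimension d
            rem %= dst_strides[d];
            src_off += idx * src_strides[d];
            dst_off += idx * dst_strides[d];
        }
        std::memcpy(dst + dst_off * data_size, src + src_off * data_size, data_size);
    }
}
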
rhs) const { - return (src_block_dims == rhs.src_block_dims) && - (dst_block_dims == rhs.dst_block_dims) && - (src_block_order == rhs.src_block_order) && - (dst_block_order == rhs.dst_block_order) && (order == rhs.order) && - (data_size == rhs.data_size); + return (src_block_dims == rhs.src_block_dims) && (dst_block_dims == rhs.dst_block_dims) && + (src_block_order == rhs.src_block_order) && (dst_block_order == rhs.dst_block_order) && + (order == rhs.order) && (data_size == rhs.data_size); } -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/common/permute_kernel.h b/src/plugins/intel_cpu/src/nodes/common/permute_kernel.h index ac665efb4f0bb6..ba7a89d746d945 100644 --- a/src/plugins/intel_cpu/src/nodes/common/permute_kernel.h +++ b/src/plugins/intel_cpu/src/nodes/common/permute_kernel.h @@ -38,9 +38,9 @@ struct jit_args_permute { }; struct jit_uni_permute_kernel { - void (*ker_)(const jit_args_permute *); + void (*ker_)(const jit_args_permute*); - void operator()(const jit_args_permute *args) { + void operator()(const jit_args_permute* args) { assert(ker_); ker_(args); } @@ -71,5 +71,5 @@ class PermuteKernel { PermuteParams params; }; -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/common/reorder_prim.cpp b/src/plugins/intel_cpu/src/nodes/common/reorder_prim.cpp index 93e145b25b9e95..dd07a721260aac 100644 --- a/src/plugins/intel_cpu/src/nodes/common/reorder_prim.cpp +++ b/src/plugins/intel_cpu/src/nodes/common/reorder_prim.cpp @@ -4,15 +4,14 @@ #include "reorder_prim.h" -#include "dnnl_extension_utils.h" -#include "dnnl_types.h" - #include -#include "common/primitive_hashing_utils.hpp" -#include "cpu/x64/cpu_isa_traits.hpp" #include #include +#include "common/primitive_hashing_utils.hpp" +#include "cpu/x64/cpu_isa_traits.hpp" +#include "dnnl_extension_utils.h" +#include "dnnl_types.h" #include "utils/general_utils.h" namespace ov { diff --git a/src/plugins/intel_cpu/src/nodes/common/softmax.cpp b/src/plugins/intel_cpu/src/nodes/common/softmax.cpp index 66a6ca9c1b6f53..0fcc87f8978752 100644 --- a/src/plugins/intel_cpu/src/nodes/common/softmax.cpp +++ b/src/plugins/intel_cpu/src/nodes/common/softmax.cpp @@ -4,17 +4,17 @@ #include "softmax.h" -#include "openvino/core/parallel.hpp" -#include "cpu/x64/jit_generator.hpp" -#include "cpu/x64/injectors/jit_uni_eltwise_injector.hpp" -#include "onednn/dnnl.h" -#include "utils/bfloat16.hpp" -#include "emitters/plugin/x64/jit_bf16_emitters.hpp" - #include #include #include +#include "cpu/x64/injectors/jit_uni_eltwise_injector.hpp" +#include "cpu/x64/jit_generator.hpp" +#include "emitters/plugin/x64/jit_bf16_emitters.hpp" +#include "onednn/dnnl.h" +#include "openvino/core/parallel.hpp" +#include "utils/bfloat16.hpp" + using namespace dnnl; using namespace dnnl::impl::cpu; using namespace dnnl::impl::cpu::x64; @@ -38,11 +38,13 @@ struct jit_softmax_config_params { ov::element::Type dst_dt; }; - struct jit_uni_softmax_kernel { - void (*ker_)(const jit_args_softmax *); + void (*ker_)(const jit_args_softmax*); - void operator()(const jit_args_softmax *args) { assert(ker_); ker_(args); } + void operator()(const jit_args_softmax* args) { + assert(ker_); + ker_(args); + } jit_uni_softmax_kernel() : ker_(nullptr) {} virtual ~jit_uni_softmax_kernel() {} @@ -54,7 +56,10 @@ template struct jit_uni_softmax_kernel_f32 : public jit_uni_softmax_kernel, public jit_generator { 
DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_uni_softmax_kernel_f32) - jit_uni_softmax_kernel_f32(jit_softmax_config_params jcp) : jit_uni_softmax_kernel(), jit_generator(jit_name()), jcp_(jcp) {} + jit_uni_softmax_kernel_f32(jit_softmax_config_params jcp) + : jit_uni_softmax_kernel(), + jit_generator(jit_name()), + jcp_(jcp) {} void create_ker() override { jit_generator::create_kernel(); @@ -62,14 +67,14 @@ struct jit_uni_softmax_kernel_f32 : public jit_uni_softmax_kernel, public jit_ge } void generate() override { - exp_injector.reset(new jit_uni_eltwise_injector_f32(this, dnnl::impl::alg_kind::eltwise_exp, 0.f, 0.f, 1.0f)); + exp_injector.reset( + new jit_uni_eltwise_injector_f32(this, dnnl::impl::alg_kind::eltwise_exp, 0.f, 0.f, 1.0f)); if (mayiuse(avx512_core)) uni_vcvtneps2bf16.reset(new jit_uni_vcvtneps2bf16(this, isa)); this->preamble(); - mov(reg_src, ptr[reg_params + GET_OFF(src)]); mov(reg_dst, ptr[reg_params + GET_OFF(dst)]); mov(reg_src_stride, ptr[reg_params + GET_OFF(src_stride)]); @@ -86,7 +91,8 @@ struct jit_uni_softmax_kernel_f32 : public jit_uni_softmax_kernel, public jit_ge mov(aux_reg_work_amount, reg_work_amount); mov(aux_reg_src, reg_src); load_vector(vmm_max, ptr[aux_reg_src], jcp_.src_dt); - L(max_loop_label); { + L(max_loop_label); + { cmp(aux_reg_work_amount, 0); jle(max_loop_end_label, T_NEAR); @@ -120,7 +126,8 @@ struct jit_uni_softmax_kernel_f32 : public jit_uni_softmax_kernel, public jit_ge mov(aux_reg_src, reg_src); mov(aux_reg_dst, reg_dst); uni_vpxor(vmm_exp_sum, vmm_exp_sum, vmm_exp_sum); - L(exp_loop_label); { + L(exp_loop_label); + { cmp(aux_reg_work_amount, 0); jle(exp_loop_end_label, T_NEAR); @@ -143,7 +150,8 @@ struct jit_uni_softmax_kernel_f32 : public jit_uni_softmax_kernel, public jit_ge mov(aux_reg_work_amount, reg_work_amount); mov(aux_reg_dst, reg_dst); - L(div_loop_label); { + L(div_loop_label); + { cmp(aux_reg_work_amount, 0); jle(div_loop_end_label, T_NEAR); @@ -196,38 +204,40 @@ struct jit_uni_softmax_kernel_f32 : public jit_uni_softmax_kernel, public jit_ge jit_softmax_config_params jcp_; - inline void load_vector(Vmm vmm_src, const Xbyak::Address &op, ov::element::Type src_dt) { + inline void load_vector(Vmm vmm_src, const Xbyak::Address& op, ov::element::Type src_dt) { switch (src_dt) { - case ov::element::f32: - uni_vmovups(vmm_src, op); - break; - case ov::element::bf16: - vpmovzxwd(vmm_src, op); - uni_vpslld(vmm_src, vmm_src, 16); - break; - default: - assert(!"unknown src_dt"); + case ov::element::f32: + uni_vmovups(vmm_src, op); + break; + case ov::element::bf16: + vpmovzxwd(vmm_src, op); + uni_vpslld(vmm_src, vmm_src, 16); + break; + default: + assert(!"unknown src_dt"); } } - inline void store_vector(const Xbyak::Address &op, Vmm vmm_dst, ov::element::Type dst_dt) { + inline void store_vector(const Xbyak::Address& op, Vmm vmm_dst, ov::element::Type dst_dt) { Xbyak::Ymm ymm_dst = Xbyak::Ymm(vmm_dst.getIdx()); switch (dst_dt) { - case ov::element::f32: - uni_vmovups(op, vmm_dst); - break; - case ov::element::bf16: - uni_vcvtneps2bf16->emit_code({static_cast(vmm_dst.getIdx())}, {static_cast(ymm_dst.getIdx())}); - vmovdqu16(op, ymm_dst); - break; - default: - assert(!"unknown dst_dt"); + case ov::element::f32: + uni_vmovups(op, vmm_dst); + break; + case ov::element::bf16: + uni_vcvtneps2bf16->emit_code({static_cast(vmm_dst.getIdx())}, + {static_cast(ymm_dst.getIdx())}); + vmovdqu16(op, ymm_dst); + break; + default: + assert(!"unknown dst_dt"); } } }; #endif SoftmaxGeneric::SoftmaxGeneric(ov::element::Type inpPrc, ov::element::Type outPrc) - : 
input_prec(inpPrc), output_prec(outPrc) { + : input_prec(inpPrc), + output_prec(outPrc) { if (ov::element::bf16 == output_prec) { if (!mayiuse(avx512_core)) { OPENVINO_THROW("SoftmaxGeneric doesn't support BF16 precision on this target."); @@ -255,27 +265,27 @@ SoftmaxGeneric::SoftmaxGeneric(ov::element::Type inpPrc, ov::element::Type outPr #endif } -template -void SoftmaxGeneric::calculate(const in_data_t *src_data, out_data_t *dst_data, int B, int C, int H, int W) { +template +void SoftmaxGeneric::calculate(const in_data_t* src_data, out_data_t* dst_data, int B, int C, int H, int W) { for (int b = 0; b < B; b++) { int tail_start = 0; if (softmax_kernel) { - int blocks_num = H*W / block_size; + int blocks_num = H * W / block_size; parallel_for(blocks_num, [&](int ib) { auto arg = jit_args_softmax(); arg.src = src_data + b * C * H * W + ib * block_size; arg.dst = dst_data + b * C * H * W + ib * block_size; - arg.src_stride = static_cast((size_t)(H) * W * sizeof(in_data_t)); - arg.dst_stride = static_cast((size_t)(H) * W * sizeof(out_data_t)); + arg.src_stride = static_cast((size_t)(H)*W * sizeof(in_data_t)); + arg.dst_stride = static_cast((size_t)(H)*W * sizeof(out_data_t)); arg.work_amount = static_cast(C); (*softmax_kernel)(&arg); }); - tail_start = (H*W / block_size) * block_size; + tail_start = (H * W / block_size) * block_size; } parallel_for(H * W - tail_start, [&](int i) { @@ -283,7 +293,8 @@ void SoftmaxGeneric::calculate(const in_data_t *src_data, out_data_t *dst_data, float max = src_data[b * C * H * W + offset]; for (int c = 0; c < C; c++) { float val = src_data[b * C * H * W + c * H * W + offset]; - if (val > max) max = val; + if (val > max) + max = val; } float expSum = 0; @@ -299,7 +310,7 @@ void SoftmaxGeneric::calculate(const in_data_t *src_data, out_data_t *dst_data, } } -void SoftmaxGeneric::execute(const uint8_t *src_data, uint8_t *dst_data, int B, int C, int H, int W) { +void SoftmaxGeneric::execute(const uint8_t* src_data, uint8_t* dst_data, int B, int C, int H, int W) { if (ov::element::f32 == input_prec) { auto float_src_data = reinterpret_cast(src_data); if (ov::element::f32 == output_prec) { @@ -327,5 +338,5 @@ void SoftmaxGeneric::execute(const uint8_t *src_data, uint8_t *dst_data, int B, } } -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/common/softmax.h b/src/plugins/intel_cpu/src/nodes/common/softmax.h index 2e3d5caa4becee..bb450c2ac5a303 100644 --- a/src/plugins/intel_cpu/src/nodes/common/softmax.h +++ b/src/plugins/intel_cpu/src/nodes/common/softmax.h @@ -4,27 +4,28 @@ #pragma once -#include #include -#include "openvino/core/type/element_type.hpp" +#include + #include "defs.h" #include "openvino/core/parallel.hpp" +#include "openvino/core/type/element_type.hpp" namespace ov { namespace intel_cpu { struct jit_uni_softmax_kernel; -static inline -void softmax_many_batches(const float *src_data, float *dst_data, int B, int C, int H, int W) { +static inline void softmax_many_batches(const float* src_data, float* dst_data, int B, int C, int H, int W) { ov::parallel_for(B * H * W, [&](size_t i) { - const float *psrc = src_data + (i / (H * W)) * C * H * W - (i / (H * W)) * H * W; - float *pdst = dst_data + (i / (H * W)) * C * H * W - (i / (H * W)) * H * W; + const float* psrc = src_data + (i / (H * W)) * C * H * W - (i / (H * W)) * H * W; + float* pdst = dst_data + (i / (H * W)) * C * H * W - (i / (H * W)) * H * W; float max = psrc[i]; for (int c = 0; c < C; c++) { float val = 
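Both the scalar tail path in SoftmaxGeneric::calculate() and softmax_many_batches() implement the same three-pass reference softmax over the channel axis: find the maximum per spatial position, exponentiate the shifted values, then normalize by the exponent sum. A standalone sketch of that reference computation (illustrative only):

#include <algorithm>
#include <cmath>
#include <cstddef>

void softmax_channels(const float* src, float* dst, int C, int spatial) {
    for (int i = 0; i < spatial; ++i) {                 // one (h, w) position at a time
        float max_val = src[i];
        for (int c = 0; c < C; ++c)
            max_val = std::max(max_val, src[c * spatial + i]);
        float exp_sum = 0.f;
        for (int c = 0; c < C; ++c) {
            dst[c * spatial + i] = std::exp(src[c * spatial + i] - max_val);
            exp_sum += dst[c * spatial + i];
        }
        for (int c = 0; c < C; ++c)
            dst[c * spatial + i] /= exp_sum;            // values now sum to 1 along C
    }
}
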
psrc[c * H * W + i]; - if (val > max) max = val; + if (val > max) + max = val; } float expSum = 0; @@ -43,9 +44,10 @@ class SoftmaxGeneric { public: SoftmaxGeneric(ov::element::Type inpPrc, ov::element::Type outPrc); - void execute(const uint8_t *src_data, uint8_t *dst_data, int B, int C, int H, int W); + void execute(const uint8_t* src_data, uint8_t* dst_data, int B, int C, int H, int W); + private: - template + template void calculate(const in_data_t* src_data, out_data_t* dst_data, int B, int C, int H, int W); private: @@ -54,5 +56,5 @@ class SoftmaxGeneric { std::shared_ptr softmax_kernel; }; -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/common/tile_broadcast_utils.cpp b/src/plugins/intel_cpu/src/nodes/common/tile_broadcast_utils.cpp index 6c62304ab22da7..f482b0876b3f4c 100644 --- a/src/plugins/intel_cpu/src/nodes/common/tile_broadcast_utils.cpp +++ b/src/plugins/intel_cpu/src/nodes/common/tile_broadcast_utils.cpp @@ -4,18 +4,17 @@ #include "tile_broadcast_utils.h" +#include + #include "cpu_convert.h" #include "cpu_memcpy.h" -#include "openvino/core/parallel.hpp" -#include #include "memory_desc/dnnl_blocked_memory_desc.h" - - +#include "openvino/core/parallel.hpp" namespace ov { namespace intel_cpu { -VectorDims TileBroadcastCommon::calculateDenseStrides(const VectorDims &dims) { +VectorDims TileBroadcastCommon::calculateDenseStrides(const VectorDims& dims) { VectorDims strides(dims.size(), 1); for (int i = strides.size() - 2; i >= 0; i--) { @@ -25,8 +24,10 @@ VectorDims TileBroadcastCommon::calculateDenseStrides(const VectorDims &dims) { return strides; } -void TileBroadcastCommon::fillOptimizedDimsAndSrcStrides(const VectorDims& srcBlockedDims, const VectorDims& blockedRepeats, - VectorDims& optimizedDims, VectorDims& optimizedSrcStrides) { +void TileBroadcastCommon::fillOptimizedDimsAndSrcStrides(const VectorDims& srcBlockedDims, + const VectorDims& blockedRepeats, + VectorDims& optimizedDims, + VectorDims& optimizedSrcStrides) { optimizedDims.clear(); optimizedSrcStrides.clear(); VectorDims srcBlockedStrides = calculateDenseStrides(srcBlockedDims); @@ -60,10 +61,11 @@ void TileBroadcastCommon::fillOptimizedDimsAndSrcStrides(const VectorDims& srcBl } } -bool TileBroadcastCommon::canBeExecutedInBlockedLayout(VectorDims srcBlockedDims, VectorDims blockedRepeats, - const size_t elemsInBlock) { - if (srcBlockedDims.empty() || blockedRepeats.empty() || elemsInBlock == 0lu || srcBlockedDims[1] == Shape::UNDEFINED_DIM || - (blockedRepeats[1] != 1 && srcBlockedDims[1] % elemsInBlock != 0)) +bool TileBroadcastCommon::canBeExecutedInBlockedLayout(VectorDims srcBlockedDims, + VectorDims blockedRepeats, + const size_t elemsInBlock) { + if (srcBlockedDims.empty() || blockedRepeats.empty() || elemsInBlock == 0lu || + srcBlockedDims[1] == Shape::UNDEFINED_DIM || (blockedRepeats[1] != 1 && srcBlockedDims[1] % elemsInBlock != 0)) return false; srcBlockedDims[1] = div_up(srcBlockedDims[1], elemsInBlock); @@ -90,7 +92,7 @@ bool TileBroadcastCommon::canBeExecutedInNSPCLayout(VectorDims srcBlockedDims, V return optimizedDims.size() <= maxNDims; } -std::vector TileBroadcastCommon::getSupportedConfigs(const Node *node, size_t outSize) { +std::vector TileBroadcastCommon::getSupportedConfigs(const Node* node, size_t outSize) { std::vector supportedPrimitiveDescriptors; auto precision = node->getOriginalInputPrecisionAtPort(0); auto dataType = DnnlExtensionUtils::ElementTypeToDataType(precision); @@ -115,26 +117,31 @@ 
std::vector TileBroadcastCommon::getSupportedConfigs(const Node *node, config.inConfs[0].constant(constMap[0]); config.inConfs[1].inPlace(-1); config.inConfs[1].constant(constMap[1]); - config.inConfs[1].setMemDesc(std::make_shared(ov::element::i32, node->getInputShapeAtPort(1))); + config.inConfs[1].setMemDesc( + std::make_shared(ov::element::i32, node->getInputShapeAtPort(1))); if (config.inConfs.size() == 3) { config.inConfs[2].inPlace(-1); config.inConfs[2].constant(constMap[2]); - config.inConfs[2].setMemDesc(std::make_shared(ov::element::i32, node->getInputShapeAtPort(2))); + config.inConfs[2].setMemDesc( + std::make_shared(ov::element::i32, node->getInputShapeAtPort(2))); } config.outConfs.resize(outSize); auto pushDesc = [&](dnnl::memory::format_tag inFormat, dnnl::memory::format_tag outFormat) { - config.inConfs[0].setMemDesc(std::make_shared(node->getInputShapeAtPort(0), dataType, inFormat)); + config.inConfs[0].setMemDesc( + std::make_shared(node->getInputShapeAtPort(0), dataType, inFormat)); for (size_t i = 0; i < config.outConfs.size(); i++) { config.outConfs[i].inPlace(-1); config.outConfs[i].constant(false); - config.outConfs[i].setMemDesc(std::make_shared(node->getOutputShapeAtPort(0), dataType, outFormat)); + config.outConfs[i].setMemDesc( + std::make_shared(node->getOutputShapeAtPort(0), dataType, outFormat)); } supportedPrimitiveDescriptors.push_back({config, impl_desc_type::ref}); }; - if (!repeats.empty() && inDataShape.getRank() == outDataShapeRank && (outDataShapeRank == 4 || outDataShapeRank == 5)) { + if (!repeats.empty() && inDataShape.getRank() == outDataShapeRank && + (outDataShapeRank == 4 || outDataShapeRank == 5)) { if (canBeExecutedInBlockedLayout(srcDims, repeats, 16)) { if (outDataShapeRank == 4) { pushDesc(dnnl::memory::format_tag::nChw16c, dnnl::memory::format_tag::nChw16c); @@ -165,7 +172,8 @@ std::vector TileBroadcastCommon::getSupportedConfigs(const Node *node, for (size_t i = 0; i < config.outConfs.size(); i++) { config.outConfs[i].inPlace(-1); config.outConfs[i].constant(false); - config.outConfs[i].setMemDesc(std::make_shared(precision, node->getOutputShapeAtPort(i))); + config.outConfs[i].setMemDesc( + std::make_shared(precision, node->getOutputShapeAtPort(i))); } supportedPrimitiveDescriptors.push_back({config, impl_desc_type::ref}); } else { @@ -175,7 +183,9 @@ std::vector TileBroadcastCommon::getSupportedConfigs(const Node *node, return supportedPrimitiveDescriptors; } -bool TileBroadcastCommon::prepareOptimizedParams(const Node *node, VectorDims& srcBlockedDims, VectorDims& dstBlockedDims) { +bool TileBroadcastCommon::prepareOptimizedParams(const Node* node, + VectorDims& srcBlockedDims, + VectorDims& dstBlockedDims) { while (srcBlockedDims.size() < dstBlockedDims.size()) { srcBlockedDims.insert(srcBlockedDims.begin(), 1); } @@ -186,7 +196,8 @@ bool TileBroadcastCommon::prepareOptimizedParams(const Node *node, VectorDims& s blockedRepeats.push_back(1); } // for NSPC layouts - if (node->getBaseMemDescAtInputPort(0)->hasLayoutType(LayoutType::nspc) && one_of(node->getBaseMemDescAtInputPort(0)->getShape().getRank(), 4u, 5u)) { + if (node->getBaseMemDescAtInputPort(0)->hasLayoutType(LayoutType::nspc) && + one_of(node->getBaseMemDescAtInputPort(0)->getShape().getRank(), 4u, 5u)) { blockedRepeats.push_back(blockedRepeats[1]); blockedRepeats.erase(blockedRepeats.begin() + 1); } @@ -205,7 +216,8 @@ bool TileBroadcastCommon::prepareOptimizedParams(const Node *node, VectorDims& s VectorDims optimizedDstStrides = calculateDenseStrides(optimizedDims); - 
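calculateDenseStrides() above builds row-major strides from a shape: the innermost stride is 1 and each outer stride is the product of all inner dimensions. A standalone equivalent (illustrative only):

#include <cstddef>
#include <vector>

std::vector<size_t> dense_strides(const std::vector<size_t>& dims) {
    std::vector<size_t> strides(dims.size(), 1);
    for (int i = static_cast<int>(strides.size()) - 2; i >= 0; --i)
        strides[i] = strides[i + 1] * dims[i + 1];   // e.g. {2, 3, 4} -> {12, 4, 1}
    return strides;
}
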
size_t dataSize = node->getSelectedPrimitiveDescriptor()->getConfig().inConfs[0].getMemDesc()->getPrecision().size(); + size_t dataSize = + node->getSelectedPrimitiveDescriptor()->getConfig().inConfs[0].getMemDesc()->getPrecision().size(); for (size_t i = 0; i < optimizedDims.size(); i++) { optimizedSrcStrides[i] *= dataSize; optimizedDstStrides[i] *= dataSize; @@ -221,9 +233,9 @@ bool TileBroadcastCommon::prepareOptimizedParams(const Node *node, VectorDims& s // Broadcast 1 element to N continuous elements based on cpu_memcpy // Step 1: Get the binary format of the number N -// Step 2: Use cpu_memcpy to form fragments containing pow(2, k) (ie. 2, 4, 8, ...) elements, based on the given 1 element -// Step 3: Form N continuous elements, who's a combination of those fragments, demonstrated by its binary format -void TileBroadcastCommon::broadcastScalar(const char *srcData, char *dstData, size_t elt_cnt, size_t data_size) { +// Step 2: Use cpu_memcpy to form fragments containing pow(2, k) (ie. 2, 4, 8, ...) elements, based on the given 1 +// element Step 3: Form N continuous elements, who's a combination of those fragments, demonstrated by its binary format +void TileBroadcastCommon::broadcastScalar(const char* srcData, char* dstData, size_t elt_cnt, size_t data_size) { std::vector binary_digits; binary_digits.clear(); @@ -275,32 +287,44 @@ void TileBroadcastCommon::optimizedExecute(const MemoryPtr& srcMemory, const Mem broadcastScalar(srcData, dstData, elt_cnt, data_size); } } else { - parallel_for5d(optimizedParams.dims[0], optimizedParams.dims[1], optimizedParams.dims[2], optimizedParams.dims[3], optimizedParams.dims[4], - [&](int i0, int i1, int i2, int i3, int i4) { - auto srcData2 = srcData + (i0 * optimizedParams.srcStrides[0] + i1 * optimizedParams.srcStrides[1] + - i2 * optimizedParams.srcStrides[2] + i3 * optimizedParams.srcStrides[3] + - i4 * optimizedParams.srcStrides[4]); - auto dstData2 = dstData + (i0 * optimizedParams.dstStrides[0] + i1 * optimizedParams.dstStrides[1] + - i2 * optimizedParams.dstStrides[2] + i3 * optimizedParams.dstStrides[3] + - i4 * optimizedParams.dstStrides[4]); - for (size_t i = 0; i < optimizedParams.dims[5]; i++) { - cpu_memcpy(dstData2 + i * optimizedParams.dstStrides[5], srcData2, optimizedParams.dstStrides[5]); - } - }); + parallel_for5d( + optimizedParams.dims[0], + optimizedParams.dims[1], + optimizedParams.dims[2], + optimizedParams.dims[3], + optimizedParams.dims[4], + [&](int i0, int i1, int i2, int i3, int i4) { + auto srcData2 = srcData + (i0 * optimizedParams.srcStrides[0] + i1 * optimizedParams.srcStrides[1] + + i2 * optimizedParams.srcStrides[2] + i3 * optimizedParams.srcStrides[3] + + i4 * optimizedParams.srcStrides[4]); + auto dstData2 = dstData + (i0 * optimizedParams.dstStrides[0] + i1 * optimizedParams.dstStrides[1] + + i2 * optimizedParams.dstStrides[2] + i3 * optimizedParams.dstStrides[3] + + i4 * optimizedParams.dstStrides[4]); + for (size_t i = 0; i < optimizedParams.dims[5]; i++) { + cpu_memcpy(dstData2 + i * optimizedParams.dstStrides[5], + srcData2, + optimizedParams.dstStrides[5]); + } + }); } } else { - parallel_for5d(optimizedParams.dims[0], optimizedParams.dims[1], optimizedParams.dims[2], optimizedParams.dims[3], optimizedParams.dims[4], - [&](int i0, int i1, int i2, int i3, int i4) { - auto srcData2 = srcData + (i0 * optimizedParams.srcStrides[0] + i1 * optimizedParams.srcStrides[1] + - i2 * optimizedParams.srcStrides[2] + i3 * optimizedParams.srcStrides[3] + - i4 * optimizedParams.srcStrides[4]); - auto dstData2 = dstData 
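The Step 1-3 comment above describes broadcastScalar(): instead of N single-element copies, already-written output is reused so one element is expanded to N elements in O(log N) memcpy calls. A simplified standalone variant of that idea (illustrative only; it doubles the filled prefix rather than assembling fragments from the binary digits of N as the real code does):

#include <algorithm>
#include <cstddef>
#include <cstring>

void broadcast_scalar(const char* src, char* dst, size_t elt_cnt, size_t data_size) {
    if (elt_cnt == 0) return;
    std::memcpy(dst, src, data_size);                  // seed with the single element
    size_t filled = 1;
    while (filled < elt_cnt) {
        const size_t to_copy = std::min(filled, elt_cnt - filled);   // copy at most the prefix written so far
        std::memcpy(dst + filled * data_size, dst, to_copy * data_size);
        filled += to_copy;
    }
}
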
+ (i0 * optimizedParams.dstStrides[0] + i1 * optimizedParams.dstStrides[1] + + parallel_for5d( + optimizedParams.dims[0], + optimizedParams.dims[1], + optimizedParams.dims[2], + optimizedParams.dims[3], + optimizedParams.dims[4], + [&](int i0, int i1, int i2, int i3, int i4) { + auto srcData2 = srcData + (i0 * optimizedParams.srcStrides[0] + i1 * optimizedParams.srcStrides[1] + + i2 * optimizedParams.srcStrides[2] + i3 * optimizedParams.srcStrides[3] + + i4 * optimizedParams.srcStrides[4]); + auto dstData2 = dstData + (i0 * optimizedParams.dstStrides[0] + i1 * optimizedParams.dstStrides[1] + i2 * optimizedParams.dstStrides[2] + i3 * optimizedParams.dstStrides[3] + i4 * optimizedParams.dstStrides[4]); - cpu_memcpy(dstData2, srcData2, optimizedParams.copySize); - }); + cpu_memcpy(dstData2, srcData2, optimizedParams.copySize); + }); } } -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/common/tile_broadcast_utils.h b/src/plugins/intel_cpu/src/nodes/common/tile_broadcast_utils.h index 7ae0eacbccd373..6638eba7f88a39 100644 --- a/src/plugins/intel_cpu/src/nodes/common/tile_broadcast_utils.h +++ b/src/plugins/intel_cpu/src/nodes/common/tile_broadcast_utils.h @@ -9,27 +9,28 @@ #include #include - namespace ov { namespace intel_cpu { class TileBroadcastCommon { protected: - static VectorDims calculateDenseStrides(const VectorDims &dims); - std::vector getSupportedConfigs(const Node *node, size_t outSize); - bool prepareOptimizedParams(const Node *node, VectorDims& srcBlockedDims, VectorDims& dstBlockedDims); + static VectorDims calculateDenseStrides(const VectorDims& dims); + std::vector getSupportedConfigs(const Node* node, size_t outSize); + bool prepareOptimizedParams(const Node* node, VectorDims& srcBlockedDims, VectorDims& dstBlockedDims); void optimizedExecute(const MemoryPtr& srcMemory, const MemoryPtr& dstMemory); VectorDims repeats; bool optimizedCase = false; - bool constMap[3] = { false }; + bool constMap[3] = {false}; mutable bool needPrepareParamsVar = false; private: - static void fillOptimizedDimsAndSrcStrides(const VectorDims &srcBlockedDims, const VectorDims &blockedRepeats, - VectorDims &optimizedDims, VectorDims &optimizedSrcStrides); - static void broadcastScalar(const char *srcData, char *dstData, size_t elt_cnt, size_t data_size); + static void fillOptimizedDimsAndSrcStrides(const VectorDims& srcBlockedDims, + const VectorDims& blockedRepeats, + VectorDims& optimizedDims, + VectorDims& optimizedSrcStrides); + static void broadcastScalar(const char* srcData, char* dstData, size_t elt_cnt, size_t data_size); static bool canBeExecutedInBlockedLayout(VectorDims srcDims, VectorDims repeats, const size_t elemsInBlock); static bool canBeExecutedInNSPCLayout(VectorDims srcDims, VectorDims repeats); @@ -42,5 +43,5 @@ class TileBroadcastCommon { } optimizedParams; }; -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/common/uni_simd.h b/src/plugins/intel_cpu/src/nodes/common/uni_simd.h index 7f2cdc7bed4821..dbcec60baa7d4c 100644 --- a/src/plugins/intel_cpu/src/nodes/common/uni_simd.h +++ b/src/plugins/intel_cpu/src/nodes/common/uni_simd.h @@ -5,7 +5,7 @@ #pragma once #if defined(HAVE_SSE) || defined(HAVE_AVX2) || defined(HAVE_AVX512F) -#include +# include #endif namespace ov { @@ -14,348 +14,350 @@ namespace Cpu { #if defined(HAVE_AVX512F) namespace AVX512F { - static inline __m512 _mm_uni_any_ps() { - return __m512{}; - 
} - - static inline __m512i _mm_uni_any_epi32() { - return __m512i{}; - } - - static inline __m512 _mm_uni_loadu_ps(const float* psrc) { - return _mm512_mask_loadu_ps(_mm_uni_any_ps(), (__mmask16)-1, psrc); - } - - static inline void _mm_uni_storeu_ps(float* pdst, const __m512& vec) { - _mm512_storeu_ps(pdst, vec); - } - - static inline void _mm_uni_storeu_si(void* pdst, const __m512i vec) { - _mm512_storeu_si512(pdst, vec); - } - - static inline __m512 _mm_uni_setzero_ps() { - return _mm512_setzero_ps(); - } - - static inline __m512 _mm_uni_set1_ps(float value) { - return _mm512_set1_ps(value); - } - - static inline __m512 _mm_uni_add_ps(__m512 vec0, __m512 vec1) { - return _mm512_add_ps(vec0, vec1); - } - - static inline __m512 _mm_uni_sub_ps(__m512 vec0, __m512 vec1) { - return _mm512_sub_ps(vec0, vec1); - } - - static inline __m512 _mm_uni_mul_ps(__m512 vec0, __m512 vec1) { - return _mm512_mul_ps(vec0, vec1); - } - - static inline __m512 _mm_uni_div_ps(__m512 vec0, __m512 vec1) { - return _mm512_div_ps(vec0, vec1); - } - - static inline __m512 _mm_uni_sqrt_ps(__m512 vec) { - return _mm512_sqrt_ps(vec); - } - - static inline __m512 _mm_uni_and_ps(__m512 vec0, __m512 vec1) { - return _mm512_castsi512_ps(_mm512_and_epi32(_mm512_castps_si512(vec0), _mm512_castps_si512(vec1))); - } - - static inline __m512 _mm_uni_or_ps(__m512 vec0, __m512 vec1) { - return _mm512_castsi512_ps(_mm512_or_epi32(_mm512_castps_si512(vec0), _mm512_castps_si512(vec1))); - } - - static inline __m512i _mm_uni_set1_epi32(int value) { - return _mm512_mask_set1_epi32(_mm_uni_any_epi32(), (__mmask16)-1, value); - } - - static inline __m512 _mm_uni_blendv_ps(__m512 vec0, __m512 vec1, __m512 vmask) { - return _mm512_mask_blend_ps(_mm512_cmpneq_epi32_mask(_mm512_castps_si512(vmask), _mm_uni_set1_epi32(0)), vec0, vec1); - } - - static inline __m512 _mm_uni_blendv_ps(__m512 vec0, __m512 vec1, __mmask16 vmask) { - return _mm512_mask_blend_ps(vmask, vec0, vec1); - } - - static inline __m512 _mm_uni_min_ps(__m512 vec0, __m512 vec1) { - return _mm512_min_ps(vec0, vec1); - } - - static inline __m512 _mm_uni_max_ps(__m512 vec0, __m512 vec1) { - return _mm512_max_ps(vec0, vec1); - } - - static inline __m512 _mm_uni_floor_ps(__m512 vec) { - return _mm512_floor_ps(vec); - } - - static inline __m512i _mm_uni_cvtps_epi32(__m512 vec) { - return _mm512_cvtps_epi32(vec); - } - - static inline __m512i _mm_uni_add_epi32(__m512i vec0, __m512i vec1) { - return _mm512_add_epi32(vec0, vec1); - } - - static inline __m512i _mm_uni_slli_epi32(__m512i vec, int value) { - return _mm512_sll_epi32(vec, _mm_set1_epi64x(value)); - } - - static inline __m512 _mm_uni_castsi_ps(__m512i vec) { - return _mm512_castsi512_ps(vec); - } - - static inline __m512i _mm_uni_setzero_si() { - return _mm512_setzero_si512(); - } - - static inline __mmask16 _mm_uni_cmpgt_ps(__m512 vec0, __m512 vec1) { - return _mm512_cmp_ps_mask(vec0, vec1, 14); - } - - static inline __mmask16 _mm_uni_cmpgt_i32(__m512i vec0, __m512i vec1) { - return _mm512_cmp_epi32_mask(vec1, vec0, 1); - } - - static inline __m512i _mm_uni_castps_si(__m512 vec) { - return _mm512_castps_si512(vec); - } - - static inline __m512 _mm_uni_cvtepi32_ps(__m512i vec) { - return _mm512_mask_cvtepi32_ps(_mm_uni_any_ps(), (__mmask16)-1, vec); - } +static inline __m512 _mm_uni_any_ps() { + return __m512{}; +} + +static inline __m512i _mm_uni_any_epi32() { + return __m512i{}; +} + +static inline __m512 _mm_uni_loadu_ps(const float* psrc) { + return _mm512_mask_loadu_ps(_mm_uni_any_ps(), (__mmask16)-1, psrc); +} + 
+static inline void _mm_uni_storeu_ps(float* pdst, const __m512& vec) { + _mm512_storeu_ps(pdst, vec); +} + +static inline void _mm_uni_storeu_si(void* pdst, const __m512i vec) { + _mm512_storeu_si512(pdst, vec); +} + +static inline __m512 _mm_uni_setzero_ps() { + return _mm512_setzero_ps(); +} + +static inline __m512 _mm_uni_set1_ps(float value) { + return _mm512_set1_ps(value); +} + +static inline __m512 _mm_uni_add_ps(__m512 vec0, __m512 vec1) { + return _mm512_add_ps(vec0, vec1); +} + +static inline __m512 _mm_uni_sub_ps(__m512 vec0, __m512 vec1) { + return _mm512_sub_ps(vec0, vec1); +} + +static inline __m512 _mm_uni_mul_ps(__m512 vec0, __m512 vec1) { + return _mm512_mul_ps(vec0, vec1); +} + +static inline __m512 _mm_uni_div_ps(__m512 vec0, __m512 vec1) { + return _mm512_div_ps(vec0, vec1); +} + +static inline __m512 _mm_uni_sqrt_ps(__m512 vec) { + return _mm512_sqrt_ps(vec); +} + +static inline __m512 _mm_uni_and_ps(__m512 vec0, __m512 vec1) { + return _mm512_castsi512_ps(_mm512_and_epi32(_mm512_castps_si512(vec0), _mm512_castps_si512(vec1))); +} + +static inline __m512 _mm_uni_or_ps(__m512 vec0, __m512 vec1) { + return _mm512_castsi512_ps(_mm512_or_epi32(_mm512_castps_si512(vec0), _mm512_castps_si512(vec1))); +} + +static inline __m512i _mm_uni_set1_epi32(int value) { + return _mm512_mask_set1_epi32(_mm_uni_any_epi32(), (__mmask16)-1, value); +} + +static inline __m512 _mm_uni_blendv_ps(__m512 vec0, __m512 vec1, __m512 vmask) { + return _mm512_mask_blend_ps(_mm512_cmpneq_epi32_mask(_mm512_castps_si512(vmask), _mm_uni_set1_epi32(0)), + vec0, + vec1); +} + +static inline __m512 _mm_uni_blendv_ps(__m512 vec0, __m512 vec1, __mmask16 vmask) { + return _mm512_mask_blend_ps(vmask, vec0, vec1); +} + +static inline __m512 _mm_uni_min_ps(__m512 vec0, __m512 vec1) { + return _mm512_min_ps(vec0, vec1); +} + +static inline __m512 _mm_uni_max_ps(__m512 vec0, __m512 vec1) { + return _mm512_max_ps(vec0, vec1); +} + +static inline __m512 _mm_uni_floor_ps(__m512 vec) { + return _mm512_floor_ps(vec); +} + +static inline __m512i _mm_uni_cvtps_epi32(__m512 vec) { + return _mm512_cvtps_epi32(vec); +} + +static inline __m512i _mm_uni_add_epi32(__m512i vec0, __m512i vec1) { + return _mm512_add_epi32(vec0, vec1); +} + +static inline __m512i _mm_uni_slli_epi32(__m512i vec, int value) { + return _mm512_sll_epi32(vec, _mm_set1_epi64x(value)); +} + +static inline __m512 _mm_uni_castsi_ps(__m512i vec) { + return _mm512_castsi512_ps(vec); +} + +static inline __m512i _mm_uni_setzero_si() { + return _mm512_setzero_si512(); +} + +static inline __mmask16 _mm_uni_cmpgt_ps(__m512 vec0, __m512 vec1) { + return _mm512_cmp_ps_mask(vec0, vec1, 14); +} + +static inline __mmask16 _mm_uni_cmpgt_i32(__m512i vec0, __m512i vec1) { + return _mm512_cmp_epi32_mask(vec1, vec0, 1); +} + +static inline __m512i _mm_uni_castps_si(__m512 vec) { + return _mm512_castps_si512(vec); +} + +static inline __m512 _mm_uni_cvtepi32_ps(__m512i vec) { + return _mm512_mask_cvtepi32_ps(_mm_uni_any_ps(), (__mmask16)-1, vec); +} } // namespace AVX512F #elif defined(HAVE_AVX2) namespace AVX2 { - static inline __m256 _mm_uni_loadu_ps(const float* psrc) { - return _mm256_loadu_ps(psrc); - } +static inline __m256 _mm_uni_loadu_ps(const float* psrc) { + return _mm256_loadu_ps(psrc); +} - static inline void _mm_uni_storeu_ps(float* pdst, const __m256 vec) { - _mm256_storeu_ps(pdst, vec); - } +static inline void _mm_uni_storeu_ps(float* pdst, const __m256 vec) { + _mm256_storeu_ps(pdst, vec); +} - static inline void _mm_uni_storeu_si(__m256i* pdst, const 
__m256i vec) { - _mm256_storeu_si256(pdst, vec); - } +static inline void _mm_uni_storeu_si(__m256i* pdst, const __m256i vec) { + _mm256_storeu_si256(pdst, vec); +} - static inline __m256 _mm_uni_setzero_ps() { - return _mm256_setzero_ps(); - } +static inline __m256 _mm_uni_setzero_ps() { + return _mm256_setzero_ps(); +} - static inline __m256 _mm_uni_set1_ps(float value) { - return _mm256_set1_ps(value); - } +static inline __m256 _mm_uni_set1_ps(float value) { + return _mm256_set1_ps(value); +} - static inline __m256 _mm_uni_add_ps(__m256 vec0, __m256 vec1) { - return _mm256_add_ps(vec0, vec1); - } +static inline __m256 _mm_uni_add_ps(__m256 vec0, __m256 vec1) { + return _mm256_add_ps(vec0, vec1); +} - static inline __m256 _mm_uni_sub_ps(__m256 vec0, __m256 vec1) { - return _mm256_sub_ps(vec0, vec1); - } +static inline __m256 _mm_uni_sub_ps(__m256 vec0, __m256 vec1) { + return _mm256_sub_ps(vec0, vec1); +} - static inline __m256 _mm_uni_mul_ps(__m256 vec0, __m256 vec1) { - return _mm256_mul_ps(vec0, vec1); - } +static inline __m256 _mm_uni_mul_ps(__m256 vec0, __m256 vec1) { + return _mm256_mul_ps(vec0, vec1); +} - static inline __m256 _mm_uni_div_ps(__m256 vec0, __m256 vec1) { - return _mm256_div_ps(vec0, vec1); - } +static inline __m256 _mm_uni_div_ps(__m256 vec0, __m256 vec1) { + return _mm256_div_ps(vec0, vec1); +} - static inline __m256 _mm_uni_sqrt_ps(__m256 vec) { - return _mm256_sqrt_ps(vec); - } +static inline __m256 _mm_uni_sqrt_ps(__m256 vec) { + return _mm256_sqrt_ps(vec); +} - static inline __m256 _mm_uni_and_ps(__m256 vec0, __m256 vec1) { - return _mm256_and_ps(vec0, vec1); - } +static inline __m256 _mm_uni_and_ps(__m256 vec0, __m256 vec1) { + return _mm256_and_ps(vec0, vec1); +} - static inline __m256 _mm_uni_or_ps(__m256 vec0, __m256 vec1) { - return _mm256_or_ps(vec0, vec1); - } +static inline __m256 _mm_uni_or_ps(__m256 vec0, __m256 vec1) { + return _mm256_or_ps(vec0, vec1); +} - static inline __m256 _mm_uni_blendv_ps(__m256 vec0, __m256 vec1, __m256 vmask) { - return _mm256_blendv_ps(vec0, vec1, vmask); - } +static inline __m256 _mm_uni_blendv_ps(__m256 vec0, __m256 vec1, __m256 vmask) { + return _mm256_blendv_ps(vec0, vec1, vmask); +} - static inline __m256 _mm_uni_min_ps(__m256 vec0, __m256 vec1) { - return _mm256_min_ps(vec0, vec1); - } +static inline __m256 _mm_uni_min_ps(__m256 vec0, __m256 vec1) { + return _mm256_min_ps(vec0, vec1); +} - static inline __m256 _mm_uni_max_ps(__m256 vec0, __m256 vec1) { - return _mm256_max_ps(vec0, vec1); - } +static inline __m256 _mm_uni_max_ps(__m256 vec0, __m256 vec1) { + return _mm256_max_ps(vec0, vec1); +} - static inline __m256 _mm_uni_floor_ps(__m256 vec) { - return _mm256_floor_ps(vec); - } +static inline __m256 _mm_uni_floor_ps(__m256 vec) { + return _mm256_floor_ps(vec); +} - static inline __m256i _mm_uni_cvtps_epi32(__m256 vec) { - return _mm256_cvtps_epi32(vec); - } +static inline __m256i _mm_uni_cvtps_epi32(__m256 vec) { + return _mm256_cvtps_epi32(vec); +} - static inline __m256i _mm_uni_add_epi32(__m256i vec0, __m256i vec1) { - return _mm256_add_epi32(vec0, vec1); - } +static inline __m256i _mm_uni_add_epi32(__m256i vec0, __m256i vec1) { + return _mm256_add_epi32(vec0, vec1); +} - static inline __m256i _mm_uni_set1_epi32(int value) { - return _mm256_set1_epi32(value); - } +static inline __m256i _mm_uni_set1_epi32(int value) { + return _mm256_set1_epi32(value); +} - static inline __m256i _mm_uni_slli_epi32(__m256i vec, int value) { - return _mm256_slli_epi32(vec, value); - } +static inline __m256i 
_mm_uni_slli_epi32(__m256i vec, int value) { + return _mm256_slli_epi32(vec, value); +} - static inline __m256 _mm_uni_castsi_ps(__m256i vec) { - return _mm256_castsi256_ps(vec); - } +static inline __m256 _mm_uni_castsi_ps(__m256i vec) { + return _mm256_castsi256_ps(vec); +} - static inline __m256i _mm_uni_setzero_si() { - return _mm256_setzero_si256(); - } +static inline __m256i _mm_uni_setzero_si() { + return _mm256_setzero_si256(); +} - static inline __m256 _mm_uni_cmpgt_ps(__m256 vec0, __m256 vec1) { - return _mm256_cmp_ps(vec0, vec1, 14); - } +static inline __m256 _mm_uni_cmpgt_ps(__m256 vec0, __m256 vec1) { + return _mm256_cmp_ps(vec0, vec1, 14); +} - static inline __m256 _mm_uni_cmpgt_i32(__m256i vec0, __m256i vec1) { - return _mm256_cvtepi32_ps(_mm256_cmpgt_epi32(vec0, vec1)); - } +static inline __m256 _mm_uni_cmpgt_i32(__m256i vec0, __m256i vec1) { + return _mm256_cvtepi32_ps(_mm256_cmpgt_epi32(vec0, vec1)); +} - static inline __m256i _mm_uni_blendv_epi8(__m256i vec0, __m256i vec1, __m256i vmask) { - return _mm256_blendv_epi8(vec0, vec1, vmask); - } +static inline __m256i _mm_uni_blendv_epi8(__m256i vec0, __m256i vec1, __m256i vmask) { + return _mm256_blendv_epi8(vec0, vec1, vmask); +} - static inline __m256i _mm_uni_castps_si(__m256 vec) { - return _mm256_castps_si256(vec); - } +static inline __m256i _mm_uni_castps_si(__m256 vec) { + return _mm256_castps_si256(vec); +} - static inline __m256 _mm_uni_cvtepi32_ps(__m256i vec) { - return _mm256_cvtepi32_ps(vec); - } +static inline __m256 _mm_uni_cvtepi32_ps(__m256i vec) { + return _mm256_cvtepi32_ps(vec); +} - static inline int _mm_uni_movemask_ps(__m256 vec) { - return _mm256_movemask_ps(vec); - } +static inline int _mm_uni_movemask_ps(__m256 vec) { + return _mm256_movemask_ps(vec); +} } // namespace AVX2 #elif defined(HAVE_SSE42) namespace SSE42 { - static inline __m128 _mm_uni_loadu_ps(const float* psrc) { - return _mm_loadu_ps(psrc); - } +static inline __m128 _mm_uni_loadu_ps(const float* psrc) { + return _mm_loadu_ps(psrc); +} - static inline void _mm_uni_storeu_ps(float* pdst, const __m128 vec) { - _mm_storeu_ps(pdst, vec); - } - - static inline void _mm_uni_storeu_si(__m128i* pdst, const __m128i vec) { - _mm_storeu_si128(pdst, vec); - } - - static inline __m128 _mm_uni_setzero_ps() { - return _mm_setzero_ps(); - } - - static inline __m128 _mm_uni_set1_ps(float value) { - return _mm_set1_ps(value); - } - - static inline __m128 _mm_uni_add_ps(__m128 vec0, __m128 vec1) { - return _mm_add_ps(vec0, vec1); - } - - static inline __m128 _mm_uni_sub_ps(__m128 vec0, __m128 vec1) { - return _mm_sub_ps(vec0, vec1); - } - - static inline __m128 _mm_uni_mul_ps(__m128 vec0, __m128 vec1) { - return _mm_mul_ps(vec0, vec1); - } - - static inline __m128 _mm_uni_div_ps(__m128 vec0, __m128 vec1) { - return _mm_div_ps(vec0, vec1); - } - - static inline __m128 _mm_uni_sqrt_ps(__m128 vec) { - return _mm_sqrt_ps(vec); - } - - static inline __m128 _mm_uni_and_ps(__m128 vec0, __m128 vec1) { - return _mm_and_ps(vec0, vec1); - } - - static inline __m128 _mm_uni_or_ps(__m128 vec0, __m128 vec1) { - return _mm_or_ps(vec0, vec1); - } - - static inline __m128 _mm_uni_blendv_ps(__m128 vec0, __m128 vec1, __m128 vmask) { - return _mm_blendv_ps(vec0, vec1, vmask); - } - - static inline __m128 _mm_uni_min_ps(__m128 vec0, __m128 vec1) { - return _mm_min_ps(vec0, vec1); - } - - static inline __m128 _mm_uni_max_ps(__m128 vec0, __m128 vec1) { - return _mm_max_ps(vec0, vec1); - } - - static inline __m128 _mm_uni_floor_ps(__m128 vec) { - return _mm_floor_ps(vec); - } - 
- static inline __m128i _mm_uni_cvtps_epi32(__m128 vec) { - return _mm_cvtps_epi32(vec); - } - - static inline __m128i _mm_uni_add_epi32(__m128i vec0, __m128i vec1) { - return _mm_add_epi32(vec0, vec1); - } - - static inline __m128i _mm_uni_set1_epi32(int value) { - return _mm_set1_epi32(value); - } - - static inline __m128i _mm_uni_slli_epi32(__m128i vec, int value) { - return _mm_slli_epi32(vec, value); - } - - static inline __m128 _mm_uni_castsi_ps(__m128i vec) { - return _mm_castsi128_ps(vec); - } - - static inline __m128i _mm_uni_setzero_si() { - return _mm_setzero_si128(); - } - - static inline __m128 _mm_uni_cmpgt_ps(__m128 vec0, __m128 vec1) { - return _mm_cmpgt_ps(vec0, vec1); - } - - static inline __m128 _mm_uni_cmpgt_i32(__m128i vec0, __m128i vec1) { - return _mm_cvtepi32_ps(_mm_cmpgt_epi32(vec0, vec1)); - } - - static inline __m128i _mm_uni_blendv_epi8(__m128i vec0, __m128i vec1, __m128i vmask) { - return _mm_blendv_epi8(vec0, vec1, vmask); - } - - static inline __m128i _mm_uni_castps_si(__m128 vec) { - return _mm_castps_si128(vec); - } +static inline void _mm_uni_storeu_ps(float* pdst, const __m128 vec) { + _mm_storeu_ps(pdst, vec); +} + +static inline void _mm_uni_storeu_si(__m128i* pdst, const __m128i vec) { + _mm_storeu_si128(pdst, vec); +} + +static inline __m128 _mm_uni_setzero_ps() { + return _mm_setzero_ps(); +} + +static inline __m128 _mm_uni_set1_ps(float value) { + return _mm_set1_ps(value); +} + +static inline __m128 _mm_uni_add_ps(__m128 vec0, __m128 vec1) { + return _mm_add_ps(vec0, vec1); +} + +static inline __m128 _mm_uni_sub_ps(__m128 vec0, __m128 vec1) { + return _mm_sub_ps(vec0, vec1); +} + +static inline __m128 _mm_uni_mul_ps(__m128 vec0, __m128 vec1) { + return _mm_mul_ps(vec0, vec1); +} + +static inline __m128 _mm_uni_div_ps(__m128 vec0, __m128 vec1) { + return _mm_div_ps(vec0, vec1); +} + +static inline __m128 _mm_uni_sqrt_ps(__m128 vec) { + return _mm_sqrt_ps(vec); +} + +static inline __m128 _mm_uni_and_ps(__m128 vec0, __m128 vec1) { + return _mm_and_ps(vec0, vec1); +} + +static inline __m128 _mm_uni_or_ps(__m128 vec0, __m128 vec1) { + return _mm_or_ps(vec0, vec1); +} + +static inline __m128 _mm_uni_blendv_ps(__m128 vec0, __m128 vec1, __m128 vmask) { + return _mm_blendv_ps(vec0, vec1, vmask); +} + +static inline __m128 _mm_uni_min_ps(__m128 vec0, __m128 vec1) { + return _mm_min_ps(vec0, vec1); +} + +static inline __m128 _mm_uni_max_ps(__m128 vec0, __m128 vec1) { + return _mm_max_ps(vec0, vec1); +} + +static inline __m128 _mm_uni_floor_ps(__m128 vec) { + return _mm_floor_ps(vec); +} + +static inline __m128i _mm_uni_cvtps_epi32(__m128 vec) { + return _mm_cvtps_epi32(vec); +} + +static inline __m128i _mm_uni_add_epi32(__m128i vec0, __m128i vec1) { + return _mm_add_epi32(vec0, vec1); +} + +static inline __m128i _mm_uni_set1_epi32(int value) { + return _mm_set1_epi32(value); +} + +static inline __m128i _mm_uni_slli_epi32(__m128i vec, int value) { + return _mm_slli_epi32(vec, value); +} + +static inline __m128 _mm_uni_castsi_ps(__m128i vec) { + return _mm_castsi128_ps(vec); +} + +static inline __m128i _mm_uni_setzero_si() { + return _mm_setzero_si128(); +} + +static inline __m128 _mm_uni_cmpgt_ps(__m128 vec0, __m128 vec1) { + return _mm_cmpgt_ps(vec0, vec1); +} + +static inline __m128 _mm_uni_cmpgt_i32(__m128i vec0, __m128i vec1) { + return _mm_cvtepi32_ps(_mm_cmpgt_epi32(vec0, vec1)); +} + +static inline __m128i _mm_uni_blendv_epi8(__m128i vec0, __m128i vec1, __m128i vmask) { + return _mm_blendv_epi8(vec0, vec1, vmask); +} + +static inline __m128i 
_mm_uni_castps_si(__m128 vec) { + return _mm_castps_si128(vec); +} - static inline __m128 _mm_uni_cvtepi32_ps(__m128i vec) { - return _mm_cvtepi32_ps(vec); - } - static inline int _mm_uni_movemask_ps(__m128 vec) { - return _mm_movemask_ps(vec); - } +static inline __m128 _mm_uni_cvtepi32_ps(__m128i vec) { + return _mm_cvtepi32_ps(vec); +} +static inline int _mm_uni_movemask_ps(__m128 vec) { + return _mm_movemask_ps(vec); +} } // namespace SSE42 #endif diff --git a/src/plugins/intel_cpu/src/nodes/composite.cpp b/src/plugins/intel_cpu/src/nodes/composite.cpp index a1ceabd6942db1..0d8b33d90fbd9c 100644 --- a/src/plugins/intel_cpu/src/nodes/composite.cpp +++ b/src/plugins/intel_cpu/src/nodes/composite.cpp @@ -4,11 +4,11 @@ #include "composite.h" -#include "nodes/input.h" #include "cpu_memory.h" +#include "nodes/input.h" +#include "shape_inference/shape_inference_internal_dyn.hpp" #include "transformations/cpu_opset/common/op/submodel.hpp" #include "utils/debug_capabilities.h" -#include "shape_inference/shape_inference_internal_dyn.hpp" namespace ov { namespace intel_cpu { @@ -75,7 +75,7 @@ void Composite::selectOptimalPrimitiveDescriptor() { // @todo add ascii diagramm for memory mapping / reuse void Composite::createPrimitive() { - OPENVINO_ASSERT(getOriginalInputsNumber() == m_graph.GetInputNodesMap().size(), + OPENVINO_ASSERT(getOriginalInputsNumber() == m_graph.inputsNumber(), "Number of node inputs must be equal the number of inner graph's inputs"); std::vector inputMemory; @@ -83,7 +83,7 @@ void Composite::createPrimitive() { inputMemory.emplace_back(getSrcMemoryAtPort(i)); } - OPENVINO_ASSERT(getOriginalOutputsNumber() == m_graph.GetOutputNodesMap().size(), + OPENVINO_ASSERT(getOriginalOutputsNumber() == m_graph.outputsNumber(), "Number of node outputs must be equal the number of inner graph's outputs"); std::vector outputMemory; diff --git a/src/plugins/intel_cpu/src/nodes/concat.cpp b/src/plugins/intel_cpu/src/nodes/concat.cpp index 635f37b2d05b3a..ef621947d723a7 100644 --- a/src/plugins/intel_cpu/src/nodes/concat.cpp +++ b/src/plugins/intel_cpu/src/nodes/concat.cpp @@ -4,29 +4,29 @@ #include "concat.h" -#include "openvino/op/concat.hpp" +#include +#include +#include +#include +#include #include #include #include -#include "dnnl_extension_utils.h" +#include "common/blocked_desc_creator.h" +#include "common/cpu_memcpy.h" +#include "dnnl_extension_utils.h" #include "onednn/dnnl.h" -#include -#include -#include #include "openvino/core/parallel.hpp" -#include "common/cpu_memcpy.h" -#include "common/blocked_desc_creator.h" -#include -#include +#include "openvino/op/concat.hpp" using namespace dnnl; namespace ov { namespace intel_cpu { namespace node { namespace { - constexpr size_t channelAxis = 1lu; +constexpr size_t channelAxis = 1lu; } bool Concat::isExecutable() const { @@ -86,11 +86,14 @@ void Concat::getSupportedDescriptors() { } } - // we need the first dims before axis to be 1 to avoid the reorder in the edge between the first parent and this concat + // we need the first dims before axis to be 1 to avoid the reorder in the edge between the first parent and this + // concat const auto& childDims = outputShapes[0].getDims(); if (childDims[axis] != Shape::UNDEFINED_DIM && - std::all_of(childDims.begin(), childDims.begin() + axis, [](size_t dim) { return dim == 1; })) + std::all_of(childDims.begin(), childDims.begin() + axis, [](size_t dim) { + return dim == 1; + })) canBeInPlace = true; } @@ -118,11 +121,11 @@ void Concat::initSupportedPrimitiveDescriptors() { const auto& dstShape = 
getOutputShapeAtPort(0); std::vector tdCreatorTypes = {LayoutType::ncsp, LayoutType::nspc}; - // check if blocked layouts are available the channels size should be evenly divided by the block size to avoid slow oneDNN ref implementation and allow - // inPlace memory usage if possible + // check if blocked layouts are available the channels size should be evenly divided by the block size to avoid slow + // oneDNN ref implementation and allow inPlace memory usage if possible if (dstShape.getRank() > channelAxis) { - for (auto& item : { std::make_pair(8lu, LayoutType::nCsp8c), std::make_pair(16lu, LayoutType::nCsp16c)}) { - const VectorDims &blkDims = dstShape.getDims(); + for (auto& item : {std::make_pair(8lu, LayoutType::nCsp8c), std::make_pair(16lu, LayoutType::nCsp16c)}) { + const VectorDims& blkDims = dstShape.getDims(); if (blkDims[channelAxis] == Shape::UNDEFINED_DIM || blkDims[channelAxis] % item.first != 0) continue; @@ -144,7 +147,8 @@ void Concat::initSupportedPrimitiveDescriptors() { auto& creatorsMap = BlockedDescCreator::getCommonCreators(); - auto itrRange = BlockedDescCreator::makeFilteredRange(creatorsMap, static_cast(dstShape.getRank()), tdCreatorTypes); + auto itrRange = + BlockedDescCreator::makeFilteredRange(creatorsMap, static_cast(dstShape.getRank()), tdCreatorTypes); for (auto itr = itrRange.first; itr != itrRange.second; ++itr) { NodeConfig config; @@ -183,12 +187,15 @@ void Concat::initSupportedPrimitiveDescriptors() { } } - if (!canBeInPlace || std::any_of(inputShapes.begin(), inputShapes.end(), [](const Shape& shape) { return shape.hasZeroDims(); })) + if (!canBeInPlace || std::any_of(inputShapes.begin(), inputShapes.end(), [](const Shape& shape) { + return shape.hasZeroDims(); + })) return; // Optimized inplace case for (auto refPdIndex : pdIndexesToReuse) { - auto config = supportedPrimitiveDescriptors[refPdIndex].getConfig();; + auto config = supportedPrimitiveDescriptors[refPdIndex].getConfig(); + ; for (size_t i = 0; i < config.inConfs.size(); i++) { config.inConfs[i].inPlace(0); } @@ -204,12 +211,16 @@ void Concat::selectOptimalPrimitiveDescriptor() { // for that case. 
for (size_t i = 0; i < getParentEdges().size(); i++) { for (size_t j = i + 1; j < getParentEdges().size(); j++) { - if (getParentEdgeAt(i) == getParentEdgeAt(j)) canBeInPlace = false; + if (getParentEdgeAt(i) == getParentEdgeAt(j)) + canBeInPlace = false; } } std::map formatFrequency; - std::vector supportedLayouts = {LayoutType::ncsp, LayoutType::nspc, LayoutType::nCsp8c, LayoutType::nCsp16c}; + std::vector supportedLayouts = {LayoutType::ncsp, + LayoutType::nspc, + LayoutType::nCsp8c, + LayoutType::nCsp16c}; for (size_t i = 0; i < getParentEdges().size(); i++) { auto parentEdge = getParentEdgeAt(i); auto parent = parentEdge->getParent(); @@ -218,11 +229,11 @@ void Concat::selectOptimalPrimitiveDescriptor() { if (parent_pdesc == nullptr) continue; - const auto &parent_config = parent_pdesc->getConfig(); + const auto& parent_config = parent_pdesc->getConfig(); int outputIndex = parentEdge->getInputNum(); if (outputIndex < 0 || outputIndex >= static_cast(parent_config.outConfs.size())) OPENVINO_THROW("Cannot find index of output node"); - const auto &port_desc = parent_config.outConfs[outputIndex].getMemDesc(); + const auto& port_desc = parent_config.outConfs[outputIndex].getMemDesc(); for (auto& item : supportedLayouts) { if (port_desc->hasLayoutType(item)) { formatFrequency[item] += 1; @@ -232,15 +243,15 @@ void Concat::selectOptimalPrimitiveDescriptor() { for (size_t i = 0; i < getChildEdges().size(); i++) { auto childEdge = getChildEdgeAt(i); auto child = childEdge->getChild(); - const auto *prim_desc = child->getSelectedPrimitiveDescriptor(); + const auto* prim_desc = child->getSelectedPrimitiveDescriptor(); if (prim_desc == nullptr) continue; - const auto &config = prim_desc->getConfig(); + const auto& config = prim_desc->getConfig(); int inputIndex = childEdge->getOutputNum(); if (inputIndex < 0 || inputIndex >= static_cast(config.inConfs.size())) OPENVINO_THROW("Cannot find index of output node"); - const auto &port_desc = config.inConfs[inputIndex].getMemDesc(); + const auto& port_desc = config.inConfs[inputIndex].getMemDesc(); for (auto& item : supportedLayouts) { if (port_desc->hasLayoutType(item)) { formatFrequency[item] += 1; @@ -249,9 +260,9 @@ void Concat::selectOptimalPrimitiveDescriptor() { } size_t maxCount = 0; - const auto &outDims = getOutputShapeAtPort(0).getDims(); + const auto& outDims = getOutputShapeAtPort(0).getDims(); LayoutType convertTo = LayoutType::ncsp; - for (auto &it : formatFrequency) { + for (auto& it : formatFrequency) { if (it.second > maxCount) { maxCount = it.second; convertTo = it.first; @@ -264,7 +275,7 @@ void Concat::selectOptimalPrimitiveDescriptor() { } } - for (auto& item : { std::make_pair(8lu, LayoutType::nCsp8c), std::make_pair(16lu, LayoutType::nCsp16c) }) { + for (auto& item : {std::make_pair(8lu, LayoutType::nCsp8c), std::make_pair(16lu, LayoutType::nCsp16c)}) { if (convertTo == item.second) { if (outDims[channelAxis] == Shape::UNDEFINED_DIM || outDims[1] % item.first != 0) { convertTo = LayoutType::ncsp; @@ -282,7 +293,8 @@ void Concat::selectOptimalPrimitiveDescriptor() { for (size_t i = 0; i < supportedPrimitiveDescriptors.size(); ++i) { if (supportedPrimitiveDescriptors[i].getConfig().outConfs[0].getMemDesc()->hasLayoutType(convertTo)) { - if (IMPLICATION(supportedPrimitiveDescriptors[i].getImplementationType() == impl_desc_type::unknown, canBeInPlace)) { + if (IMPLICATION(supportedPrimitiveDescriptors[i].getImplementationType() == impl_desc_type::unknown, + canBeInPlace)) { canSelectPrimitive.push_back(i); } } @@ -444,24 +456,26 @@ 
void Concat::initOptimalPrimitiveDescriptor() { if (selected_pd == nullptr) OPENVINO_THROW("Preferable primitive descriptor is not set."); - if (!isInPlace()) { - Node::initOptimalPrimitiveDescriptor(); + if (!isInPlace()) { + Node::initOptimalPrimitiveDescriptor(); auto config = selected_pd->getConfig(); if (!isConfigDefined(config)) { for (size_t i = 0; i < config.inConfs.size(); i++) { // Concat doesn't support different precision on inputs - config.inConfs[i].setMemDesc(getConsistentInputDesc(config, i)->getMemDesc()->cloneWithNewPrecision(inputPrecision)); + config.inConfs[i].setMemDesc( + getConsistentInputDesc(config, i)->getMemDesc()->cloneWithNewPrecision(inputPrecision)); } for (size_t i = 0; i < config.outConfs.size(); i++) { - config.outConfs[i].setMemDesc(getConsistentOutputDesc(config, i)->getMemDesc()->cloneWithNewPrecision(outputPrecision)); + config.outConfs[i].setMemDesc( + getConsistentOutputDesc(config, i)->getMemDesc()->cloneWithNewPrecision(outputPrecision)); } initDescriptor(config); } } - //block layout may have axis greater than rank, disable ref_concat + // block layout may have axis greater than rank, disable ref_concat auto primDesc = getSelectedPrimitiveDescriptor(); auto memDesc = primDesc->getConfig().outConfs[0].getMemDesc()->as(); auto rank = memDesc->getShape().getRank(); @@ -474,7 +488,9 @@ void Concat::initOptimalPrimitiveDescriptor() { srcPtrs.resize(getParentEdges().size()); } // check if selected Tensor descriptor has nspc layout and concat axis is C - canOptimizeNspc = axis == channelAxis && getSelectedPrimitiveDescriptor()->getConfig().outConfs.front().getMemDesc()->hasLayoutType(LayoutType::nspc); + canOptimizeNspc = + axis == channelAxis && + getSelectedPrimitiveDescriptor()->getConfig().outConfs.front().getMemDesc()->hasLayoutType(LayoutType::nspc); } void Concat::execute(dnnl::stream strm) { @@ -497,7 +513,7 @@ void Concat::execute(dnnl::stream strm) { } else { const auto& dst_memory = getChildEdgeAt(0)->getMemory(); const size_t num_src = getParentEdges().size(); - std::unordered_map mem_ags {{DNNL_ARG_DST, dst_memory.getPrimitive()}}; + std::unordered_map mem_ags{{DNNL_ARG_DST, dst_memory.getPrimitive()}}; size_t nonZeroInShapes = 0; for (size_t i = 0; i < num_src; i++) { const auto& srcMem = getParentEdgeAt(i)->getMemory(); @@ -580,7 +596,7 @@ void Concat::execRef() { } if (!hasOuterLoop) { - if (nelemTotal < 64*1024 || parallel_get_max_threads() == 1) { + if (nelemTotal < 64 * 1024 || parallel_get_max_threads() == 1) { for (size_t a = 0; a < srcPtrs.size(); ++a) { const auto inData = srcPtrs[a]; auto outputData = &dstPtr[dstOffset[a]]; @@ -612,63 +628,65 @@ void Concat::execRef() { physDims[i] = outputShape[i]; } const auto L1Size = dnnl::utils::get_cache_size(1, true); - UNUSED(L1Size); // for Windows - parallel_for6d(physDims[0], physDims[1], physDims[2], physDims[3], physDims[4], numSrc, - [&](size_t n0, size_t n1, size_t n2, size_t n3, size_t n4, size_t a) { - // check if zero memory - if (srcPtrs[a] == nullptr) return; - - size_t inOff = inputStrides[a][0] * n0 + inputStrides[a][1] * n1 + inputStrides[a][2] * n2 - + inputStrides[a][3] * n3 + inputStrides[a][4] * n4; - size_t outOff = outputStrides[0] * n0 + outputStrides[1] * n1 + outputStrides[2] * n2 - + outputStrides[3] * n3 + outputStrides[4] * n4; - const uint8_t *i = &srcPtrs[a][inOff]; - uint8_t *o = &dstPtr[dstOffset[a] + outOff]; + UNUSED(L1Size); // for Windows + parallel_for6d(physDims[0], + physDims[1], + physDims[2], + physDims[3], + physDims[4], + numSrc, + [&](size_t n0, 
size_t n1, size_t n2, size_t n3, size_t n4, size_t a) { + // check if zero memory + if (srcPtrs[a] == nullptr) + return; + + size_t inOff = inputStrides[a][0] * n0 + inputStrides[a][1] * n1 + inputStrides[a][2] * n2 + + inputStrides[a][3] * n3 + inputStrides[a][4] * n4; + size_t outOff = outputStrides[0] * n0 + outputStrides[1] * n1 + outputStrides[2] * n2 + + outputStrides[3] * n3 + outputStrides[4] * n4; + const uint8_t* i = &srcPtrs[a][inOff]; + uint8_t* o = &dstPtr[dstOffset[a] + outOff]; #if defined(__GNUC__) - // Heuristic: - // memcpy works generally faster for data sizes not - // exceeding L1 cache. - if (nelemToCopy[a] > L1Size) { - // The code below performs data copying: o[e] = i[e] - // and uses a workaround to make GNU compilers optimize it - uint8_t *ptro = o; - const uint8_t *ptri = i; - // head part: bytes before 4 byte-align's address - const size_t headPart = sizeof(uint32_t) - - reinterpret_cast(ptro) - % sizeof(uint32_t); - - // main part: bytes in 4 byte-align - const size_t mainPart - = (nelemToCopy[a] - headPart) / sizeof(uint32_t); - // tail part: bytes after 4 byte-align - const size_t tailPart - = (nelemToCopy[a]) - headPart - - (mainPart * sizeof(uint32_t)); - // copy head part - for (size_t e = 0; e < headPart; ++e) { - *ptro = *ptri; - ++ptro; - ++ptri; - } - // copy main part - std::memcpy(ptro, ptri, mainPart * sizeof(uint32_t)); - ptro += mainPart * sizeof(uint32_t); - ptri += mainPart * sizeof(uint32_t); - // copy tail part - for (size_t e = 0; e < tailPart; ++e) { - *ptro = *ptri; - ++ptro; - ++ptri; - } - } else { - std::memcpy(o, i, nelemToCopy[a]); - } + // Heuristic: + // memcpy works generally faster for data sizes not + // exceeding L1 cache. + if (nelemToCopy[a] > L1Size) { + // The code below performs data copying: o[e] = i[e] + // and uses a workaround to make GNU compilers optimize it + uint8_t* ptro = o; + const uint8_t* ptri = i; + // head part: bytes before 4 byte-align's address + const size_t headPart = + sizeof(uint32_t) - reinterpret_cast(ptro) % sizeof(uint32_t); + + // main part: bytes in 4 byte-align + const size_t mainPart = (nelemToCopy[a] - headPart) / sizeof(uint32_t); + // tail part: bytes after 4 byte-align + const size_t tailPart = (nelemToCopy[a]) - headPart - (mainPart * sizeof(uint32_t)); + // copy head part + for (size_t e = 0; e < headPart; ++e) { + *ptro = *ptri; + ++ptro; + ++ptri; + } + // copy main part + std::memcpy(ptro, ptri, mainPart * sizeof(uint32_t)); + ptro += mainPart * sizeof(uint32_t); + ptri += mainPart * sizeof(uint32_t); + // copy tail part + for (size_t e = 0; e < tailPart; ++e) { + *ptro = *ptri; + ++ptro; + ++ptri; + } + } else { + std::memcpy(o, i, nelemToCopy[a]); + } #else std::memcpy(o, i, nelemToCopy[a]); #endif - }); + }); } } @@ -691,8 +709,10 @@ void Concat::resolveInPlaceEdges(Edge::LOOK look) { " can't use inPlace memory with concatenation on dynamic dimension"); auto edges = getChildEdgesAtPort(inplaceOutIndx); - auto itr = std::find_if(edges.begin(), edges.end(), [](const EdgePtr& edge) { return edge->getStatus() == Edge::Status::Allocated; }); - OPENVINO_ASSERT(itr != edges.end(), " Could not find allocated child edge for concat node: " , getName()); + auto itr = std::find_if(edges.begin(), edges.end(), [](const EdgePtr& edge) { + return edge->getStatus() == Edge::Status::Allocated; + }); + OPENVINO_ASSERT(itr != edges.end(), " Could not find allocated child edge for concat node: ", getName()); auto baseMemBlock = (*itr)->getMemory().getMemoryBlock(); OPENVINO_ASSERT(baseMemBlock != nullptr, 
" NULL base memory block in concat node: ", getName()); @@ -726,6 +746,6 @@ void Concat::resolveInPlaceEdges(Edge::LOOK look) { } } -} // namespace node -} // namespace intel_cpu -} // namespace ov +} // namespace node +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/concat.h b/src/plugins/intel_cpu/src/nodes/concat.h index 9ed331bee4f16d..8b75e3839a372d 100644 --- a/src/plugins/intel_cpu/src/nodes/concat.h +++ b/src/plugins/intel_cpu/src/nodes/concat.h @@ -4,8 +4,8 @@ #pragma once -#include "node.h" #include "graph_context.h" +#include "node.h" namespace ov { namespace intel_cpu { @@ -22,7 +22,9 @@ class Concat : public Node { void selectOptimalPrimitiveDescriptor() override; bool created() const override; void execute(dnnl::stream strm) override; - void executeDynamicImpl(dnnl::stream strm) override { execute(strm); } + void executeDynamicImpl(dnnl::stream strm) override { + execute(strm); + } void resolveInPlaceEdges(Edge::LOOK look) override; ov::element::Type getRuntimePrecision() const override; @@ -42,9 +44,9 @@ class Concat : public Node { void execNspcSpecCase(); void exec1DCase(); std::vector inputStrides; - std::vector nelemToCopy; // byte moved in each iter + std::vector nelemToCopy; // byte moved in each iter size_t nelemTotal = 0; - std::vector dstOffset; // dst offset for each input + std::vector dstOffset; // dst offset for each input std::vector srcPtrs; bool hasOuterLoop = false; ov::element::Type inputPrecision = ov::element::f32; @@ -54,6 +56,6 @@ class Concat : public Node { dnnl::primitive prim; }; -} // namespace node -} // namespace intel_cpu -} // namespace ov +} // namespace node +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/conv.cpp b/src/plugins/intel_cpu/src/nodes/conv.cpp index 7cf7698e989343..4cb2dc9058551f 100644 --- a/src/plugins/intel_cpu/src/nodes/conv.cpp +++ b/src/plugins/intel_cpu/src/nodes/conv.cpp @@ -4,8 +4,11 @@ #include "conv.h" -#include "openvino/op/convolution.hpp" -#include "openvino/op/group_conv.hpp" +#include +#include +#include +#include + #include "common/c_types_map.hpp" #include "common/cpu_convert.h" #include "common/primitive_desc.hpp" @@ -27,17 +30,14 @@ #include "oneapi/dnnl/dnnl_common.hpp" #include "oneapi/dnnl/dnnl_types.h" #include "onednn/dnnl.h" +#include "openvino/op/convolution.hpp" +#include "openvino/op/group_conv.hpp" #include "pooling.h" #include "reorder.h" #include "utils/cpu_utils.hpp" #include "utils/debug_capabilities.h" #include "utils/general_utils.h" -#include -#include -#include -#include - using namespace dnnl; namespace ov { @@ -88,7 +88,7 @@ size_t ConvKey::hash() const { return seed; } -bool ConvKey::operator==(const ConvKey &rhs) const { +bool ConvKey::operator==(const ConvKey& rhs) const { bool retVal = true; if (inp0 != rhs.inp0) { retVal = retVal && inp0 && rhs.inp0 && inp0->getDnnlDesc() == rhs.inp0->getDnnlDesc(); @@ -112,11 +112,11 @@ bool ConvKey::operator==(const ConvKey &rhs) const { return retVal; } -} // namespace +} // namespace class Convolution::FusedSubgraph { public: - FusedSubgraph(const std::vector &opList, const Convolution &conv, const GraphContext::CPtr context) { + FusedSubgraph(const std::vector& opList, const Convolution& conv, const GraphContext::CPtr context) { _graph = std::unique_ptr(new Graph()); std::unordered_set nodesSet; @@ -130,16 +130,16 @@ class Convolution::FusedSubgraph { nodesSet.insert(child); }; - //Make inputs - const auto &inpMemDesc1 = conv.getBaseMemDescAtOutputPort(0); + // Make 
inputs + const auto& inpMemDesc1 = conv.getBaseMemDescAtOutputPort(0); auto inp0 = std::make_shared(inpMemDesc1, "inp0", "Parameter", context); inputs.push_back(inp0); const size_t sumPortNum = conv.getParentEdges().size() - 1; - const auto &inpMemDesc2 = conv.getBaseMemDescAtInputPort(sumPortNum); + const auto& inpMemDesc2 = conv.getBaseMemDescAtInputPort(sumPortNum); auto inp1 = std::make_shared(inpMemDesc2, "inp1", "Parameter", context); inputs.push_back(inp1); - auto itr = std::find_if(opList.begin(), opList.end(), [](const NodePtr &node) { + auto itr = std::find_if(opList.begin(), opList.end(), [](const NodePtr& node) { if (auto eltwise = std::dynamic_pointer_cast(node)) { return eltwise->isSpecialConvolutionAddFusing(); } @@ -153,7 +153,7 @@ class Convolution::FusedSubgraph { addEdge(inp0, sumNode, 0, 0); addEdge(inp1, sumNode, 0, 1); - //Replicate the rest of the subgraph + // Replicate the rest of the subgraph auto parentItr = itr; while (++itr != opList.end()) { auto parentNode = *parentItr; @@ -173,8 +173,8 @@ class Convolution::FusedSubgraph { } } - //Make output - const auto &outMemDesc = conv.getBaseMemDescAtOutputPort(0); + // Make output + const auto& outMemDesc = conv.getBaseMemDescAtOutputPort(0); auto out = std::make_shared(outMemDesc, "out", "Result", context); addEdge(*parentItr, out, 0, 0); outputs.push_back(out); @@ -240,9 +240,20 @@ bool Convolution::isSupportedOperation(const std::shared_ptr& op } Convolution::Convolution(const std::shared_ptr& op, const GraphContext::CPtr context) - : Node(op, context, NgraphShapeInferFactory(op)), withBiases(false), withSum(false), withDWConv(false), - isGrouped(false), dw_conv_oc(0), dw_conv_ih(0), dw_conv_iw(0), dw_conv_in_dt(memory::data_type::undef), - groupNum(1lu), IC(1), groupIC(1), groupOC(1), eltwisePrecision(ov::element::f32) { + : Node(op, context, NgraphShapeInferFactory(op)), + withBiases(false), + withSum(false), + withDWConv(false), + isGrouped(false), + dw_conv_oc(0), + dw_conv_ih(0), + dw_conv_iw(0), + dw_conv_in_dt(memory::data_type::undef), + groupNum(1lu), + IC(1), + groupIC(1), + groupOC(1), + eltwisePrecision(ov::element::f32) { std::string errorMessage; if (!isSupportedOperation(op, errorMessage)) { OPENVINO_THROW_NOT_IMPLEMENTED(errorMessage); @@ -296,12 +307,12 @@ Convolution::Convolution(const std::shared_ptr& op, const GraphContext } paddingL = groupConvolutionOp->get_pads_begin(); paddingR = groupConvolutionOp->get_pads_end(); - autoPadding = one_of(groupConvolutionOp->get_auto_pad(), ov::op::PadType::SAME_UPPER, ov::op::PadType::SAME_LOWER); + autoPadding = + one_of(groupConvolutionOp->get_auto_pad(), ov::op::PadType::SAME_UPPER, ov::op::PadType::SAME_LOWER); } // Only apply this heuristic logic on FP32 IR. IC=1 ,OC=1 would disable brgconv on avx2. 
const bool isAvx2FP32 = !dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::avx512_core) && - dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::avx2) && - !context->isGraphQuantized(); + dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::avx2) && !context->isGraphQuantized(); useJitPlanar = ((IC == 1 && groupOC * groupNum == 1) && isAvx2FP32); } @@ -315,7 +326,8 @@ bool Convolution::canBeExecutedInInt8() const { if (!legacyWeightsZeroPoints.empty()) weightsDataType = memory::data_type::s8; - return one_of(inputDataType, memory::data_type::u8, memory::data_type::s8) && weightsDataType == memory::data_type::s8; + return one_of(inputDataType, memory::data_type::u8, memory::data_type::s8) && + weightsDataType == memory::data_type::s8; } ov::element::Type Convolution::fusedEltwisePrecision(const NodePtr& fusingNode) const { @@ -338,60 +350,63 @@ ov::element::Type Convolution::fusedEltwisePrecision(const NodePtr& fusingNode) const std::vector& Convolution::getDefaultImplPriority() { static const std::vector priorities = { - impl_desc_type::unknown, - impl_desc_type::dw_acl, - impl_desc_type::winograd_acl, - impl_desc_type::gemm_acl, - impl_desc_type::acl, - impl_desc_type::brgconv_avx512_amx_1x1, - impl_desc_type::brgconv_avx512_amx, - impl_desc_type::jit_avx512_amx_dw, - impl_desc_type::jit_avx512_amx_1x1, - impl_desc_type::jit_avx512_amx, - impl_desc_type::brgconv_avx512_1x1, - impl_desc_type::brgconv_avx512, - impl_desc_type::jit_avx512_dw, - impl_desc_type::jit_avx512_1x1, - impl_desc_type::jit_avx512, - impl_desc_type::brgconv_avx2_1x1, - impl_desc_type::brgconv_avx2, - impl_desc_type::jit_uni_dw, - impl_desc_type::jit_uni_1x1, - impl_desc_type::jit_uni, - impl_desc_type::jit_avx2_dw, - impl_desc_type::jit_avx2_1x1, - impl_desc_type::jit_avx2, - impl_desc_type::jit_avx_dw, - impl_desc_type::jit_avx_1x1, - impl_desc_type::jit_avx, - impl_desc_type::jit_sse42_dw, - impl_desc_type::jit_sse42_1x1, - impl_desc_type::jit_sse42, - impl_desc_type::gemm_any, - impl_desc_type::gemm_blas, - impl_desc_type::gemm_avx512, - impl_desc_type::gemm_avx2, - impl_desc_type::gemm_avx, - impl_desc_type::gemm_sse42, - impl_desc_type::jit_gemm, - impl_desc_type::ref_any, - impl_desc_type::ref, - }; - if (isBrgConvAvailable()) - return priorities; - - static const std::vector priorities_wo_brgemm = [&] { - std::vectorresult; - std::copy_if(priorities.begin(), priorities.end(), std::back_inserter(result), - [](impl_desc_type type) { return !(type & impl_desc_type::brgconv); }); - return result;}(); - return priorities_wo_brgemm; + impl_desc_type::unknown, + impl_desc_type::dw_acl, + impl_desc_type::winograd_acl, + impl_desc_type::gemm_acl, + impl_desc_type::acl, + impl_desc_type::brgconv_avx512_dw, + impl_desc_type::brgconv_avx512_amx_1x1, + impl_desc_type::brgconv_avx512_amx, + impl_desc_type::jit_avx512_amx_dw, + impl_desc_type::jit_avx512_amx_1x1, + impl_desc_type::jit_avx512_amx, + impl_desc_type::brgconv_avx512_1x1, + impl_desc_type::brgconv_avx512, + impl_desc_type::jit_avx512_dw, + impl_desc_type::jit_avx512_1x1, + impl_desc_type::jit_avx512, + impl_desc_type::brgconv_avx2_dw, + impl_desc_type::brgconv_avx2_1x1, + impl_desc_type::brgconv_avx2, + impl_desc_type::jit_uni_dw, + impl_desc_type::jit_uni_1x1, + impl_desc_type::jit_uni, + impl_desc_type::jit_avx2_dw, + impl_desc_type::jit_avx2_1x1, + impl_desc_type::jit_avx2, + impl_desc_type::jit_avx_dw, + impl_desc_type::jit_avx_1x1, + impl_desc_type::jit_avx, + impl_desc_type::jit_sse42_dw, + impl_desc_type::jit_sse42_1x1, + impl_desc_type::jit_sse42, + 
impl_desc_type::gemm_any, + impl_desc_type::gemm_blas, + impl_desc_type::gemm_avx512, + impl_desc_type::gemm_avx2, + impl_desc_type::gemm_avx, + impl_desc_type::gemm_sse42, + impl_desc_type::jit_gemm, + impl_desc_type::ref_any, + impl_desc_type::ref, + }; + if (isBrgConvAvailable()) + return priorities; + + static const std::vector priorities_wo_brgemm = [&] { + std::vector result; + std::copy_if(priorities.begin(), priorities.end(), std::back_inserter(result), [](impl_desc_type type) { + return !(type & impl_desc_type::brgconv); + }); + return result; + }(); + return priorities_wo_brgemm; } const bool Convolution::isBrgConvAvailable() { - //When avx2 brgconv heuristic case, disable brgconv to WA the regression. - const bool isBrgConvAvailable = dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::avx2) && - !useJitPlanar; + // When avx2 brgconv heuristic case, disable brgconv to WA the regression. + const bool isBrgConvAvailable = dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::avx2) && !useJitPlanar; return isBrgConvAvailable; } @@ -411,7 +426,7 @@ void Convolution::getSupportedDescriptors() { } if (fusedWith[i]->getAlgorithm() == Algorithm::EltwiseAdd) { - auto* eltwiseNode = dynamic_cast(fusedWith[i].get()); + auto* eltwiseNode = dynamic_cast(fusedWith[i].get()); if (eltwiseNode && eltwiseNode->isSpecialConvolutionAddFusing()) { expectedInputEdgesNum++; } @@ -425,17 +440,19 @@ void Convolution::getSupportedDescriptors() { outputDataType = DnnlExtensionUtils::ElementTypeToDataType(getOriginalOutputPrecisionAtPort(0)); eltwisePrecision = DnnlExtensionUtils::DataTypeToElementType(outputDataType); if (!fusedWith.empty()) { - outputDataType = DnnlExtensionUtils::ElementTypeToDataType(fusedWith[fusedWith.size() - 1]->getOriginalOutputPrecisionAtPort(0)); + outputDataType = DnnlExtensionUtils::ElementTypeToDataType( + fusedWith[fusedWith.size() - 1]->getOriginalOutputPrecisionAtPort(0)); eltwisePrecision = DnnlExtensionUtils::DataTypeToElementType(outputDataType); } // We need to make sure that convolution output and second input of fused Eltwise operation - // have equal precision sizes since they use the same physical memory. In case precisions are different we upscale to FP32. + // have equal precision sizes since they use the same physical memory. In case precisions are different we upscale + // to FP32. 
if (outputDataType != memory::data_type::f32 && outputDataType != memory::data_type::bf16 && outputDataType != memory::data_type::f16 && withSum) { for (size_t i = 0; i < fusedWith.size(); i++) { if (fusedWith[i]->getAlgorithm() == Algorithm::EltwiseAdd) { - auto* eltwiseNode = dynamic_cast(fusedWith[i].get()); + auto* eltwiseNode = dynamic_cast(fusedWith[i].get()); if (eltwiseNode && eltwiseNode->isSpecialConvolutionAddFusing()) { eltwisePrecision = fusedEltwisePrecision(fusedWith[i]); if (DnnlExtensionUtils::DataTypeToElementType(outputDataType).size() != eltwisePrecision.size()) { @@ -466,7 +483,7 @@ void Convolution::getSupportedDescriptors() { } for (size_t i = 0; i < fusedWith.size(); i++) { - auto *convolutionNode = dynamic_cast(fusedWith[i].get()); + auto* convolutionNode = dynamic_cast(fusedWith[i].get()); if (convolutionNode) { auto& inActivationDims = convolutionNode->inputShapes[0].getStaticDims(); dw_conv_ih = inActivationDims[convolutionNode->inputShapes[0].getRank() - 2]; @@ -475,7 +492,7 @@ void Convolution::getSupportedDescriptors() { auto& outDims = convolutionNode->outputShapes[0].getStaticDims(); dw_conv_oc = outDims[1]; - const auto &dwWeightsDims = convolutionNode->inputShapes[1].getStaticDims(); + const auto& dwWeightsDims = convolutionNode->inputShapes[1].getStaticDims(); dw_conv_kernel.push_back(dwWeightsDims[dwWeightsDims.size() - 1]); dw_conv_kernel.push_back(dwWeightsDims[dwWeightsDims.size() - 2]); dw_conv_strides = convolutionNode->getStride(); @@ -484,7 +501,8 @@ void Convolution::getSupportedDescriptors() { if (i == 0) { dw_conv_in_dt = DnnlExtensionUtils::ElementTypeToDataType(getOriginalOutputPrecisionAtPort(0)); } else { - dw_conv_in_dt = DnnlExtensionUtils::ElementTypeToDataType(fusedWith[i - 1]->getOriginalOutputPrecisionAtPort(0)); + dw_conv_in_dt = DnnlExtensionUtils::ElementTypeToDataType( + fusedWith[i - 1]->getOriginalOutputPrecisionAtPort(0)); } } else { dw_conv_in_dt = memory::data_type::f32; @@ -496,7 +514,7 @@ void Convolution::getSupportedDescriptors() { int src = getInputShapeAtPort(0).getStaticDims()[2 + j]; int dst = getOutputShapeAtPort(0).getStaticDims()[2 + j]; - krn = (krn - 1)*(dilation[j] + 1) + 1; + krn = (krn - 1) * (dilation[j] + 1) + 1; int calc_dst = (src - krn + paddingL[j]) / stride[j] + 1; paddingR[j] = (dst - calc_dst) * stride[j]; } @@ -504,10 +522,14 @@ void Convolution::getSupportedDescriptors() { } MemoryDescPtr in_candidate, out_candidate; - memory::format_tag nspc = ndims == 3 ? memory::format_tag::nwc : (ndims == 4 ? memory::format_tag::nhwc : memory::format_tag::ndhwc); - memory::format_tag ncsp = ndims == 3 ? memory::format_tag::ncw : (ndims == 4 ? memory::format_tag::nchw : memory::format_tag::ncdhw); - memory::format_tag nCsp8c = ndims == 3 ? memory::format_tag::nCw8c : (ndims == 4 ? memory::format_tag::nChw8c : memory::format_tag::nCdhw8c); - memory::format_tag nCsp16c = ndims == 3 ? memory::format_tag::nCw16c : (ndims == 4 ? memory::format_tag::nChw16c : memory::format_tag::nCdhw16c); + memory::format_tag nspc = + ndims == 3 ? memory::format_tag::nwc : (ndims == 4 ? memory::format_tag::nhwc : memory::format_tag::ndhwc); + memory::format_tag ncsp = + ndims == 3 ? memory::format_tag::ncw : (ndims == 4 ? memory::format_tag::nchw : memory::format_tag::ncdhw); + memory::format_tag nCsp8c = ndims == 3 ? memory::format_tag::nCw8c + : (ndims == 4 ? memory::format_tag::nChw8c : memory::format_tag::nCdhw8c); + memory::format_tag nCsp16c = ndims == 3 ? memory::format_tag::nCw16c + : (ndims == 4 ? 
memory::format_tag::nChw16c : memory::format_tag::nCdhw16c); if (canBeExecutedInInt8()) { DEBUG_LOG(getName(), "Creating I8 descriptor"); @@ -522,7 +544,7 @@ void Convolution::getSupportedDescriptors() { in_candidate = std::make_shared(getInputShapeAtPort(0), inputDataType, nspc); out_candidate = std::make_shared(getOutputShapeAtPort(0), outputDataType, nspc); - createDescriptor({ in_candidate }, { out_candidate }); + createDescriptor({in_candidate}, {out_candidate}); return; } @@ -547,7 +569,7 @@ void Convolution::getSupportedDescriptors() { eltwisePrecision = ov::element::f32; for (size_t i = 0; i < fusedWith.size(); i++) { if (fusedWith[i]->getAlgorithm() == Algorithm::EltwiseAdd) { - auto* eltwiseNode = dynamic_cast(fusedWith[i].get()); + auto* eltwiseNode = dynamic_cast(fusedWith[i].get()); if (eltwiseNode && eltwiseNode->isSpecialConvolutionAddFusing()) { eltwisePrecision = fusedEltwisePrecision(fusedWith[i]); // TODO(amalyshe): there might be situation when convolution can be executed in BF16, @@ -579,42 +601,44 @@ void Convolution::getSupportedDescriptors() { #if defined(OPENVINO_ARCH_X86_64) // nspc shows better performance only with brgconv implementation - bool nspcFirst = isBrgConvAvailable() && one_of(inputDataType, memory::data_type::f16, memory::data_type::bf16, memory::data_type::f32); + bool nspcFirst = isBrgConvAvailable() && + one_of(inputDataType, memory::data_type::f16, memory::data_type::bf16, memory::data_type::f32); bool nspcAdded = false; if (nspcFirst) { in_candidate = std::make_shared(inputShape, inputDataType, nspc); out_candidate = std::make_shared(outputShape, outputDataType, nspc); - createDescriptor({ in_candidate }, { out_candidate }); + createDescriptor({in_candidate}, {out_candidate}); nspcAdded = true; } if (IC == 1 && groupOC == 1) { in_candidate = std::make_shared(inputShape, inputDataType, ncsp); out_candidate = std::make_shared(outputShape, outputDataType, ncsp); - createDescriptor({ in_candidate }, { out_candidate }); + createDescriptor({in_candidate}, {out_candidate}); } else if (IC < 4) { in_candidate = std::make_shared(inputShape, inputDataType, ncsp); out_candidate = std::make_shared(outputShape, outputDataType, nCsp16c); - createDescriptor({ in_candidate }, { out_candidate }); + createDescriptor({in_candidate}, {out_candidate}); out_candidate = std::make_shared(outputShape, outputDataType, nCsp8c); - createDescriptor({ in_candidate }, { out_candidate }); + createDescriptor({in_candidate}, {out_candidate}); } else { in_candidate = std::make_shared(inputShape, inputDataType, nCsp16c); out_candidate = std::make_shared(outputShape, outputDataType, nCsp16c); - createDescriptor({ in_candidate }, { out_candidate }); + createDescriptor({in_candidate}, {out_candidate}); in_candidate = std::make_shared(inputShape, inputDataType, nCsp8c); out_candidate = std::make_shared(outputShape, outputDataType, nCsp8c); - createDescriptor({ in_candidate }, { out_candidate }); + createDescriptor({in_candidate}, {out_candidate}); } in_candidate = std::make_shared(inputShape, inputDataType, ncsp); out_candidate = std::make_shared(outputShape, outputDataType, ncsp); - createDescriptor({ in_candidate }, { out_candidate }); + createDescriptor({in_candidate}, {out_candidate}); - if (!nspcAdded && (inputDataType != memory::data_type::bf16 && inputDataType != memory::data_type::f16 && isNspcAvailable())) { + if (!nspcAdded && + (inputDataType != memory::data_type::bf16 && inputDataType != memory::data_type::f16 && isNspcAvailable())) { in_candidate = 
std::make_shared(inputShape, inputDataType, nspc); out_candidate = std::make_shared(outputShape, outputDataType, nspc); - createDescriptor({ in_candidate }, { out_candidate }); + createDescriptor({in_candidate}, {out_candidate}); } #else (void)ncsp; @@ -623,7 +647,7 @@ void Convolution::getSupportedDescriptors() { in_candidate = std::make_shared(inputShape, inputDataType, nspc); out_candidate = std::make_shared(outputShape, outputDataType, nspc); - createDescriptor({ in_candidate }, { out_candidate }); + createDescriptor({in_candidate}, {out_candidate}); #endif } @@ -634,9 +658,11 @@ void Convolution::setPostOps(dnnl::primitive_attr& attr, dnnl::post_ops ops; auto& args = convPostOpsArgs[useLegacyPostOps]; bool isINT8 = canBeExecutedInInt8(); - // Weight dims in NON-Group CONV: [OC, IC, KH, KW], perchannel weight scale applied on OC DIM, weiScaleMaskPerChannel = 1 << 0 - // Weight dims in Group CONV:[Group, OC, IC, KH, KW], perchannel weight scale applied on GROUP and OC DIM, weiScaleMaskPerChannel = ( 1 << 0 | 1<< 1) = 0x03 - DnnlPostOpsComposerLegacy dnnlpoc(getEngine(), attr, ops, args, dims, 1, isINT8, isGrouped ? 3 : 1 << 0, getDQScales(), withBiases); + // Weight dims in NON-Group CONV: [OC, IC, KH, KW], perchannel weight scale applied on OC DIM, + // weiScaleMaskPerChannel = 1 << 0 Weight dims in Group CONV:[Group, OC, IC, KH, KW], perchannel weight scale + // applied on GROUP and OC DIM, weiScaleMaskPerChannel = ( 1 << 0 | 1<< 1) = 0x03 + DnnlPostOpsComposerLegacy + dnnlpoc(getEngine(), attr, ops, args, dims, 1, isINT8, isGrouped ? 3 : 1 << 0, getDQScales(), withBiases); DEBUG_LOG(getName(), " useLegacyPostOps=", useLegacyPostOps, " initWeights=", initWeights); @@ -679,14 +705,14 @@ void Convolution::setPostOps(dnnl::primitive_attr& attr, bool hasSubsequentSum = false; bool hasSubsequentFQ = false; for (size_t j = i + 1; j < fusedWith.size(); j++) { - auto &nextNode = fusedWith[j]; + auto& nextNode = fusedWith[j]; - auto *nextEltwiseNode = dynamic_cast(nextNode.get()); + auto* nextEltwiseNode = dynamic_cast(nextNode.get()); if (nextEltwiseNode && nextEltwiseNode->isSpecialConvolutionAddFusing()) { hasSubsequentSum = true; } - auto *nextQuantizeNode = dynamic_cast(nextNode.get()); + auto* nextQuantizeNode = dynamic_cast(nextNode.get()); if (nextQuantizeNode) { hasSubsequentFQ = true; } @@ -779,12 +805,16 @@ void Convolution::initSupportedPrimitiveDescriptors() { const std::vector dwWeightsDims{dw_conv_oc, 1, 1, dw_conv_kernel[Y_AXIS], dw_conv_kernel[X_AXIS]}; const std::vector dwBiasesDims{dw_conv_oc}; - const auto dwWeightsPrc = DnnlExtensionUtils::ElementTypeToDataType(dw_conv_in_dt == dnnl_u8 ? ov::element::i8 : ov::element::f32); - const auto dwWeightsDesc = std::make_shared(Shape(dwWeightsDims), dwWeightsPrc, memory::format_tag::Goihw8g); + const auto dwWeightsPrc = DnnlExtensionUtils::ElementTypeToDataType( + dw_conv_in_dt == dnnl_u8 ? 
ov::element::i8 : ov::element::f32); + const auto dwWeightsDesc = std::make_shared(Shape(dwWeightsDims), + dwWeightsPrc, + memory::format_tag::Goihw8g); inConfs.emplace_back(dwWeightsDesc); const auto dwBiasPrc = memory::data_type::f32; - const auto dwBiasDesc = std::make_shared(Shape(dwBiasesDims), dwBiasPrc, memory::format_tag::x); + const auto dwBiasDesc = + std::make_shared(Shape(dwBiasesDims), dwBiasPrc, memory::format_tag::x); inConfs.emplace_back(dwBiasDesc); } @@ -807,15 +837,25 @@ void Convolution::initSupportedPrimitiveDescriptors() { }; #ifdef CPU_DEBUG_CAPS { - if (!customImplPriorities.empty()) { - DEBUG_LOG("#", getName(), " customImplPriorities [", 0 , "/", customImplPriorities.size(), - "]: ", impl_type_to_string(customImplPriorities[0])); - } + if (!customImplPriorities.empty()) { + DEBUG_LOG("#", + getName(), + " customImplPriorities [", + 0, + "/", + customImplPriorities.size(), + "]: ", + impl_type_to_string(customImplPriorities[0])); + } } #endif for (size_t dIdx = 0; dIdx < descs.size(); dIdx++) { auto& desc = descs[dIdx]; - auto first_desc = dnnl::primitive_desc(DnnlExtensionUtils::clone_primitive_desc(desc.get())); + auto primitive_desc = desc.get(true); // true mean allow empty + if (primitive_desc == nullptr) { + continue; + } + auto first_desc = dnnl::primitive_desc(DnnlExtensionUtils::clone_primitive_desc(primitive_desc)); auto add_supported_desc = [&](dnnl::primitive_desc& desc) { addSupportedPrimitiveDescriptor(desc); @@ -823,16 +863,25 @@ void Convolution::initSupportedPrimitiveDescriptors() { }; const bool first_match = customImplPriorities.empty(); - DEBUG_LOG("#", getName(), - ", itpd.impl_info_str(): ", desc.impl_info_str(), - ", parsed imp_type: ", impl_type_to_string(parse_impl_name(desc.impl_info_str())), - ", first_match: ", first_match ? "true" : "false"); - DnnlExtensionUtils::for_each_implementation(desc, - first_match, - [&](impl_desc_type implType) { - return contains(getImplPriority(), implType); - }, - add_supported_desc); + DEBUG_LOG("#", + getName(), + ",descIndex:", + dIdx + 1, + "/", + descs.size(), + ", itpd.impl_info_str(): ", + desc.impl_info_str(), + ", parsed imp_type: ", + impl_type_to_string(parse_impl_name(desc.impl_info_str())), + ", first_match: ", + first_match ? "true" : "false"); + DnnlExtensionUtils::for_each_implementation( + desc, + first_match, + [&](impl_desc_type implType) { + return contains(getImplPriority(), implType); + }, + add_supported_desc); // fallback. 
if none of the primitive types is present in the priority list just add first implementation // @todo this fallback is not necessary if primitive priority list is filled correctly @@ -846,46 +895,48 @@ bool Convolution::created() const { } namespace { -dnnl::convolution_forward::primitive_desc -createDescriptorInternal(const dnnl::engine& engine, - const dnnl::memory::desc& inputDesc, - const dnnl::memory::desc& weightDesc, - const dnnl::memory::desc& biasDesc, - const dnnl::memory::desc& outputDesc, - bool withBiases, - const std::vector& stride, - const std::vector& dilation, - const std::vector& paddingL, - const std::vector& paddingR, - dnnl::algorithm alg, - const dnnl::primitive_attr& attr) { +dnnl::convolution_forward::primitive_desc createDescriptorInternal(const dnnl::engine& engine, + const dnnl::memory::desc& inputDesc, + const dnnl::memory::desc& weightDesc, + const dnnl::memory::desc& biasDesc, + const dnnl::memory::desc& outputDesc, + bool withBiases, + const std::vector& stride, + const std::vector& dilation, + const std::vector& paddingL, + const std::vector& paddingR, + dnnl::algorithm alg, + const dnnl::primitive_attr& attr) { if (withBiases) { - return dnnl::convolution_forward::primitive_desc( - engine, - prop_kind::forward_inference, - alg, - inputDesc, weightDesc, biasDesc, outputDesc, - dnnl::memory::dims(stride.begin(), stride.end()), - dnnl::memory::dims(dilation.begin(), dilation.end()), - dnnl::memory::dims(paddingL.begin(), paddingL.end()), - dnnl::memory::dims(paddingR.begin(), paddingR.end()), - attr, - true); // allow_empty + return dnnl::convolution_forward::primitive_desc(engine, + prop_kind::forward_inference, + alg, + inputDesc, + weightDesc, + biasDesc, + outputDesc, + dnnl::memory::dims(stride.begin(), stride.end()), + dnnl::memory::dims(dilation.begin(), dilation.end()), + dnnl::memory::dims(paddingL.begin(), paddingL.end()), + dnnl::memory::dims(paddingR.begin(), paddingR.end()), + attr, + true); // allow_empty } else { - return dnnl::convolution_forward::primitive_desc( - engine, - prop_kind::forward_inference, - alg, - inputDesc, weightDesc, outputDesc, - dnnl::memory::dims(stride.begin(), stride.end()), - dnnl::memory::dims(dilation.begin(), dilation.end()), - dnnl::memory::dims(paddingL.begin(), paddingL.end()), - dnnl::memory::dims(paddingR.begin(), paddingR.end()), - attr, - true); // allow_empty + return dnnl::convolution_forward::primitive_desc(engine, + prop_kind::forward_inference, + alg, + inputDesc, + weightDesc, + outputDesc, + dnnl::memory::dims(stride.begin(), stride.end()), + dnnl::memory::dims(dilation.begin(), dilation.end()), + dnnl::memory::dims(paddingL.begin(), paddingL.end()), + dnnl::memory::dims(paddingR.begin(), paddingR.end()), + attr, + true); // allow_empty } } -} // namespace +} // namespace static memory::data_type deriveWeightDataType(memory::data_type src_dt) { memory::data_type wdt = src_dt; @@ -910,7 +961,7 @@ void Convolution::createDescriptor(const std::vector& inputDesc, if (outputDesc[0]->isDefined()) { definedOutMemDesc = MemoryDescUtils::convertToDnnlMemoryDesc(outputDesc[0]); } else { - std::vector shapes = { definedInpMemDesc->getShape(), Shape(weightDims) }; + std::vector shapes = {definedInpMemDesc->getShape(), Shape(weightDims)}; auto outDims = shapeInferGeneric(shapes); definedOutMemDesc = MemoryDescUtils::convertToDnnlMemoryDesc(outputDesc[0]->cloneWithNewDims(outDims.front())); } @@ -924,13 +975,14 @@ void Convolution::createDescriptor(const std::vector& inputDesc, dnnl::memory::desc biasDnnlDesc; if 
(withBiases) { - //oneDNN ARM Convolution primitive supports only identical in/out data types + // oneDNN ARM Convolution primitive supports only identical in/out data types #if defined(OPENVINO_ARCH_ARM) || defined(OPENVINO_ARCH_ARM64) memory::data_type bdt = outDnnlDesc.get_data_type(); #else memory::data_type bdt = memory::data_type::f32; #endif - biasDnnlDesc = dnnl::memory::desc(DnnlExtensionUtils::convertToDnnlDims(expectedBiasDims), bdt, memory::format_tag::any); + biasDnnlDesc = + dnnl::memory::desc(DnnlExtensionUtils::convertToDnnlDims(expectedBiasDims), bdt, memory::format_tag::any); } std::vector algorithms; @@ -942,10 +994,18 @@ void Convolution::createDescriptor(const std::vector& inputDesc, for (const auto alg : algorithms) { for (const auto& attr : attrs) { const auto desc = createDescriptorInternal(getEngine(), - inDnnlDesc, weightDnnlDesc, biasDnnlDesc, outDnnlDesc, withBiases, - stride, dilation, paddingL, paddingR, alg, attr); - if (desc) - descs.emplace_back(desc); + inDnnlDesc, + weightDnnlDesc, + biasDnnlDesc, + outDnnlDesc, + withBiases, + stride, + dilation, + paddingL, + paddingR, + alg, + attr); + descs.emplace_back(desc); } } } @@ -978,7 +1038,8 @@ void Convolution::addLegacyZeroPoints(dnnl::primitive_attr& attr) { if (!legacyWeightsZeroPointsMemPtr) { DnnlBlockedMemoryDesc memoryDesc(ov::element::f32, {legacyWeightsZeroPoints.size()}); - legacyWeightsZeroPointsMemPtr = std::make_shared(getEngine(), memoryDesc, legacyWeightsZeroPoints.data()); + legacyWeightsZeroPointsMemPtr = + std::make_shared(getEngine(), memoryDesc, legacyWeightsZeroPoints.data()); } } @@ -988,7 +1049,8 @@ void Convolution::addLegacyZeroPoints(dnnl::primitive_attr& attr) { if (!legacyOutputCompensationMemPtr) { DnnlBlockedMemoryDesc memoryDesc(ov::element::i32, {legacyOutputCompensation.size()}); - legacyOutputCompensationMemPtr = std::make_shared(getEngine(), memoryDesc, legacyOutputCompensation.data()); + legacyOutputCompensationMemPtr = + std::make_shared(getEngine(), memoryDesc, legacyOutputCompensation.data()); } } } @@ -999,7 +1061,7 @@ static bool attrContainsPostOp(const dnnl::primitive_attr& attr, const dnnl::imp } // See the src/plugins/intel_cpu/src/docs/convPostOps.md for details -void Convolution::SetPostOpsAndZeroPoints(std::vector &attrs) { +void Convolution::SetPostOpsAndZeroPoints(std::vector& attrs) { attrs.resize(1); auto outputShape = outputStaticShape(); // attr[0] - Legacy post ops + Legacy zero points. @@ -1007,14 +1069,13 @@ void Convolution::SetPostOpsAndZeroPoints(std::vector &att setPostOps(attrs[0], outputShape, true); addLegacyZeroPoints(attrs[0]); - //dw-conv would be fused into conv only on AVX2 platform. no need attr[1]. Avoid extra useless attribute. + // dw-conv would be fused into conv only on AVX2 platform. no need attr[1]. Avoid extra useless attribute. if (attrContainsPostOp(attrs[0], dnnl::impl::primitive_kind::convolution)) { return; } // no matter if brgconv is available, 1 attribute is enough. Avoid duplicated attribute - if (inputZeroPointType == zpType::None && - !attrContainsPostOp(attrs[0], dnnl::impl::primitive_kind::depthwise) && + if (inputZeroPointType == zpType::None && !attrContainsPostOp(attrs[0], dnnl::impl::primitive_kind::depthwise) && !attrContainsPostOp(attrs[0], dnnl::impl::primitive_kind::quantization)) { return; } @@ -1029,10 +1090,11 @@ void Convolution::SetPostOpsAndZeroPoints(std::vector &att } // Try 2 attributes. 
attrs.resize(2); - if (inputZeroPointType == zpType::PerTensor && dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::avx512_core_amx)) { - //WR to ONEDNN limitation. attr[1] - legacy post ops + stock zero point. + if (inputZeroPointType == zpType::PerTensor && + dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::avx512_core_amx)) { + // WR to ONEDNN limitation. attr[1] - legacy post ops + stock zero point. //@todo:Unify to use binary postops+stock zero point when limitation is fixed. - //For now, have to adapt to JIT_AMX kernel for performance. + // For now, have to adapt to JIT_AMX kernel for performance. DEBUG_LOG(getName(), ": set post ops, attr 1, useLegacyPostOps=true"); setPostOps(attrs[1], outputShape, true); } else { @@ -1043,7 +1105,7 @@ void Convolution::SetPostOpsAndZeroPoints(std::vector &att } void Convolution::initDescriptor(const NodeConfig& config) { - auto *selectedPD = getSelectedPrimitiveDescriptor(); + auto* selectedPD = getSelectedPrimitiveDescriptor(); if (!selectedPD) { return; @@ -1052,24 +1114,29 @@ void Convolution::initDescriptor(const NodeConfig& config) { // attr[0] for legacy post ops; // attr[1] is mostly for binaryPostops except when having per-tensor zp on AMX. const int descId = descIdx[selectedPrimitiveDescriptorIndex]; - int attrId = attrs.size() == 1 ? 0 : - descId % 2 == 0 ? 0 : 1; + int attrId = attrs.size() == 1 ? 0 : descId % 2 == 0 ? 0 : 1; preferLegacyPostOps = (attrId == 0 || (attrId == 1 && (inputZeroPointType == zpType::PerTensor) && - dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::avx512_core_amx))); - //attr[0] for legacy zero point. - //attr[1] for stock per-tensor zero point. + dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::avx512_core_amx))); + // attr[0] for legacy zero point. + // attr[1] for stock per-tensor zero point. 
preferLegacyZeroPoint = (attrId == 0); DEBUG_LOG(getName(), - " selectedPrimitiveDescriptorIndex: ", selectedPrimitiveDescriptorIndex, - " DescIdx: ", descId, - " Selected impl type: ", selectedPD->getImplementationType(), - " Desc impl type: ", parse_impl_name(descs[descId].impl_info_str()), - " preferLegacyPostOps: ", preferLegacyPostOps, - " preferLegacyZeroPoint: ", preferLegacyZeroPoint); - - auto updateNodeConfig = [&](const NodeConfig& cfg){ + " selectedPrimitiveDescriptorIndex: ", + selectedPrimitiveDescriptorIndex, + " DescIdx: ", + descId, + " Selected impl type: ", + selectedPD->getImplementationType(), + " Desc impl type: ", + parse_impl_name(descs[descId].impl_info_str()), + " preferLegacyPostOps: ", + preferLegacyPostOps, + " preferLegacyZeroPoint: ", + preferLegacyZeroPoint); + + auto updateNodeConfig = [&](const NodeConfig& cfg) { auto updatedConfig = cfg; for (size_t i = 0; i < descInputNumbers(); i++) { @@ -1092,7 +1159,7 @@ void Convolution::initDescriptor(const NodeConfig& config) { return updatedConfig; }; - if (!canBeExecutedInInt8()) { // strided blobs are suppoted only for FP32 convolutions + if (!canBeExecutedInInt8()) { // strided blobs are suppoted only for FP32 convolutions descs.clear(); createDescriptor({config.inConfs[0].getMemDesc()}, {config.outConfs[0].getMemDesc()}); @@ -1110,7 +1177,7 @@ void Convolution::initDescriptor(const NodeConfig& config) { selectedPD->setConfig(updatedConfig); } -std::shared_ptr Convolution::getSrcMemDesc(const dnnl::primitive_desc &prim_desc, size_t idx) const { +std::shared_ptr Convolution::getSrcMemDesc(const dnnl::primitive_desc& prim_desc, size_t idx) const { if (idx == 1) { // report original plain layout for weight since it needs to be reordered dynamically at runtime return std::make_shared(getOriginalInputPrecisionAtPort(idx), @@ -1146,7 +1213,8 @@ ov::element::Type Convolution::getRuntimePrecision() const { for (size_t i = 0; i < std::min(getParentEdges().size(), inputsNumLimit); i++) { auto parentEdge = getParentEdgeAt(i); if (parentEdge && parentEdge->getStatus() == Edge::Status::Validated) { - inputPrecisions.emplace_back(DnnlExtensionUtils::DataTypeToElementType((parentEdge->getMemoryPtr()->getDataType()))); + inputPrecisions.emplace_back( + DnnlExtensionUtils::DataTypeToElementType((parentEdge->getMemoryPtr()->getDataType()))); } } @@ -1178,8 +1246,9 @@ bool Convolution::isNspcAvailable() const { return false; } } else { - // it was empirically observed that the nspc convolutions perform much slower than the blocked ones if the channels number more than the specific value - size_t spatialRank = ndims - 2; //two means batch dim plus channels dim + // it was empirically observed that the nspc convolutions perform much slower than the blocked ones if the + // channels number more than the specific value + size_t spatialRank = ndims - 2; // two means batch dim plus channels dim bool is1x1 = false; @@ -1190,24 +1259,24 @@ bool Convolution::isNspcAvailable() const { auto paddingRreversItr = paddingR.crbegin(); for (size_t i = 0; i < spatialRank; ++i) { - is1x1 = true - && *(weightDimsReversItr++) == 1 - && *(strideReversItr++) == 1 - && *(paddingLreversItr++) == 0 - && *(paddingRreversItr++) == 0; + is1x1 = true && *(weightDimsReversItr++) == 1 && *(strideReversItr++) == 1 && + *(paddingLreversItr++) == 0 && *(paddingRreversItr++) == 0; } } - // if the activation field size is 1x1 the avx512 1x1 nspc convolution pollutes caches so that the layer after the convolution performs slow + // if the activation field size is 1x1 
the avx512 1x1 nspc convolution pollutes caches so that the layer after + // the convolution performs slow if (mayiuse(impl::cpu::x64::avx512_core) && is1x1) { auto end = inpDims.rbegin(); std::advance(end, spatialRank); - if (std::all_of(inpDims.rbegin(), end, [](size_t x) { return dimsEqualStrong(1, x); })) { + if (std::all_of(inpDims.rbegin(), end, [](size_t x) { + return dimsEqualStrong(1, x); + })) { return false; } } - unsigned thresholdNumChannels = 128u; // for avx and below + unsigned thresholdNumChannels = 128u; // for avx and below if (is1x1) { thresholdNumChannels = 2048u; } else if (mayiuse(impl::cpu::x64::avx512_core)) { @@ -1219,7 +1288,8 @@ bool Convolution::isNspcAvailable() const { return false; } if (!mayiuse(impl::cpu::x64::avx)) { - // SSE41 nspc convolutions do not support ic and oc tails yet and the blocked implementation will be much better than gemm + // SSE41 nspc convolutions do not support ic and oc tails yet and the blocked implementation will be much + // better than gemm if ((IC % 8) || (OC % 8)) { return false; } @@ -1246,7 +1316,7 @@ void Convolution::prepareParams() { OPENVINO_THROW("Input memory is undefined."); } - const NodeDesc *selected_pd = getSelectedPrimitiveDescriptor(); + const NodeDesc* selected_pd = getSelectedPrimitiveDescriptor(); if (selected_pd == nullptr) OPENVINO_THROW("Preferable primitive descriptor is not set for node ", getName(), "."); @@ -1319,44 +1389,41 @@ void Convolution::prepareParams() { dnnlBiasDesc = biasDescPtr->getDnnlDesc(); } - return createDescriptorInternal( - engine, - srcDesc, - wghDesc, - dnnlBiasDesc, - dstDesc, - (biasDescPtr != nullptr), - stride, - dilation, - paddingL, - paddingR, - alg, - attr); + return createDescriptorInternal(engine, + srcDesc, + wghDesc, + dnnlBiasDesc, + dstDesc, + (biasDescPtr != nullptr), + stride, + dilation, + paddingL, + paddingR, + alg, + attr); }; - dnnl::primitive_desc prim_desc = createDnnlConvDesc( - engine, - key.inp0->getDnnlDesc(), - wghDescAny, - key.out->getDnnlDesc(), - key.bias, - key.stride, - key.dilation, - key.paddingL, - key.paddingR, - convAlg, - key.attr); + dnnl::primitive_desc prim_desc = createDnnlConvDesc(engine, + key.inp0->getDnnlDesc(), + wghDescAny, + key.out->getDnnlDesc(), + key.bias, + key.stride, + key.dilation, + key.paddingL, + key.paddingR, + convAlg, + key.attr); const bool found = DnnlExtensionUtils::find_implementation(prim_desc, key.implType); if (found) { - return std::make_shared( - prim_desc, - key.inp0->getDnnlDesc(), - key.inp1->getDnnlDesc(), - key.out->getDnnlDesc(), - engine, - key.constWeight); + return std::make_shared(prim_desc, + key.inp0->getDnnlDesc(), + key.inp1->getDnnlDesc(), + key.out->getDnnlDesc(), + engine, + key.constWeight); } // primitive desc with proper implementation type not found, use the first available @@ -1367,40 +1434,37 @@ void Convolution::prepareParams() { key.out->getDataType(), memory::format_tag::any); - auto reorderConvDesc = createDnnlConvDesc( - engine, - inDesc, - wghDescAny, - outDesc, - key.bias, - key.stride, - key.dilation, - key.paddingL, - key.paddingR, - convAlg, - key.attr); + auto reorderConvDesc = createDnnlConvDesc(engine, + inDesc, + wghDescAny, + outDesc, + key.bias, + key.stride, + key.dilation, + key.paddingL, + key.paddingR, + convAlg, + key.attr); // unable to create a primitive desc if (!reorderConvDesc) return nullptr; if (key.attr.get()->post_ops_.count(dnnl::impl::primitive_kind::sum)) { - return std::make_shared( - reorderConvDesc, - key.inp0->getDnnlDesc(), - 
key.inp1->getDnnlDesc(), - key.out->getDnnlDesc(), - engine, - key.constWeight); + return std::make_shared(reorderConvDesc, + key.inp0->getDnnlDesc(), + key.inp1->getDnnlDesc(), + key.out->getDnnlDesc(), + engine, + key.constWeight); } - return std::make_shared( - reorderConvDesc, - key.inp0->getDnnlDesc(), - key.inp1->getDnnlDesc(), - key.out->getDnnlDesc(), - engine, - key.constWeight); + return std::make_shared(reorderConvDesc, + key.inp0->getDnnlDesc(), + key.inp1->getDnnlDesc(), + key.out->getDnnlDesc(), + engine, + key.constWeight); }; auto prevExecPtr = execPtr; @@ -1455,7 +1519,8 @@ Convolution::ConvolutionExecutor::ConvolutionExecutor(const dnnl::primitive_desc const dnnl::memory::desc& weightMemDesc, const dnnl::memory::desc& outMemDesc, const dnnl::engine& engine, - bool constWeight) : DnnlExecutor(pd) { + bool constWeight) + : DnnlExecutor(pd) { if (inMemDesc != getDnnlSrcDesc()) { inputReorders.insert({DNNL_ARG_SRC, IntermReorder(inMemDesc, getDnnlSrcDesc(), engine)}); } @@ -1475,7 +1540,8 @@ Convolution::ConvolutionSumExecutor::ConvolutionSumExecutor(const dnnl::primitiv const dnnl::memory::desc& weightMemDesc, const dnnl::memory::desc& outMemDesc, const dnnl::engine& engine, - bool constWeight) : DnnlExecutor(pd) { + bool constWeight) + : DnnlExecutor(pd) { if (inMemDesc != getDnnlSrcDesc()) { inputReorders.insert({DNNL_ARG_SRC, IntermReorder(inMemDesc, getDnnlSrcDesc(), engine)}); } @@ -1493,9 +1559,10 @@ Convolution::ConvolutionSumExecutor::ConvolutionSumExecutor(const dnnl::primitiv } } -void Convolution::ConvolutionSumExecutor::reorder_exec(std::unordered_map primArgs, dnnl::stream strm) { +void Convolution::ConvolutionSumExecutor::reorder_exec(std::unordered_map primArgs, + dnnl::stream strm) { auto outputMem = primArgs.at(DNNL_ARG_DST); - for (auto &inReorder : inputReorders) { + for (auto& inReorder : inputReorders) { if (primArgs.count(inReorder.first)) { dnnl::memory memDst(inReorder.second.getDstDesc(), strm.get_engine()); inReorder.second.exec(primArgs[inReorder.first], memDst, strm); @@ -1544,14 +1611,14 @@ void Convolution::executeDynamicImpl(dnnl::stream strm) { } void Convolution::updatePadding() { - //update padding. + // update padding. 
if (isDynamicNode() && autoPadding) { paddingL = shapeInference->get_pads_begin(); paddingR = shapeInference->get_pads_end(); } } -void Convolution::redefineOutputMemory(const std::vector &newOutputShapes) { +void Convolution::redefineOutputMemory(const std::vector& newOutputShapes) { if (withSum) { const size_t sumPortNum = getParentEdges().size() - 1; const auto& sumInpMem = getParentEdgeAt(sumPortNum)->getMemory(); @@ -1565,7 +1632,8 @@ void Convolution::redefineOutputMemory(const std::vector &newOutputS auto inp1 = subgraph->getInput(1); inp1->redefineOutputMemory({sumInpMem.getStaticDims()}); - // here we postpone output memory reallocation due to the fact that it is the same memory with the sum second input + // here we postpone output memory reallocation due to the fact that it is the same memory with the sum + // second input return; } else { withSumBroadcast = false; @@ -1574,12 +1642,10 @@ void Convolution::redefineOutputMemory(const std::vector &newOutputS Node::redefineOutputMemory(newOutputShapes); } -MemoryDescPtr Convolution::getSumMemDesc(const primitive_desc &primitive_desc_it) { +MemoryDescPtr Convolution::getSumMemDesc(const primitive_desc& primitive_desc_it) { if (getOutputShapeAtPort(0).isDynamic()) { - // When we set input shape with ranged dims, sum node input shape maybe mismatch with output shape, we just change - // ranged min value to 1 to meet this case. - // For example: - // Output shape = {1, 160, {128, 256}, {128, 256}} + // When we set input shape with ranged dims, sum node input shape maybe mismatch with output shape, we just + // change ranged min value to 1 to meet this case. For example: Output shape = {1, 160, {128, 256}, {128, 256}} // Sum input shape = {1, 160, 1, 1} // Update sum shape to {1, 160, {1, 256}, {1, 256}} auto shape = getOutputShapeAtPort(0); @@ -1617,7 +1683,7 @@ MemoryPtr Convolution::getOutputMemory() const { } } -void Convolution::addFusedNode(const NodePtr &fusingNode) { +void Convolution::addFusedNode(const NodePtr& fusingNode) { if (Type::Eltwise == fusingNode->getType()) { if (fusingNode->getAlgorithm() == Algorithm::EltwiseAdd) { auto eltwiseNode = std::dynamic_pointer_cast(fusingNode); @@ -1650,7 +1716,6 @@ void Convolution::appendLegacyZeroPointsArgs() { } } - void Convolution::appendZeroPointsArgs() { if (stockInputZeroPointsMemPtr != nullptr) { primArgs[DNNL_ARG_ATTR_ZERO_POINTS | DNNL_ARG_SRC] = stockInputZeroPointsMemPtr->getPrimitive(); @@ -1668,10 +1733,9 @@ void Convolution::initializeInputZeroPoints(const uint8_t* inputZpData, const si inputZeroPointType = zpType::PerChannel; } // Only enable per-tensor zero point on avx512-amx and avx512-core-vnni, avx2_vnni_2. - // avx2_vnni is not enabled per-tensor z because of perf regression brgconv with per-tensor zpcompared with jit per-channel zp - // If zero point is pertensor, both legacy zp and stock zp - // would be passed into conv node. The conv node would determine how to create - // post-ops attribute and prioritize to choose final onednn kernel. + // avx2_vnni is not enabled per-tensor z because of perf regression brgconv with per-tensor zpcompared with jit + // per-channel zp If zero point is pertensor, both legacy zp and stock zp would be passed into conv node. The conv + // node would determine how to create post-ops attribute and prioritize to choose final onednn kernel. 
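For readers following the zero-point discussion in the comments above: the per-tensor versus per-channel decision can be made simply by checking whether every value in the input zero-point tensor is identical. The snippet below is a stand-alone, illustrative sketch only (it is not the plugin's initializeInputZeroPoints implementation); the zpType enum mirrors the three states referenced in the patch.

    #include <algorithm>
    #include <cstdint>
    #include <vector>

    // Illustrative only: a zero-point tensor whose values are all identical can be
    // treated as a single per-tensor zero point; otherwise it must be handled
    // per channel.
    enum class zpType { None, PerTensor, PerChannel };

    zpType classifyInputZeroPoints(const std::vector<uint8_t>& zp) {
        if (zp.empty())
            return zpType::None;
        const bool allEqual = std::all_of(zp.begin(), zp.end(), [&](uint8_t v) {
            return v == zp.front();
        });
        return allEqual ? zpType::PerTensor : zpType::PerChannel;
    }

    int main() {
        const std::vector<uint8_t> perTensor{3, 3, 3, 3};
        const std::vector<uint8_t> perChannel{3, 5, 3, 7};
        // Expected: PerTensor for the first vector, PerChannel for the second.
        return (classifyInputZeroPoints(perTensor) == zpType::PerTensor &&
                classifyInputZeroPoints(perChannel) == zpType::PerChannel) ? 0 : 1;
    }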
if (inputZeroPointType == zpType::PerTensor && (impl::cpu::x64::mayiuse(impl::cpu::x64::avx512_core_amx) || impl::cpu::x64::mayiuse(impl::cpu::x64::avx512_core_vnni) || impl::cpu::x64::mayiuse(impl::cpu::x64::avx2_vnni_2))) @@ -1689,15 +1753,14 @@ VectorDims Convolution::makeInputDummyShape(const Shape& inpShape) const { const size_t filterStartIndx = weightDims.size() - spatialRank; VectorDims dummyInputShapeVals(inpShape.getRank(), dummyInputDim); - dummyInputShapeVals[1] = IC; //channels + dummyInputShapeVals[1] = IC; // channels for (size_t i = 0; i < spatialRank; i++) { if (weightDims[filterStartIndx + i] > dummyInputShapeVals[2 + i]) { constexpr Dim dummyOutputDim = 16; - dummyInputShapeVals[2 + i] = (dummyOutputDim - 1) * stride[i] - - (paddingL[i] + paddingR[i]) + - weightDims[filterStartIndx + i] + - (weightDims[filterStartIndx + i]- 1) * (dilation[i]); + dummyInputShapeVals[2 + i] = (dummyOutputDim - 1) * stride[i] - (paddingL[i] + paddingR[i]) + + weightDims[filterStartIndx + i] + + (weightDims[filterStartIndx + i] - 1) * (dilation[i]); } } return MemoryDescUtils::makeDummyShape(inpShape, dummyInputShapeVals).getStaticDims(); @@ -1707,12 +1770,12 @@ VectorDims Convolution::outputStaticShape() const { auto& outputShape = getOutputShapeAtPort(0); if (outputShape.isDynamic()) { auto inpDummyShape = makeInputDummyShape(getInputShapeAtPort(0)); - auto outputDims = shapeInferGeneric({ Shape(inpDummyShape), Shape(weightDims) }); + auto outputDims = shapeInferGeneric({Shape(inpDummyShape), Shape(weightDims)}); return Shape(outputDims.front()).getStaticDims(); } return outputShape.getStaticDims(); } -} // namespace node -} // namespace intel_cpu -} // namespace ov +} // namespace node +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/conv.h b/src/plugins/intel_cpu/src/nodes/conv.h index a7cac9bced1241..8da3193e5760cf 100644 --- a/src/plugins/intel_cpu/src/nodes/conv.h +++ b/src/plugins/intel_cpu/src/nodes/conv.h @@ -29,7 +29,7 @@ class Convolution : public Node { return false; } ov::element::Type getRuntimePrecision() const override; - std::shared_ptr getSrcMemDesc(const dnnl::primitive_desc &prim_desc, size_t idx) const override; + std::shared_ptr getSrcMemDesc(const dnnl::primitive_desc& prim_desc, size_t idx) const override; dnnl::memory getWeights() const; dnnl::memory getBias() const; @@ -39,23 +39,35 @@ class Convolution : public Node { } bool canBeExecutedInInt8() const override; - size_t getGroupNum() const { return groupNum; } - //OV Legacy input zero point mechanism can support per-channel zero point. - //Hold legacy input zero point. + size_t getGroupNum() const { + return groupNum; + } + // OV Legacy input zero point mechanism can support per-channel zero point. + // Hold legacy input zero point. std::vector legacyInputZeroPoints; - //Hold legacy weight zero point. + // Hold legacy weight zero point. std::vector legacyWeightsZeroPoints; - //Hold legacy pre-calculated output compensation + // Hold legacy pre-calculated output compensation std::vector legacyOutputCompensation; - //Hold stock per-tensor input zero point. Pass to onednn to calculate output compensation. + // Hold stock per-tensor input zero point. Pass to onednn to calculate output compensation. 
std::vector inputZeroPoints; void initializeInputZeroPoints(const uint8_t* inputZpData, const size_t inputZpSize); - const VectorDims &getWeightDims() { return weightDims; } - const std::vector &getStride() { return stride; } - const std::vector &getDilation() { return dilation; } - const std::vector &getPaddingL() { return paddingL; } - const std::vector &getPaddingR() { return paddingR; } + const VectorDims& getWeightDims() { + return weightDims; + } + const std::vector& getStride() { + return stride; + } + const std::vector& getDilation() { + return dilation; + } + const std::vector& getPaddingL() { + return paddingL; + } + const std::vector& getPaddingR() { + return paddingR; + } bool canFuse(const NodePtr& node) const override; bool isDepthWise() const { @@ -64,16 +76,12 @@ class Convolution : public Node { protected: ov::element::Type fusedEltwisePrecision(const NodePtr& fusingNode) const; - void redefineOutputMemory(const std::vector &newOutputShapes) override; - void addFusedNode(const NodePtr &fusingNode) override; + void redefineOutputMemory(const std::vector& newOutputShapes) override; + void addFusedNode(const NodePtr& fusingNode) override; const std::vector& getDefaultImplPriority() override; private: - enum class zpType { - None, - PerTensor, - PerChannel - }; + enum class zpType { None, PerTensor, PerChannel }; class FusedSubgraph; using FusedSubgraphPtr = std::shared_ptr; @@ -81,26 +89,26 @@ class Convolution : public Node { executorPtr execPtr = nullptr; class ConvolutionExecutor : public DnnlExecutor { - public: - ConvolutionExecutor(const dnnl::primitive_desc& pd, - const dnnl::memory::desc& inMemDesc, - const dnnl::memory::desc& weightMemDesc, - const dnnl::memory::desc& outMemDesc, - const dnnl::engine& engine, - bool constWeight); + public: + ConvolutionExecutor(const dnnl::primitive_desc& pd, + const dnnl::memory::desc& inMemDesc, + const dnnl::memory::desc& weightMemDesc, + const dnnl::memory::desc& outMemDesc, + const dnnl::engine& engine, + bool constWeight); }; class ConvolutionSumExecutor : public DnnlExecutor { - public: - ConvolutionSumExecutor(const dnnl::primitive_desc& pd, - const dnnl::memory::desc& inMemDesc, - const dnnl::memory::desc& weightMemDesc, - const dnnl::memory::desc& outMemDesc, - const dnnl::engine& engine, - bool constWeight); - - private: - void reorder_exec(std::unordered_map primArgs, dnnl::stream strm) override; + public: + ConvolutionSumExecutor(const dnnl::primitive_desc& pd, + const dnnl::memory::desc& inMemDesc, + const dnnl::memory::desc& weightMemDesc, + const dnnl::memory::desc& outMemDesc, + const dnnl::engine& engine, + bool constWeight); + + private: + void reorder_exec(std::unordered_map primArgs, dnnl::stream strm) override; }; void prepareParams() override; @@ -108,13 +116,16 @@ class Convolution : public Node { void executeDynamicImpl(dnnl::stream strm) override; void addLegacyZeroPoints(dnnl::primitive_attr& attr); void addZeroPoints(dnnl::primitive_attr& attr); - void setPostOps(dnnl::primitive_attr &attr, const VectorDims &dims, bool useLegacyPostOps, bool initWeights = false); - void SetPostOpsAndZeroPoints(std::vector &attrs); + void setPostOps(dnnl::primitive_attr& attr, + const VectorDims& dims, + bool useLegacyPostOps, + bool initWeights = false); + void SetPostOpsAndZeroPoints(std::vector& attrs); void filterSupportedDescriptors(); bool isNspcAvailable() const; void updatePadding(); - MemoryDescPtr getSumMemDesc(const dnnl::primitive_desc &primitive_desc_it); + MemoryDescPtr getSumMemDesc(const 
dnnl::primitive_desc& primitive_desc_it); MemoryPtr getOutputMemory() const; VectorDims makeInputDummyShape(const Shape& inpShape) const; VectorDims outputStaticShape() const; @@ -131,7 +142,7 @@ class Convolution : public Node { zpType inputZeroPointType = zpType::None; // maps each supportedPrimitiveDescriptor to corresponding desc from descs std::vector descIdx; - VectorDims expectedBiasDims {}; + VectorDims expectedBiasDims{}; std::vector stride; std::vector dilation; @@ -179,6 +190,6 @@ class Convolution : public Node { #endif }; -} // namespace node -} // namespace intel_cpu -} // namespace ov +} // namespace node +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/convert.cpp b/src/plugins/intel_cpu/src/nodes/convert.cpp index 1987c9cc83d5f2..d01a56aac1b86d 100644 --- a/src/plugins/intel_cpu/src/nodes/convert.cpp +++ b/src/plugins/intel_cpu/src/nodes/convert.cpp @@ -26,7 +26,8 @@ bool Convert::isSupportedOperation(const std::shared_ptr& op, st auto srcPrc = op->get_input_element_type(0); auto dstPrc = op->get_output_element_type(0); if (!CommonConvertExecutor::isSupported(srcPrc, dstPrc)) { - errorMessage = "cpu_convert can't convert from: " + srcPrc.to_string() + " precision to: " + dstPrc.to_string(); + errorMessage = + "cpu_convert can't convert from: " + srcPrc.to_string() + " precision to: " + dstPrc.to_string(); return false; } } catch (...) { @@ -36,7 +37,7 @@ bool Convert::isSupportedOperation(const std::shared_ptr& op, st } Convert::Convert(const std::shared_ptr& op, const GraphContext::CPtr context) - : Node(op, context, PassThroughShapeInferFactory()) { + : Node(op, context, PassThroughShapeInferFactory()) { std::string errorMessage; if (isSupportedOperation(op, errorMessage)) { errorPrefix = "Convert node with name '" + getName() + "'"; @@ -48,8 +49,11 @@ Convert::Convert(const std::shared_ptr& op, const GraphContext::CPtr c convertParams.origPrc = convert->get_destination_type(); } -Convert::Convert(const Shape &shape, const ov::element::Type &inPrc, const ov::element::Type &outPrc, - const std::string &nodeName, const GraphContext::CPtr context) +Convert::Convert(const Shape& shape, + const ov::element::Type& inPrc, + const ov::element::Type& outPrc, + const std::string& nodeName, + const GraphContext::CPtr context) : Node("Convert", {shape}, {shape}, {inPrc}, {outPrc}, nodeName, context) { convertParams.origPrc = outPrc; @@ -74,7 +78,7 @@ void Convert::getSupportedDescriptors() { OPENVINO_THROW(errorPrefix, " has incorrect number of output edges"); } -bool Convert::isSupportedDesc(const MemoryDesc &desc) { +bool Convert::isSupportedDesc(const MemoryDesc& desc) { bool isSupported = desc.getType() & MemoryDescType::Blocked; if (desc.getType() == MemoryDescType::DnnlBlocked) isSupported &= desc.as()->hasEmptyExtraData(); @@ -101,13 +105,16 @@ void Convert::initSupportedPrimitiveDescriptors() { MemoryDescPtr dstMemoryDesc = config.outConfs[0].getMemDesc(); convertParams.srcPrc = srcMemoryDesc->getPrecision(); convertParams.dstPrc = dstMemoryDesc->getPrecision(); - auto factory = std::make_shared(convertParams, srcMemoryDesc, dstMemoryDesc, - std::make_shared(context, getImplPriority())); + auto factory = + std::make_shared(convertParams, + srcMemoryDesc, + dstMemoryDesc, + std::make_shared(context, getImplPriority())); supportedPrimitiveDescriptors.emplace_back(config, impl_desc_type::unknown, factory); }; - // if input and output pointers are not null and not contain extra data, then the inp/output tensor descriptors were set using 
setDescs method, so - // they should be used as the actual descriptors. + // if input and output pointers are not null and not contain extra data, then the inp/output tensor descriptors were + // set using setDescs method, so they should be used as the actual descriptors. if (canInitExternalDesc) { dataIn.setMemDesc(input); config.inConfs.push_back(dataIn); @@ -142,8 +149,10 @@ void Convert::initSupportedPrimitiveDescriptors() { : BlockedDescCreator::makeFilteredRange(creators, insShape.getRank()); for (auto itr = range.first; itr != range.second; ++itr) { - config.inConfs[0].setMemDesc(std::make_shared(itr->second->createDesc(insPrecision, insShape))); - config.outConfs[0].setMemDesc(std::make_shared(itr->second->createDesc(outPrecision, outputShape))); + config.inConfs[0].setMemDesc( + std::make_shared(itr->second->createDesc(insPrecision, insShape))); + config.outConfs[0].setMemDesc( + std::make_shared(itr->second->createDesc(outPrecision, outputShape))); supportedPrimitiveDescriptorsBuilder(config); } @@ -159,10 +168,8 @@ void Convert::prepareParams() { auto selectedPD = getSelectedPrimitiveDescriptor(); MemoryDescPtr srcDesc = getSrcMemoryAtPort(0)->getDescPtr(); MemoryDescPtr dstDesc = getDstMemoryAtPort(0)->getDescPtr(); - execPtr = selectedPD->getExecutorFactoryAs()->makeExecutor(convertParams, - srcDesc, - dstDesc, - {}); + execPtr = + selectedPD->getExecutorFactoryAs()->makeExecutor(convertParams, srcDesc, dstDesc, {}); selectedPD->setImplementationType(execPtr->implType()); } @@ -189,6 +196,6 @@ bool Convert::created() const { return getType() == Type::Convert; } -} // namespace node -} // namespace intel_cpu -} // namespace ov +} // namespace node +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/convert.h b/src/plugins/intel_cpu/src/nodes/convert.h index 2a257bd1d31cd8..3bc911d118fd7a 100644 --- a/src/plugins/intel_cpu/src/nodes/convert.h +++ b/src/plugins/intel_cpu/src/nodes/convert.h @@ -14,8 +14,11 @@ namespace node { class Convert : public Node { public: Convert(const std::shared_ptr& op, const GraphContext::CPtr context); - Convert(const Shape &shape, const ov::element::Type &inPrc, const ov::element::Type &outPrc, - const std::string &nodeName, const GraphContext::CPtr context); + Convert(const Shape& shape, + const ov::element::Type& inPrc, + const ov::element::Type& outPrc, + const std::string& nodeName, + const GraphContext::CPtr context); void getSupportedDescriptors() override; void initSupportedPrimitiveDescriptors() override; @@ -28,22 +31,28 @@ class Convert : public Node { } // This is the interface extension designed to provide inp and output tensor descriptors without the CNNLayer. - // In that case the Convert node is instantiated with default CNNLayer and inp/out tensor descriptors are set via this method. - // This is useful if the Convert node is added to the graph as an auxiliary operation at the Graph + // In that case the Convert node is instantiated with default CNNLayer and inp/out tensor descriptors are set via + // this method. This is useful if the Convert node is added to the graph as an auxiliary operation at the Graph // initialization stage. 
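As background on what a precision-conversion node such as Convert does conceptually: it performs an element-wise cast between precisions, saturating when the destination type is narrower. The sketch below is a minimal, self-contained illustration under that assumption; it is not OpenVINO's cpu_convert, which covers many more type pairs, rounding modes, and vectorized paths.

    #include <algorithm>
    #include <cstdint>
    #include <cstdio>
    #include <vector>

    // Minimal sketch: cast float data to uint8_t with clamping, the kind of
    // element-wise work a precision-conversion node performs.
    std::vector<uint8_t> convert_f32_to_u8(const std::vector<float>& src) {
        std::vector<uint8_t> dst(src.size());
        std::transform(src.begin(), src.end(), dst.begin(), [](float v) {
            const float clamped = std::min(255.0f, std::max(0.0f, v));
            return static_cast<uint8_t>(clamped + 0.5f);  // round to nearest
        });
        return dst;
    }

    int main() {
        const std::vector<float> src{-1.0f, 0.4f, 127.6f, 300.0f};
        const auto dst = convert_f32_to_u8(src);
        for (auto v : dst)
            std::printf("%u ", static_cast<unsigned>(v));  // prints: 0 0 128 255
        std::printf("\n");
        return 0;
    }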
void setDescs(const MemoryDesc& input, const MemoryDesc& output) { this->input = input.clone(); this->output = output.clone(); } - const MemoryDesc& getInput() const { return *input; } - const MemoryDesc& getOutput() const { return *output; } + const MemoryDesc& getInput() const { + return *input; + } + const MemoryDesc& getOutput() const { + return *output; + } - bool needPrepareParams() const override { return inputShapesModified(); } + bool needPrepareParams() const override { + return inputShapesModified(); + } static bool isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept; - static bool isSupportedDesc(const MemoryDesc &desc); + static bool isSupportedDesc(const MemoryDesc& desc); private: MemoryDescPtr input; @@ -55,6 +64,6 @@ class Convert : public Node { std::string errorPrefix; }; -} // namespace node -} // namespace intel_cpu -} // namespace ov +} // namespace node +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/ctc_greedy_decoder.cpp b/src/plugins/intel_cpu/src/nodes/ctc_greedy_decoder.cpp index 0b467fe452e061..2869d782cdb445 100644 --- a/src/plugins/intel_cpu/src/nodes/ctc_greedy_decoder.cpp +++ b/src/plugins/intel_cpu/src/nodes/ctc_greedy_decoder.cpp @@ -2,18 +2,20 @@ // SPDX-License-Identifier: Apache-2.0 // +#include "openvino/op/ctc_greedy_decoder.hpp" + #include #include -#include "openvino/op/ctc_greedy_decoder.hpp" -#include "openvino/core/parallel.hpp" #include "ctc_greedy_decoder.h" +#include "openvino/core/parallel.hpp" namespace ov { namespace intel_cpu { namespace node { -bool CTCGreedyDecoder::isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept { +bool CTCGreedyDecoder::isSupportedOperation(const std::shared_ptr& op, + std::string& errorMessage) noexcept { try { const auto greedyDecOp = ov::as_type_ptr(op); if (!greedyDecOp) { @@ -61,8 +63,7 @@ void CTCGreedyDecoder::initSupportedPrimitiveDescriptors() { if (!one_of(seqLenPrecision, ov::element::f32, ov::element::bf16, ov::element::f16)) OPENVINO_THROW(errorPrefix, "has unsupported 'sequence_length' input precision: ", seqLenPrecision); - addSupportedPrimDesc({{LayoutType::ncsp, ov::element::f32}, - {LayoutType::ncsp, ov::element::f32}}, + addSupportedPrimDesc({{LayoutType::ncsp, ov::element::f32}, {LayoutType::ncsp, ov::element::f32}}, {{LayoutType::ncsp, ov::element::f32}}, impl_desc_type::ref_any); } @@ -141,7 +142,7 @@ void CTCGreedyDecoder::execute(dnnl::stream strm) { } tStart = 0lu; } - }; // thread body + }; // thread body parallel_nt(0, threadBody); @@ -151,8 +152,7 @@ void CTCGreedyDecoder::execute(dnnl::stream strm) { const size_t sequenceLength = sequenceLengths[b]; float* shiftedOut = outputSequences + b * T; for (size_t t = 0; t < sequenceLength; ++t) { - if (*shiftedOut < blankIndex && - !(mergeRepeated && *shiftedOut == prevClassIdx)) { + if (*shiftedOut < blankIndex && !(mergeRepeated && *shiftedOut == prevClassIdx)) { outputSequences[outputIndex++] = *shiftedOut; } prevClassIdx = *shiftedOut; @@ -174,6 +174,6 @@ bool CTCGreedyDecoder::needPrepareParams() const { return false; } -} // namespace node -} // namespace intel_cpu -} // namespace ov +} // namespace node +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/ctc_greedy_decoder.h b/src/plugins/intel_cpu/src/nodes/ctc_greedy_decoder.h index 1f3179edb904d2..a552ff7db3c566 100644 --- a/src/plugins/intel_cpu/src/nodes/ctc_greedy_decoder.h +++ b/src/plugins/intel_cpu/src/nodes/ctc_greedy_decoder.h @@ 
-14,7 +14,7 @@ class CTCGreedyDecoder : public Node { public: CTCGreedyDecoder(const std::shared_ptr& op, const GraphContext::CPtr context); - void getSupportedDescriptors() override {}; + void getSupportedDescriptors() override{}; void initSupportedPrimitiveDescriptors() override; void execute(dnnl::stream strm) override; bool created() const override; @@ -22,6 +22,7 @@ class CTCGreedyDecoder : public Node { bool needPrepareParams() const override; static bool isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept; + private: const size_t DATA_INDEX = 0lu; const size_t SEQUENCE_LENGTH_INDEX = 1lu; @@ -30,6 +31,6 @@ class CTCGreedyDecoder : public Node { std::string errorPrefix; }; -} // namespace node -} // namespace intel_cpu -} // namespace ov +} // namespace node +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/ctc_greedy_decoder_seq_len.cpp b/src/plugins/intel_cpu/src/nodes/ctc_greedy_decoder_seq_len.cpp index 63db3968094c3a..3eb02f2583e551 100644 --- a/src/plugins/intel_cpu/src/nodes/ctc_greedy_decoder_seq_len.cpp +++ b/src/plugins/intel_cpu/src/nodes/ctc_greedy_decoder_seq_len.cpp @@ -2,18 +2,20 @@ // SPDX-License-Identifier: Apache-2.0 // +#include "ctc_greedy_decoder_seq_len.h" + +#include #include #include -#include #include "openvino/core/parallel.hpp" -#include "ctc_greedy_decoder_seq_len.h" namespace ov { namespace intel_cpu { namespace node { -bool CTCGreedyDecoderSeqLen::isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept { +bool CTCGreedyDecoderSeqLen::isSupportedOperation(const std::shared_ptr& op, + std::string& errorMessage) noexcept { try { const auto greedyDecOp = ov::as_type_ptr(op); if (!greedyDecOp) { @@ -67,33 +69,35 @@ void CTCGreedyDecoderSeqLen::initSupportedPrimitiveDescriptors() { inDataConf.emplace_back(LayoutType::ncsp, ov::element::i32); addSupportedPrimDesc(inDataConf, - {{LayoutType::ncsp, ov::element::i32}, - {LayoutType::ncsp, ov::element::i32}}, + {{LayoutType::ncsp, ov::element::i32}, {LayoutType::ncsp, ov::element::i32}}, impl_desc_type::ref_any); } void CTCGreedyDecoderSeqLen::execute(dnnl::stream strm) { const float* probabilities = getSrcDataAtPortAs(DATA_INDEX); const int* sequenceLengths = getSrcDataAtPortAs(SEQUENCE_LENGTH_INDEX); - int* decodedClasses = getDstDataAtPortAs(DECODED_CLASSES_INDEX); + int* decodedClasses = getDstDataAtPortAs(DECODED_CLASSES_INDEX); int* decodedClassesLength = getDstDataAtPortAs(DECODED_CLASSES_LENGTH_INDEX); - const size_t B = getParentEdgeAt(DATA_INDEX)->getMemory().getStaticDims()[0];; - const size_t T = getParentEdgeAt(DATA_INDEX)->getMemory().getStaticDims()[1];; - const int C = getParentEdgeAt(DATA_INDEX)->getMemory().getStaticDims()[2];; + const size_t B = getParentEdgeAt(DATA_INDEX)->getMemory().getStaticDims()[0]; + ; + const size_t T = getParentEdgeAt(DATA_INDEX)->getMemory().getStaticDims()[1]; + ; + const int C = getParentEdgeAt(DATA_INDEX)->getMemory().getStaticDims()[2]; + ; const size_t TC = T * C; int blankIndex = C - 1; if (inputShapes.size() > BLANK_INDEX) - blankIndex = (getSrcDataAtPortAs(BLANK_INDEX))[0]; + blankIndex = (getSrcDataAtPortAs(BLANK_INDEX))[0]; size_t workAmount = 0; for (size_t b = 0; b < B; b++) { if (sequenceLengths[b] > static_cast(T)) { - std::string errorMsg = errorPrefix - + ". 
Sequence length " + std::to_string(sequenceLengths[b]) - + " cannot be greater than according decoded classes dimension size " - + std::to_string(getChildEdgeAt(DECODED_CLASSES_INDEX)->getMemory().getStaticDims()[1]); + std::string errorMsg = + errorPrefix + ". Sequence length " + std::to_string(sequenceLengths[b]) + + " cannot be greater than according decoded classes dimension size " + + std::to_string(getChildEdgeAt(DECODED_CLASSES_INDEX)->getMemory().getStaticDims()[1]); OPENVINO_THROW(errorMsg); } workAmount += sequenceLengths[b]; @@ -142,7 +146,7 @@ void CTCGreedyDecoderSeqLen::execute(dnnl::stream strm) { } tStart = 0lu; } - }; // thread body + }; // thread body parallel_nt(0, threadBody); @@ -153,8 +157,7 @@ void CTCGreedyDecoderSeqLen::execute(dnnl::stream strm) { int* shiftedOut = decodedClasses + b * T; for (size_t t = 0; t < actualSeqLen; ++t) { - if (*shiftedOut != blankIndex && - !(mergeRepeated && *shiftedOut == prevClassIdx)) { + if (*shiftedOut != blankIndex && !(mergeRepeated && *shiftedOut == prevClassIdx)) { decodedClasses[outputIndex++] = *shiftedOut; } prevClassIdx = *shiftedOut; @@ -177,6 +180,6 @@ bool CTCGreedyDecoderSeqLen::needPrepareParams() const { return false; } -} // namespace node -} // namespace intel_cpu -} // namespace ov +} // namespace node +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/ctc_greedy_decoder_seq_len.h b/src/plugins/intel_cpu/src/nodes/ctc_greedy_decoder_seq_len.h index 4e7d14fd23556a..95ab8ef84b07eb 100644 --- a/src/plugins/intel_cpu/src/nodes/ctc_greedy_decoder_seq_len.h +++ b/src/plugins/intel_cpu/src/nodes/ctc_greedy_decoder_seq_len.h @@ -14,7 +14,7 @@ class CTCGreedyDecoderSeqLen : public Node { public: CTCGreedyDecoderSeqLen(const std::shared_ptr& op, const GraphContext::CPtr context); - void getSupportedDescriptors() override {}; + void getSupportedDescriptors() override{}; void initSupportedPrimitiveDescriptors() override; void execute(dnnl::stream strm) override; bool created() const override; @@ -34,6 +34,6 @@ class CTCGreedyDecoderSeqLen : public Node { std::string errorPrefix; }; -} // namespace node -} // namespace intel_cpu -} // namespace ov +} // namespace node +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/ctc_loss.cpp b/src/plugins/intel_cpu/src/nodes/ctc_loss.cpp index 6d09b0aea7e934..0ed3d95503eb62 100644 --- a/src/plugins/intel_cpu/src/nodes/ctc_loss.cpp +++ b/src/plugins/intel_cpu/src/nodes/ctc_loss.cpp @@ -2,11 +2,12 @@ // SPDX-License-Identifier: Apache-2.0 // +#include "openvino/op/ctc_loss.hpp" + #include -#include "openvino/op/ctc_loss.hpp" -#include "openvino/core/parallel.hpp" #include "ctc_loss.h" +#include "openvino/core/parallel.hpp" namespace ov { namespace intel_cpu { @@ -53,9 +54,7 @@ void CTCLoss::initSupportedPrimitiveDescriptors() { for (size_t i = 1; i < inputShapes.size(); ++i) inDataConf.emplace_back(LayoutType::ncsp, ov::element::i32); - addSupportedPrimDesc(inDataConf, - {{LayoutType::ncsp, ov::element::f32}}, - impl_desc_type::ref_any); + addSupportedPrimDesc(inDataConf, {{LayoutType::ncsp, ov::element::f32}}, impl_desc_type::ref_any); } void CTCLoss::executeDynamicImpl(dnnl::stream strm) { @@ -71,7 +70,7 @@ void CTCLoss::execute(dnnl::stream strm) { const int* labelsLength = getSrcDataAtPortAs(3); float* dstData = getDstDataAtPortAs(0); - const auto &inDims = getParentEdgeAt(0)->getMemory().getStaticDims(); + const auto& inDims = getParentEdgeAt(0)->getMemory().getStaticDims(); const size_t batchNum = 
inDims[0]; const size_t maxTime = inDims[1]; const size_t classesNum = inDims[2]; @@ -96,11 +95,11 @@ void CTCLoss::execute(dnnl::stream strm) { for (size_t b = start; b < end; b++) { if (logitsLength[b] < 0 || labelsLength[b] < 0 || logitsLength[b] > static_cast(maxTime) || labelsLength[b] > logitsLength[b]) { - errorMsgB[ithr] = errorPrefix + ". Logit length cannot be greater than max sequence length. " - + "Label length cannot be greater than a logit length" - + " and both cannot be negative.\nMaxSeqLen: " - + std::to_string(maxTime) + "; Logit len: " + std::to_string(logitsLength[b]) - + "; Label len: " + std::to_string(labelsLength[b]); + errorMsgB[ithr] = errorPrefix + ". Logit length cannot be greater than max sequence length. " + + "Label length cannot be greater than a logit length" + + " and both cannot be negative.\nMaxSeqLen: " + std::to_string(maxTime) + + "; Logit len: " + std::to_string(logitsLength[b]) + + "; Label len: " + std::to_string(labelsLength[b]); returnCode = -1; return; } @@ -151,8 +150,8 @@ void CTCLoss::execute(dnnl::stream strm) { for (size_t ll = 0; ll < actualLogitLen; ll++) { logProbabilities[ll].resize(decodedTargetLen); } - } // for batch - }; // threadBody_1 + } // for batch + }; // threadBody_1 parallel_nt(threads_num, threadBody_1); if (returnCode != 0) { @@ -211,7 +210,7 @@ void CTCLoss::execute(dnnl::stream strm) { } sT = 0lu; } // for batch - }; // threadBody_2 + }; // threadBody_2 parallel_nt(0, threadBody_2); @@ -236,8 +235,8 @@ void CTCLoss::execute(dnnl::stream strm) { if (start >= end) return; - // As per Connectionist Temporal Classification - Labeling Unsegmented Sequence Data with Recurrent Neural Networks: - // Graves et al., 2016, paragraph 4.1 (10) + // As per Connectionist Temporal Classification - Labeling Unsegmented Sequence Data with Recurrent Neural + // Networks: Graves et al., 2016, paragraph 4.1 (10) for (size_t b = start; b < end; b++) { auto& targetD = targetDB[b]; auto& logProbabilities = logProbabilitiesB[b]; @@ -250,21 +249,19 @@ void CTCLoss::execute(dnnl::stream strm) { for (int t = actualLogitLen - 2; t >= 0; t--) { const int t_1 = t + 1; for (int s = std::max(0, decodedTargetLen - (2 * (actualLogitLen - t))); - s < std::min(decodedTargetLen, 2 * (t_1)); s++) { + s < std::min(decodedTargetLen, 2 * (t_1)); + s++) { if (ctcMergeRepeated || targetD[s] == blankIndex) { - logBwd[s][t] = sumLogs(logBwd[s][t], - logBwd[s][t_1] + logProbabilities[t_1][s]); + logBwd[s][t] = sumLogs(logBwd[s][t], logBwd[s][t_1] + logProbabilities[t_1][s]); } if (s + 1 < decodedTargetLen) { - logBwd[s][t] = sumLogs(logBwd[s][t], - logBwd[s + 1][t_1] + logProbabilities[t_1][s + 1]); + logBwd[s][t] = sumLogs(logBwd[s][t], logBwd[s + 1][t_1] + logProbabilities[t_1][s + 1]); } if (s + 2 < decodedTargetLen) { if (targetD[s] != blankIndex && (!ctcMergeRepeated || (targetD[s] != targetD[s + 2]))) { - logBwd[s][t] = sumLogs(logBwd[s][t], - logBwd[s + 2][t_1] + logProbabilities[t_1][s + 2]); + logBwd[s][t] = sumLogs(logBwd[s][t], logBwd[s + 2][t_1] + logProbabilities[t_1][s + 2]); } } } @@ -274,8 +271,8 @@ void CTCLoss::execute(dnnl::stream strm) { logBwd[1][0] += logProbabilities[0][(decodedTargetLen > 1) ? 
1 : 0]; dstData[b] = -sumLogs(logBwd[0][0], logBwd[1][0]); - } // for batch - }; // threadBody_3 + } // for batch + }; // threadBody_3 parallel_nt(0, threadBody_3); } @@ -284,6 +281,6 @@ bool CTCLoss::created() const { return getType() == Type::CTCLoss; } -} // namespace node -} // namespace intel_cpu -} // namespace ov +} // namespace node +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/ctc_loss.h b/src/plugins/intel_cpu/src/nodes/ctc_loss.h index a07d8f0fc59479..d1a66df3b92b89 100644 --- a/src/plugins/intel_cpu/src/nodes/ctc_loss.h +++ b/src/plugins/intel_cpu/src/nodes/ctc_loss.h @@ -14,7 +14,7 @@ class CTCLoss : public Node { public: CTCLoss(const std::shared_ptr& op, const GraphContext::CPtr context); - void getSupportedDescriptors() override {}; + void getSupportedDescriptors() override{}; void initSupportedPrimitiveDescriptors() override; void execute(dnnl::stream strm) override; bool created() const override; @@ -22,7 +22,9 @@ class CTCLoss : public Node { static bool isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept; void executeDynamicImpl(dnnl::stream strm) override; - bool needPrepareParams() const override { return false; }; + bool needPrepareParams() const override { + return false; + }; private: bool ctcMergeRepeated; @@ -32,6 +34,6 @@ class CTCLoss : public Node { std::string errorPrefix; }; -} // namespace node -} // namespace intel_cpu -} // namespace ov +} // namespace node +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/cum_sum.cpp b/src/plugins/intel_cpu/src/nodes/cum_sum.cpp index e411283e661585..43e69e29916430 100644 --- a/src/plugins/intel_cpu/src/nodes/cum_sum.cpp +++ b/src/plugins/intel_cpu/src/nodes/cum_sum.cpp @@ -3,15 +3,16 @@ // #include "cum_sum.h" + +#include +#include + #include "openvino/core/parallel.hpp" #include "openvino/core/type/float16.hpp" #include "openvino/opsets/opset1.hpp" #include "openvino/opsets/opset3.hpp" #include "utils/bfloat16.hpp" -#include -#include - namespace ov { namespace intel_cpu { namespace node { @@ -38,10 +39,11 @@ CumSum::CumSum(const std::shared_ptr& op, const GraphContext::CPtr con errorPrefix = "CumSum layer with name '" + op->get_friendly_name() + "' "; - if ((getOriginalInputsNumber() != numOfInputs && getOriginalInputsNumber() != (numOfInputs - 1)) || getOriginalOutputsNumber() != 1) + if ((getOriginalInputsNumber() != numOfInputs && getOriginalInputsNumber() != (numOfInputs - 1)) || + getOriginalOutputsNumber() != 1) OPENVINO_THROW(errorPrefix, " has incorrect number of input/output edges!"); - const auto &dataShape = getInputShapeAtPort(CUM_SUM_DATA); + const auto& dataShape = getInputShapeAtPort(CUM_SUM_DATA); numOfDims = dataShape.getRank(); if (numOfDims < 1) { OPENVINO_THROW(errorPrefix, " doesn't support 'data' input tensor with rank: ", numOfDims); @@ -70,13 +72,19 @@ void CumSum::initSupportedPrimitiveDescriptors() { dataPrecision = getOriginalInputPrecisionAtPort(CUM_SUM_DATA); if (!one_of(dataPrecision, - ov::element::i8, ov::element::u8, - ov::element::i16, ov::element::i32, ov::element::i64, ov::element::u64, - ov::element::bf16, ov::element::f16, ov::element::f32)) + ov::element::i8, + ov::element::u8, + ov::element::i16, + ov::element::i32, + ov::element::i64, + ov::element::u64, + ov::element::bf16, + ov::element::f16, + ov::element::f32)) OPENVINO_THROW(errorPrefix, " has unsupported 'data' input precision: ", dataPrecision.get_type_name()); if (inputShapes.size() == numOfInputs) { - 
const auto &axisTensorPrec = getOriginalInputPrecisionAtPort(AXIS); + const auto& axisTensorPrec = getOriginalInputPrecisionAtPort(AXIS); if (axisTensorPrec != ov::element::i32 && axisTensorPrec != ov::element::i64) OPENVINO_THROW(errorPrefix, " has unsupported 'axis' input precision: ", axisTensorPrec.get_type_name()); } @@ -87,16 +95,17 @@ void CumSum::initSupportedPrimitiveDescriptors() { for (size_t i = 1; i < inputShapes.size(); ++i) inDataConf.emplace_back(LayoutType::ncsp, ov::element::i32); - addSupportedPrimDesc(inDataConf, - {{LayoutType::ncsp, dataPrecision}}, - impl_desc_type::ref_any); + addSupportedPrimDesc(inDataConf, {{LayoutType::ncsp, dataPrecision}}, impl_desc_type::ref_any); } void CumSum::execute(dnnl::stream strm) { if (inputShapes.size() == numOfInputs) axis = getAxis(getParentEdgeAt(AXIS)->getMemory(), getParentEdgeAt(CUM_SUM_DATA)->getMemory()); - OV_SWITCH(intel_cpu, CumSumExecute, this, dataPrecision, + OV_SWITCH(intel_cpu, + CumSumExecute, + this, + dataPrecision, OV_CASE(ov::element::i8, int8_t), OV_CASE(ov::element::u8, uint8_t), OV_CASE(ov::element::i16, int16_t), @@ -110,9 +119,10 @@ void CumSum::execute(dnnl::stream strm) { template void CumSum::exec() { - const auto *input = getSrcDataAtPortAs(CUM_SUM_DATA); - auto *output = getDstDataAtPortAs(0); - const VectorDims strides = getParentEdgeAt(CUM_SUM_DATA)->getMemory().getDescWithType()->getStrides(); + const auto* input = getSrcDataAtPortAs(CUM_SUM_DATA); + auto* output = getDstDataAtPortAs(0); + const VectorDims strides = + getParentEdgeAt(CUM_SUM_DATA)->getMemory().getDescWithType()->getStrides(); if (reverse) { if (exclusive) { @@ -130,16 +140,17 @@ void CumSum::exec() { } template -void CumSum::cumSum(const dataType *input, dataType *output, const VectorDims &strides) { +void CumSum::cumSum(const dataType* input, dataType* output, const VectorDims& strides) { VectorDims iterationRange(numOfDims - 1); size_t j = 0; - const auto &shape = getParentEdgeAt(CUM_SUM_DATA)->getMemory().getStaticDims(); + const auto& shape = getParentEdgeAt(CUM_SUM_DATA)->getMemory().getStaticDims(); for (size_t i = 0; i < shape.size(); i++) { if (i == axis) continue; iterationRange[j++] = shape[i]; } - size_t work_amount_dst = std::accumulate(iterationRange.begin(), iterationRange.end(), size_t(1), std::multiplies()); + size_t work_amount_dst = + std::accumulate(iterationRange.begin(), iterationRange.end(), size_t(1), std::multiplies()); parallel_nt(0, [&](const int ithr, const int nthr) { size_t start = 0, end = 0; VectorDims counters(numOfDims - 1, 0); @@ -159,32 +170,32 @@ void CumSum::cumSum(const dataType *input, dataType *output, const VectorDims &s size_t startOffset = getStartOffset(forStartOffset, strides); - const dataType *inputStart = input + startOffset; - dataType *outputStart = output + startOffset; + const dataType* inputStart = input + startOffset; + dataType* outputStart = output + startOffset; size_t offset = strides[axis]; if (reverse) { if (exclusive) { - outputStart[offset*(shape[axis] - 1)] = 0; + outputStart[offset * (shape[axis] - 1)] = 0; for (int64_t i = shape[axis] - 2; i >= 0; i--) { - outputStart[i*offset] = inputStart[(i+1)*offset] + outputStart[(i+1)*offset]; + outputStart[i * offset] = inputStart[(i + 1) * offset] + outputStart[(i + 1) * offset]; } } else { - outputStart[offset*(shape[axis] - 1)] = inputStart[offset * (shape[axis] - 1)]; + outputStart[offset * (shape[axis] - 1)] = inputStart[offset * (shape[axis] - 1)]; for (int64_t i = shape[axis] - 2; i >= 0; i--) { - outputStart[i*offset] = 
inputStart[i*offset] + outputStart[(i+1)*offset]; + outputStart[i * offset] = inputStart[i * offset] + outputStart[(i + 1) * offset]; } } } else { if (exclusive) { outputStart[0] = 0; for (size_t i = 1; i < shape[axis]; i++) { - outputStart[i*offset] = inputStart[(i-1)*offset] + outputStart[(i-1)*offset]; + outputStart[i * offset] = inputStart[(i - 1) * offset] + outputStart[(i - 1) * offset]; } } else { outputStart[0] = inputStart[0]; for (size_t i = 1; i < shape[axis]; i++) { - outputStart[i*offset] = inputStart[i*offset] + outputStart[(i-1)*offset]; + outputStart[i * offset] = inputStart[i * offset] + outputStart[(i - 1) * offset]; } } } @@ -219,7 +230,8 @@ inline void CumSum::parallelItStep(std::vector& counters, const std::vec } } -inline size_t CumSum::getStartOffset(const std::vector &forStartOffset, const std::vector& strides) const { +inline size_t CumSum::getStartOffset(const std::vector& forStartOffset, + const std::vector& strides) const { size_t startOffset = 0; for (size_t idx = 0; idx < forStartOffset.size(); ++idx) { startOffset += forStartOffset[idx] * strides[idx]; @@ -232,19 +244,19 @@ size_t CumSum::getAxis(const IMemory& _axis, const IMemory& _data) const { const int64_t dataShapeSize = static_cast(_data.getShape().getRank()); int64_t axisValueFromBlob = 0; switch (axisPrecision) { - case ov::element::i32 : { - const auto *axisPtr = _axis.getDataAs(); - axisValueFromBlob = static_cast(axisPtr[0]); - break; - } - case ov::element::i64 : { - const auto *axisPtr = _axis.getDataAs(); - axisValueFromBlob = axisPtr[0]; - break; - } - default : { - OPENVINO_THROW(errorPrefix, " doesn't support 'axis' input with precision: ", axisPrecision.get_type_name()); - } + case ov::element::i32: { + const auto* axisPtr = _axis.getDataAs(); + axisValueFromBlob = static_cast(axisPtr[0]); + break; + } + case ov::element::i64: { + const auto* axisPtr = _axis.getDataAs(); + axisValueFromBlob = axisPtr[0]; + break; + } + default: { + OPENVINO_THROW(errorPrefix, " doesn't support 'axis' input with precision: ", axisPrecision.get_type_name()); + } } if (axisValueFromBlob < -dataShapeSize || axisValueFromBlob > dataShapeSize - 1) OPENVINO_THROW(errorPrefix, " has axis with a value out of range: ", axisValueFromBlob); @@ -263,6 +275,6 @@ void CumSum::executeDynamicImpl(dnnl::stream strm) { execute(strm); } -} // namespace node -} // namespace intel_cpu -} // namespace ov +} // namespace node +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/cum_sum.h b/src/plugins/intel_cpu/src/nodes/cum_sum.h index b0aad351d55f93..139c7205e81fcc 100644 --- a/src/plugins/intel_cpu/src/nodes/cum_sum.h +++ b/src/plugins/intel_cpu/src/nodes/cum_sum.h @@ -14,7 +14,7 @@ class CumSum : public Node { public: CumSum(const std::shared_ptr& op, const GraphContext::CPtr context); - void getSupportedDescriptors() override {}; + void getSupportedDescriptors() override{}; void initSupportedPrimitiveDescriptors() override; void execute(dnnl::stream strm) override; bool created() const override; @@ -29,13 +29,13 @@ class CumSum : public Node { void exec(); template - void cumSum(const dataType *input, dataType *output, const std::vector &strides); + void cumSum(const dataType* input, dataType* output, const std::vector& strides); void parallelItInit(size_t start, std::vector& counters, const std::vector& iterationRange); inline void parallelItStep(std::vector& counters, const std::vector& iterationRange); - inline size_t getStartOffset(const std::vector &forStartOffset, const std::vector& 
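// For reference, a minimal standalone sketch of the scan the CumSum kernel above performs
// (forward/reverse x inclusive/exclusive), written for a plain 1-D buffer instead of the
// node's strided memory; the name cumsum_1d and the use of std::vector are illustrative only.
#include <cstddef>
#include <vector>
std::vector<float> cumsum_1d(const std::vector<float>& in, bool exclusive, bool reverse) {
    const size_t n = in.size();
    std::vector<float> out(n);
    float acc = 0.0f;
    for (size_t k = 0; k < n; ++k) {
        const size_t i = reverse ? n - 1 - k : k;  // walk back-to-front when reverse == true
        out[i] = exclusive ? acc : acc + in[i];    // exclusive mode drops the current element
        acc += in[i];
    }
    return out;
}
// For {1, 2, 3, 4}: inclusive forward -> {1, 3, 6, 10}, exclusive forward -> {0, 1, 3, 6},
// inclusive reverse -> {10, 9, 7, 4}, exclusive reverse -> {9, 7, 4, 0},
// matching the four branches of CumSum::cumSum above.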
strides) const; + inline size_t getStartOffset(const std::vector& forStartOffset, const std::vector& strides) const; size_t getAxis(const IMemory& _axis, const IMemory& _data) const; @@ -48,7 +48,7 @@ class CumSum : public Node { ov::element::Type dataPrecision; std::string errorPrefix; - template + template struct CumSumExecute { void operator()(CumSum* node) { node->exec(); @@ -56,6 +56,6 @@ class CumSum : public Node { }; }; -} // namespace node -} // namespace intel_cpu -} // namespace ov +} // namespace node +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/deconv.cpp b/src/plugins/intel_cpu/src/nodes/deconv.cpp index 2ee858e730c900..f30e3481afbb3d 100644 --- a/src/plugins/intel_cpu/src/nodes/deconv.cpp +++ b/src/plugins/intel_cpu/src/nodes/deconv.cpp @@ -4,16 +4,15 @@ #include "deconv.h" -#include "dnnl_extension_utils.h" #include #include -#include "common/primitive_hashing_utils.hpp" #include #include -#include "cpu/x64/cpu_isa_traits.hpp" -#include "shape_inference/shape_inference_ngraph.hpp" +#include "common/primitive_hashing_utils.hpp" +#include "cpu/x64/cpu_isa_traits.hpp" +#include "dnnl_extension_utils.h" #include "eltwise.h" #include "fake_quantize.h" #include "input.h" @@ -21,16 +20,16 @@ #include "openvino/core/parallel.hpp" #include "openvino/opsets/opset1.hpp" #include "openvino/runtime/make_tensor.hpp" -#include "utils/general_utils.h" +#include "shape_inference/shape_inference.hpp" #include "utils/cpu_utils.hpp" +#include "utils/general_utils.h" #if defined(OV_CPU_WITH_ACL) -#include "executors/acl/acl_utils.hpp" -#include "utils/debug_capabilities.h" +# include "executors/acl/acl_utils.hpp" +# include "utils/debug_capabilities.h" #endif #include - #include #include @@ -40,8 +39,8 @@ namespace ov { namespace intel_cpu { namespace node { -using DefaultDeconvDescs = std::pair; +using DefaultDeconvDescs = + std::pair; using Int8DeconvDesc = dnnl::deconvolution_forward::primitive_desc; namespace { @@ -92,7 +91,7 @@ size_t DeconvKey::hash() const { return seed; } -bool DeconvKey::operator==(const DeconvKey &rhs) const { +bool DeconvKey::operator==(const DeconvKey& rhs) const { bool retVal = true; if (inp0 != rhs.inp0) { retVal = retVal && inp0 && rhs.inp0 && inp0->getDnnlDesc() == rhs.inp0->getDnnlDesc(); @@ -122,29 +121,31 @@ bool DeconvKey::operator==(const DeconvKey &rhs) const { } /** - * Deconvolution shape inference factory. It defines the input mask depending on the existence of the `output_shape` input. - * Since in case it exists, plugin should pass the input data to the shape inference function. + * Deconvolution shape inference factory. It defines the input mask depending on the existence of the `output_shape` + * input. Since in case it exists, plugin should pass the input data to the shape inference function. * */ class DeconfolutionShapeInferFactory : public ShapeInferFactory { public: - DeconfolutionShapeInferFactory(std::shared_ptr op) : m_op(op) {} + DeconfolutionShapeInferFactory(std::shared_ptr op) : m_op(std::move(op)) {} + ShapeInferPtr makeShapeInfer() const override { - if (m_op->get_input_size() > 2) { - return std::make_shared(make_shape_inference(m_op), PortMask(2)); - } - return std::make_shared(make_shape_inference(m_op), EMPTY_PORT_MASK); + const auto port_mask = (m_op->get_input_size() > 2) ? 
PortMask(2) : EMPTY_PORT_MASK; + return make_shape_inference(m_op, port_mask); } + private: std::shared_ptr m_op; }; -} // namespace +} // namespace -bool Deconvolution::isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept { +bool Deconvolution::isSupportedOperation(const std::shared_ptr& op, + std::string& errorMessage) noexcept { try { if (std::dynamic_pointer_cast(op) == nullptr && - std::dynamic_pointer_cast(op) == nullptr) { - errorMessage = "Only opset1 ConvolutionBackpropData and GroupConvolutionBackpropData operations are supported"; + std::dynamic_pointer_cast(op) == nullptr) { + errorMessage = + "Only opset1 ConvolutionBackpropData and GroupConvolutionBackpropData operations are supported"; return false; } size_t ndims = op->get_input_partial_shape(0).rank().get_length(); @@ -152,7 +153,8 @@ bool Deconvolution::isSupportedOperation(const std::shared_ptr& errorMessage = "Only 3D, 4D and 5D blobs are supported as input"; return false; } - if (op->get_input_partial_shape(1).is_dynamic() || (op->get_input_size() > 2 && op->get_input_partial_shape(2).is_dynamic())) { + if (op->get_input_partial_shape(1).is_dynamic() || + (op->get_input_size() > 2 && op->get_input_partial_shape(2).is_dynamic())) { errorMessage = "Doesn't support dynamic shapes for 'weights' and 'output_shape' inputs"; return false; } @@ -162,8 +164,8 @@ bool Deconvolution::isSupportedOperation(const std::shared_ptr& return true; } -Deconvolution::Deconvolution(const std::shared_ptr& op, - const GraphContext::CPtr context) : Node(op, context, DeconfolutionShapeInferFactory(op)) { +Deconvolution::Deconvolution(const std::shared_ptr& op, const GraphContext::CPtr context) + : Node(op, context, DeconfolutionShapeInferFactory(op)) { std::string errorMessage; errorPrefix = "Deconvolution node with name '" + getName() + "' "; if (!isSupportedOperation(op, errorMessage)) @@ -176,7 +178,7 @@ Deconvolution::Deconvolution(const std::shared_ptr& op, IC = weightDims[0]; OC = weightDims[1]; - expectedBiasDims = {OC}; + expectedBiasDims = {OC}; groupNum = 1; withGroups = false; @@ -199,7 +201,7 @@ Deconvolution::Deconvolution(const std::shared_ptr& op, groupNum = weightDims[0]; IC = groupNum * weightDims[1]; OC = groupNum * weightDims[2]; - expectedBiasDims = {OC}; + expectedBiasDims = {OC}; withGroups = groupNum > 1; isDW = withGroups && groupNum == OC && groupNum == IC; @@ -229,8 +231,11 @@ Deconvolution::Deconvolution(const std::shared_ptr& op, lastOutputSpatialDims = ov::as_type(op->get_input_node_ptr(2))->cast_vector(); if (externOutShape && isDynamicNode()) { const auto spDimsNum = getInputShapeAtPort(0).getRank() - 2; - if (getInputShapeAtPort(2).getStaticDims()[0] != spDimsNum || (isConstOutShape && lastOutputSpatialDims.size() != spDimsNum)) { - OPENVINO_THROW(errorPrefix, "'output_shape' input has incorrect number of elements. Expected = ", spDimsNum); + if (getInputShapeAtPort(2).getStaticDims()[0] != spDimsNum || + (isConstOutShape && lastOutputSpatialDims.size() != spDimsNum)) { + OPENVINO_THROW(errorPrefix, + "'output_shape' input has incorrect number of elements. Expected = ", + spDimsNum); } } @@ -240,8 +245,10 @@ Deconvolution::Deconvolution(const std::shared_ptr& op, for (size_t i = 0; i < spatialRank; ++i) is1x1 = is1x1 && *(weightDimsReversItr++) == 1; // 1x1 deconv has some test case failed. 
The cause is upstream ONEDNN unsupported brgemm implementation cases are - // enabled in forked ONEDNNN https://github.com/openvinotoolkit/oneDNN/blob/117e287000b48a34a7218fcaa274a91571141728/src/common/convolution.cpp#L138. - // Some test cases on 1x1 kernel failed on accuracy check, current WA is disabling brgemm deconv implementation for such cases. + // enabled in forked ONEDNNN + // https://github.com/openvinotoolkit/oneDNN/blob/117e287000b48a34a7218fcaa274a91571141728/src/common/convolution.cpp#L138. + // Some test cases on 1x1 kernel failed on accuracy check, current WA is disabling brgemm deconv implementation for + // such cases. if (is1x1 && deconvAttrs.paddingL != deconvAttrs.paddingR) { // case1: Specify asymmetric padding explicitly asymmetricPaddingAnd1x1 = true; @@ -290,7 +297,9 @@ bool Deconvolution::canBeExecutedInInt8() const { return false; if (!impl::cpu::x64::mayiuse(impl::cpu::x64::avx512_core)) { const auto& inMaxDims = getOutputShapeAtPort(0).getMaxDims(); - if (std::any_of(inMaxDims.begin(), inMaxDims.end(), [](Dim dim) { return dim == Shape::UNDEFINED_DIM; })) { + if (std::any_of(inMaxDims.begin(), inMaxDims.end(), [](Dim dim) { + return dim == Shape::UNDEFINED_DIM; + })) { return false; } // heuristicConst = 2^26 @@ -310,7 +319,8 @@ bool Deconvolution::canBeExecutedInInt8() const { // not supported in oneDNN int channelBlock = impl::cpu::x64::mayiuse(impl::cpu::x64::avx512_core) ? 16 - : impl::cpu::x64::mayiuse(impl::cpu::x64::avx2) ? 8 : 4; + : impl::cpu::x64::mayiuse(impl::cpu::x64::avx2) ? 8 + : 4; if (withGroups && !isDW && (IC % channelBlock != 0 || OC % channelBlock != 0)) return false; if (!impl::cpu::x64::mayiuse(impl::cpu::x64::avx512_core) && deconvAttrs.stride.back() > 3) @@ -331,16 +341,18 @@ bool Deconvolution::canBeExecutedInInt8() const { bool Deconvolution::canFuse(const NodePtr& node) const { if (canBeExecutedInInt8()) return canFuseSimpleOperation(node); - // Upstream ONEDNN conv_backward_data primitive can't support any post-ops, fork onednn added depthwise support in conv_backward_data JIT implementation. - // ONEDNN deconv primitive can support most of post-ops, but the post-ops implementation details are different. - // So current deconv implementation list in onednn has 2 kinds of implements: + // Upstream ONEDNN conv_backward_data primitive can't support any post-ops, fork onednn added depthwise support in + // conv_backward_data JIT implementation. ONEDNN deconv primitive can support most of post-ops, but the post-ops + // implementation details are different. So current deconv implementation list in onednn has 2 kinds of implements: // 1. deconv implementation with JIT post-ops supported in the kernel (such as brgdeconv) - // 2. forked conv_data_backwards implementation with JIT depthwise post-ops + reference implementation for other post ops. - // Considering that some deconv fallback on the JIT implementation, we limit the post ops fusing to avoid regressions. - // Regression with stylegan2 int8 model pattern: - // none-quantzied deconv(with none-const weight) + FQ pattern fall back on JIT because of onednn limitation. (fall back ticket MFDNN-11577). - // If FQ is fused, it runs with the ref post-ops implementation. - // @todo: if onednn can ensure all the deconv run with the brgemm implementation, we can unify the fuse criteria between int8 and fp32 use cases. + // 2. forked conv_data_backwards implementation with JIT depthwise post-ops + reference implementation for other + // post ops. 
+ // Considering that some deconv fallback on the JIT implementation, we limit the post ops fusing to avoid + // regressions. Regression with stylegan2 int8 model pattern: none-quantzied deconv(with none-const weight) + FQ + // pattern fall back on JIT because of onednn limitation. (fall back ticket MFDNN-11577). If FQ is fused, it runs + // with the ref post-ops implementation. + // @todo: if onednn can ensure all the deconv run with the brgemm implementation, we can unify the fuse criteria + // between int8 and fp32 use cases. return (fusedWith.empty() && node->canBePerformedAsScaleShift(this)); } @@ -361,8 +373,10 @@ std::pair Deconvolution::makeDummyInOutShape() { const auto& maxDims = shape.getMaxDims(); const auto& dims = shape.getDims(); for (size_t i = 0; i < dims.size() - 2; ++i) { - lastOutputSpatialDims[i] = dims[i + 2] == Shape::UNDEFINED_DIM ? std::min(maxDims[i + 2], - std::max(minDims[i + 2], static_cast(64))) : dims[i + 2]; + lastOutputSpatialDims[i] = + dims[i + 2] == Shape::UNDEFINED_DIM + ? std::min(maxDims[i + 2], std::max(minDims[i + 2], static_cast(64))) + : dims[i + 2]; } } @@ -381,14 +395,18 @@ std::pair Deconvolution::makeDummyInOutShape() { for (size_t i = 0; i < origInDims.size() - 2; i++) { if (origInDims[i + 2] == Shape::UNDEFINED_DIM && (origInMinDims[i + 2] != 0 || origInMaxDims[i + 2] != Shape::UNDEFINED_DIM)) { - // if input shape is dynamic and bounded, paddings should be computed basing on the following limitations: + // if input shape is dynamic and bounded, paddings should be computed basing on the following + // limitations: // 1. paddings must not be negative - // 2. the result padding must have such a value to keep the dummy dimensions inside the predefined interval - auto c1 = lastOutputSpatialDims[i] - deconvAttrs.outputPadding[i] - 1 - - (deconvAttrs.dilation[i] + 1) * static_cast(weightDims[wghOffset + 2 + i] - 1); + // 2. the result padding must have such a value to keep the dummy dimensions inside the + // predefined interval + auto c1 = + lastOutputSpatialDims[i] - deconvAttrs.outputPadding[i] - 1 - + (deconvAttrs.dilation[i] + 1) * static_cast(weightDims[wghOffset + 2 + i] - 1); if (origInMaxDims[i + 2] != Shape::UNDEFINED_DIM) { - auto upper_bound = deconvAttrs.stride[i] * static_cast(origInMaxDims[i + 2] - 1) - c1; + auto upper_bound = + deconvAttrs.stride[i] * static_cast(origInMaxDims[i + 2] - 1) - c1; if (upper_bound < 0) { OPENVINO_THROW(errorPrefix, ": paddings for dummy shapes can't be computed"); } @@ -404,9 +422,11 @@ std::pair Deconvolution::makeDummyInOutShape() { for (size_t i = 0; i < inputDims.size() - 2; i++) { if (origInDims[2 + i] == Shape::UNDEFINED_DIM) { - inputDims[2 + i] = (lastOutputSpatialDims[i] - (deconvAttrs.dilation[i] + 1) * - (weightDims[wghOffset + 2 + i] - 1) - 1 + paddings[i] - deconvAttrs.outputPadding[i]) / - deconvAttrs.stride[i] + 1; + inputDims[2 + i] = (lastOutputSpatialDims[i] - + (deconvAttrs.dilation[i] + 1) * (weightDims[wghOffset + 2 + i] - 1) - 1 + + paddings[i] - deconvAttrs.outputPadding[i]) / + deconvAttrs.stride[i] + + 1; } } } @@ -457,12 +477,14 @@ void Deconvolution::getSupportedDescriptors() { if (!descs.empty()) return; isInt8 = canBeExecutedInInt8(); - deconvAttrs.withBiasesParam = withBiases = externOutShape ? getOriginalInputsNumber() == 4 : getOriginalInputsNumber() == 3; + deconvAttrs.withBiasesParam = withBiases = + externOutShape ? 
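// A hedged sketch of the dummy-dimension rule used in makeDummyInOutShape above: an undefined
// dynamic spatial dimension is materialised as 64 clamped into the [min, max] bound reported
// by the shape. The helper name below is illustrative, not part of the node's API.
#include <algorithm>
#include <cstdint>
inline int64_t dummy_spatial_dim(bool defined, int64_t dim, int64_t min_dim, int64_t max_dim) {
    return defined ? dim : std::min(max_dim, std::max(min_dim, int64_t{64}));
}
// e.g. bounds [1, 32] -> 32, bounds [128, 512] -> 128, bounds [1, 1000] -> 64.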
getOriginalInputsNumber() == 4 : getOriginalInputsNumber() == 3; ov::element::Type inPrecision = getOriginalInputPrecisionAtPort(0); ov::element::Type outPrecision = getOriginalOutputPrecisionAtPort(0); if (isInt8) { - // TODO: We have to extend jit_avx512_core_x8s8s32x_deconv_fwd_kernel from oneDNN to support BF16 output data type + // TODO: We have to extend jit_avx512_core_x8s8s32x_deconv_fwd_kernel from oneDNN to support BF16 output data + // type if (ov::element::bf16 == inPrecision) inPrecision = ov::element::f32; if (ov::element::bf16 == outPrecision) @@ -476,11 +498,12 @@ void Deconvolution::getSupportedDescriptors() { auto inputDataType = DnnlExtensionUtils::ElementTypeToDataType(inPrecision); outputDataType = DnnlExtensionUtils::ElementTypeToDataType(outPrecision); if (inputDataType == memory::data_type::bf16 || outputDataType == memory::data_type::bf16) - inputDataType = outputDataType = memory::data_type::bf16; + inputDataType = outputDataType = memory::data_type::bf16; if (inputDataType == memory::data_type::f16 || outputDataType == memory::data_type::f16) - inputDataType = outputDataType = memory::data_type::f16; + inputDataType = outputDataType = memory::data_type::f16; if (!fusedWith.empty()) { - outputDataType = DnnlExtensionUtils::ElementTypeToDataType(fusedWith[fusedWith.size() - 1]->getOriginalOutputPrecisionAtPort(0)); + outputDataType = DnnlExtensionUtils::ElementTypeToDataType( + fusedWith[fusedWith.size() - 1]->getOriginalOutputPrecisionAtPort(0)); } if (getParentEdges().size() != (withBiases ? (biasPort + 1) : biasPort)) { OPENVINO_THROW(errorPrefix, " has incorrect number of input edges"); @@ -490,7 +513,7 @@ void Deconvolution::getSupportedDescriptors() { } VectorDims inDims, outDims; std::tie(inDims, outDims) = makeDummyInOutShape(); - inShape = Shape(inDims); + inShape = Shape(inDims); outShape = Shape(outDims); initPaddingR(inShape, outShape); @@ -506,17 +529,18 @@ void Deconvolution::getSupportedDescriptors() { config.outConfs.resize(getOriginalOutputsNumber()); // ACL use same precision for all inputs config.inConfs[0].setMemDesc( - creatorsMap.at(format)->createSharedDesc(getOriginalInputPrecisionAtPort(0), getInputShapeAtPort(0))); + creatorsMap.at(format)->createSharedDesc(getOriginalInputPrecisionAtPort(0), getInputShapeAtPort(0))); config.inConfs[1].setMemDesc( - creatorsMap.at(weights_format)->createSharedDesc(getOriginalInputPrecisionAtPort(0), getInputShapeAtPort(1))); + creatorsMap.at(weights_format) + ->createSharedDesc(getOriginalInputPrecisionAtPort(0), getInputShapeAtPort(1))); for (size_t i = 2; i < getParentEdges().size(); ++i) { config.inConfs[i].setMemDesc( - creatorsMap.at(format)->createSharedDesc(getOriginalInputPrecisionAtPort(0), getInputShapeAtPort(i))); + creatorsMap.at(format)->createSharedDesc(getOriginalInputPrecisionAtPort(0), getInputShapeAtPort(i))); } for (size_t i = 0; i < config.outConfs.size(); ++i) { config.outConfs[i].setMemDesc( - creatorsMap.at(format)->createSharedDesc(getOriginalOutputPrecisionAtPort(0), getOutputShapeAtPort(i))); + creatorsMap.at(format)->createSharedDesc(getOriginalOutputPrecisionAtPort(0), getOutputShapeAtPort(i))); } std::vector srcMemoryDescs; @@ -533,7 +557,8 @@ void Deconvolution::getSupportedDescriptors() { return AclDeconvExecutorBuilder::customIsSupported(deconvAttrs, srcMemoryDescs, dstMemoryDescs); }; useACL = checkDesc(LayoutType::nspc) || checkDesc(LayoutType::ncsp); - if (useACL) return; + if (useACL) + return; #endif dnnlCompatibleWeiDims = getWeightDims(); // Construct the ONEDNN 
deconv OP weight shape. @@ -548,26 +573,30 @@ void Deconvolution::getSupportedDescriptors() { auto format = rank == 5 ? dnnl::memory::format_tag::ndhwc : rank == 4 ? dnnl::memory::format_tag::nhwc : dnnl::memory::format_tag::nwc; - MemoryDescPtr in_candidate = std::make_shared(getInputShapeAtPort(0), inputDataType, format); - MemoryDescPtr out_candidate = std::make_shared(getOutputShapeAtPort(0), outputDataType, format); + MemoryDescPtr in_candidate = + std::make_shared(getInputShapeAtPort(0), inputDataType, format); + MemoryDescPtr out_candidate = + std::make_shared(getOutputShapeAtPort(0), outputDataType, format); createDescriptor({in_candidate}, {out_candidate}); } else { for (auto format : getAvailableFormatsForDims(getInputShapeAtPort(0))) { - MemoryDescPtr in_candidate = std::make_shared(getInputShapeAtPort(0), inputDataType, format); - MemoryDescPtr out_candidate = std::make_shared(getOutputShapeAtPort(0), outputDataType, format); + MemoryDescPtr in_candidate = + std::make_shared(getInputShapeAtPort(0), inputDataType, format); + MemoryDescPtr out_candidate = + std::make_shared(getOutputShapeAtPort(0), outputDataType, format); createDescriptor({in_candidate}, {out_candidate}); } } } -void Deconvolution::initPaddingR(const Shape &inShape, const Shape &outShape) { +void Deconvolution::initPaddingR(const Shape& inShape, const Shape& outShape) { for (size_t i = 0; i < deconvAttrs.paddingR.size(); i++) { int with_group = getAlgorithm() == Algorithm::DeconvolutionGrouped ? 1 : 0; const auto& weightDims = getWeightDims(); int krn = weightDims[with_group + 2 + i]; int src = outShape.getStaticDims()[2 + i]; int dst = inShape.getStaticDims()[2 + i]; - krn = (krn - 1)*(deconvAttrs.dilation[i] + 1) + 1; + krn = (krn - 1) * (deconvAttrs.dilation[i] + 1) + 1; deconvAttrs.paddingR[i] = (dst - 1) * deconvAttrs.stride[i] - (src - krn + deconvAttrs.paddingL[i]); } } @@ -585,11 +614,22 @@ void Deconvolution::setPostOps(dnnl::primitive_attr& attr, const VectorDims& dim // For deconv OP, Deconv_OC = IC, Deconv_IC = OC. // Openvino per-channel weight scales are applied on IC/Deconv_OC dimension. // So for deconvolution, - // Weight dims in NON-Group deconv: [Deconv_OC, Deconv_IC, KH, KW], perchannel weight scale is applied on Deconv_OC DIM + // Weight dims in NON-Group deconv: [Deconv_OC, Deconv_IC, KH, KW], perchannel weight scale is applied on Deconv_OC + // DIM // weiScaleMaskPerChannel = 1 << 0 - // Weight dims in Group deconv: [Group, Deconv_OC, Deconv_IC, KH, KW], perchannel weight scale is applied on GROUP and Deconv_OC, + // Weight dims in Group deconv: [Group, Deconv_OC, Deconv_IC, KH, KW], perchannel weight scale is applied on + // GROUP and Deconv_OC, // weiScaleMaskPerChannel = ( 1 << 0 | 1 << 1) = 0x03 - DnnlPostOpsComposerLegacy dnnlpoc(getEngine(), attr, ops, postOpsArgs, dims, 1, isInt8, withGroups ? 3 : 1 << 0, getDQScales(), withBiases); + DnnlPostOpsComposerLegacy dnnlpoc(getEngine(), + attr, + ops, + postOpsArgs, + dims, + 1, + isInt8, + withGroups ? 
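// The weiScaleMaskPerChannel argument below follows the oneDNN per-dimension mask convention:
// bit i set means the dequantization scale varies along weight dimension i. A small sketch,
// assuming the weight layouts quoted in the comment above (the helper name is illustrative):
//   non-grouped weights [Deconv_OC, Deconv_IC, KH, KW]    -> mask = 1 << 0          (= 1)
//   grouped weights     [G, Deconv_OC, Deconv_IC, KH, KW] -> mask = 1 << 0 | 1 << 1 (= 0x03)
inline int weight_scale_mask(bool with_groups) {
    return with_groups ? (1 << 0 | 1 << 1) : (1 << 0);
}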
3 : 1 << 0, + getDQScales(), + withBiases); for (size_t i = 0; i < fusedWith.size(); ++i) { auto& node = fusedWith[i]; @@ -634,7 +674,7 @@ bool Deconvolution::needShapeInfer() const { return false; } -VectorDims Deconvolution::shapeInferInternal(const VectorDims &inDims, std::vector outSpDims) const { +VectorDims Deconvolution::shapeInferInternal(const VectorDims& inDims, std::vector outSpDims) const { std::vector> inputShapesRefs{std::ref(inDims), std::ref(getWeightDims())}; std::unordered_map inputValues; VectorDims outSpDimsVecShape; @@ -679,7 +719,7 @@ void Deconvolution::execute(dnnl::stream strm) { for (size_t i = 0; i < getOriginalOutputsNumber(); i++) { dstMemory.push_back(getDstMemoryAtPort(i)); } - //TODO: need to pass post ops data + // TODO: need to pass post ops data execPtrDeconvACL->exec(srcMemory, dstMemory, nullptr); return; } @@ -697,43 +737,50 @@ void Deconvolution::execute(dnnl::stream strm) { namespace { dnnl::primitive_desc createDescriptorInternal(const dnnl::memory::desc& in_candidate, - const dnnl::memory::desc& wgh_candidate, - const dnnl::memory::desc& bias_candidate, - const dnnl::memory::desc& out_candidate, - const bool with_bias, - const std::vector& stride, - const std::vector& dilation, - const ov::CoordinateDiff& paddingL, - const ov::CoordinateDiff& paddingR, - const dnnl::primitive_attr& attr, - const dnnl::engine& engine) { - auto convertDims = [] (const std::vector& orig_dims) { + const dnnl::memory::desc& wgh_candidate, + const dnnl::memory::desc& bias_candidate, + const dnnl::memory::desc& out_candidate, + const bool with_bias, + const std::vector& stride, + const std::vector& dilation, + const ov::CoordinateDiff& paddingL, + const ov::CoordinateDiff& paddingR, + const dnnl::primitive_attr& attr, + const dnnl::engine& engine) { + auto convertDims = [](const std::vector& orig_dims) { return memory::dims(orig_dims.begin(), orig_dims.end()); }; if (with_bias) { - return dnnl::deconvolution_forward::primitive_desc( - engine, - prop_kind::forward_inference, - dnnl::algorithm::deconvolution_direct, - in_candidate, wgh_candidate, bias_candidate, out_candidate, - convertDims(stride), convertDims(dilation), - convertDims(paddingL), convertDims(paddingR), - attr); + return dnnl::deconvolution_forward::primitive_desc(engine, + prop_kind::forward_inference, + dnnl::algorithm::deconvolution_direct, + in_candidate, + wgh_candidate, + bias_candidate, + out_candidate, + convertDims(stride), + convertDims(dilation), + convertDims(paddingL), + convertDims(paddingR), + attr); } else { - return dnnl::deconvolution_forward::primitive_desc( - engine, - prop_kind::forward_inference, - dnnl::algorithm::deconvolution_direct, - in_candidate, wgh_candidate, out_candidate, - convertDims(stride), convertDims(dilation), - convertDims(paddingL), convertDims(paddingR), - attr); + return dnnl::deconvolution_forward::primitive_desc(engine, + prop_kind::forward_inference, + dnnl::algorithm::deconvolution_direct, + in_candidate, + wgh_candidate, + out_candidate, + convertDims(stride), + convertDims(dilation), + convertDims(paddingL), + convertDims(paddingR), + attr); } } -} // namespace +} // namespace -Node::AttrPtr Deconvolution::makePrimitiveAttr(const VectorDims &dims) { +Node::AttrPtr Deconvolution::makePrimitiveAttr(const VectorDims& dims) { auto attr = std::make_shared(dnnl::primitive_attr()); setPostOps(*attr, dims); @@ -748,81 +795,61 @@ Node::AttrPtr Deconvolution::initPrimitiveAttr() { const std::vector& Deconvolution::getDefaultImplPriority() { static const std::vector 
priorities { impl_desc_type::unknown, - // Undef impl type is used to express use-cases there real type is unkown during compilation - // Undef has higher priority than defined types in order to force primitive selection logic to make decision based on other properties - impl_desc_type::undef, - impl_desc_type::brgconv_avx512_amx_1x1, - impl_desc_type::brgconv_avx512_amx, - impl_desc_type::jit_avx512_amx_dw, - impl_desc_type::jit_avx512_amx_1x1, - impl_desc_type::jit_avx512_amx, - impl_desc_type::brgconv_avx512_1x1, - impl_desc_type::brgconv_avx512, - impl_desc_type::jit_avx512_dw, - impl_desc_type::jit_avx512_1x1, - impl_desc_type::jit_avx512, - impl_desc_type::brgconv_avx2_1x1, - impl_desc_type::brgconv_avx2, - impl_desc_type::jit_uni_dw, - impl_desc_type::jit_uni_1x1, - impl_desc_type::jit_uni, - impl_desc_type::jit_avx2_dw, - impl_desc_type::jit_avx2_1x1, - impl_desc_type::jit_avx2, - impl_desc_type::jit_avx_dw, - impl_desc_type::jit_avx_1x1, - impl_desc_type::jit_avx, - impl_desc_type::jit_sse42_dw, - impl_desc_type::jit_sse42_1x1, - impl_desc_type::jit_sse42, + // Undef impl type is used to express use-cases there real type is unkown during compilation + // Undef has higher priority than defined types in order to force primitive selection logic to make decision + // based on other properties + impl_desc_type::undef, impl_desc_type::brgconv_avx512_amx_1x1, impl_desc_type::brgconv_avx512_amx, + impl_desc_type::jit_avx512_amx_dw, impl_desc_type::jit_avx512_amx_1x1, impl_desc_type::jit_avx512_amx, + impl_desc_type::brgconv_avx512_1x1, impl_desc_type::brgconv_avx512, impl_desc_type::jit_avx512_dw, + impl_desc_type::jit_avx512_1x1, impl_desc_type::jit_avx512, impl_desc_type::brgconv_avx2_1x1, + impl_desc_type::brgconv_avx2, impl_desc_type::jit_uni_dw, impl_desc_type::jit_uni_1x1, + impl_desc_type::jit_uni, impl_desc_type::jit_avx2_dw, impl_desc_type::jit_avx2_1x1, + impl_desc_type::jit_avx2, impl_desc_type::jit_avx_dw, impl_desc_type::jit_avx_1x1, impl_desc_type::jit_avx, + impl_desc_type::jit_sse42_dw, impl_desc_type::jit_sse42_1x1, impl_desc_type::jit_sse42, #if defined(OPENVINO_ARCH_ARM64) - impl_desc_type::jit_asimd, + impl_desc_type::jit_asimd, #endif - impl_desc_type::gemm_any, - impl_desc_type::gemm_blas, - impl_desc_type::gemm_avx512, - impl_desc_type::gemm_avx2, - impl_desc_type::gemm_avx, - impl_desc_type::gemm_sse42, - impl_desc_type::gemm_acl, - impl_desc_type::acl, - impl_desc_type::jit_gemm, - impl_desc_type::ref_any, - impl_desc_type::ref, + impl_desc_type::gemm_any, impl_desc_type::gemm_blas, impl_desc_type::gemm_avx512, impl_desc_type::gemm_avx2, + impl_desc_type::gemm_avx, impl_desc_type::gemm_sse42, impl_desc_type::gemm_acl, impl_desc_type::acl, + impl_desc_type::jit_gemm, impl_desc_type::ref_any, impl_desc_type::ref, }; if (!asymmetricPaddingAnd1x1) return priorities; static const std::vector priorities_wo_brgemm = [&] { - std::vectorresult; - std::copy_if(priorities.begin(), priorities.end(), std::back_inserter(result), - [](impl_desc_type type) { return !(type & impl_desc_type::brgconv); }); - return result;}(); + std::vector result; + std::copy_if(priorities.begin(), priorities.end(), std::back_inserter(result), [](impl_desc_type type) { + return !(type & impl_desc_type::brgconv); + }); + return result; + }(); return priorities_wo_brgemm; } bool Deconvolution::isImplicit1x1PaddingAsymmetric(const VectorDims& inputDims) { - auto isZero = [](std::ptrdiff_t i) { return i == 0; }; + auto isZero = [](std::ptrdiff_t i) { + return i == 0; + }; size_t spatialRank = 
getInputShapeAtPort(0).getRank() - 2; - if (is1x1 && std::all_of(deconvAttrs.paddingR.begin(), deconvAttrs.paddingR.end(), isZero) - && std::all_of(deconvAttrs.paddingL.begin(), deconvAttrs.paddingL.end(), isZero) - && std::all_of(deconvAttrs.outputPadding.begin(), deconvAttrs.outputPadding.end(), isZero) - ) { - auto calPaddingEnd = [](int64_t i, int64_t o, int64_t s) -> int64_t { - // Accoriding to https://pytorch.org/docs/stable/generated/torch.nn.ConvTranspose2d.html, - // output[i] = (input[i] -1) * stride[i] - 2 x padding[i] + dilation[i] x (kernel_size[i] - 1) + output_padding[i] + 1. - // When kernel_size[i] = 1, output_padding = 0, output[i] = (input[i] -1) * stride[i] - 2 x padding[i] + 1. - // implicit padding end = 2 x padding[i] = (input[i] -1) * stride[i] + 1 - output[i] - return (i - 1) * s + 1 - o;}; - for (size_t i = 0; i < spatialRank; i++) { - int64_t inputDim = static_cast(inputDims[i + 2]); - int64_t outputDim = static_cast(lastOutputSpatialDims[i]); - int64_t stride = static_cast(deconvAttrs.stride[i]); - if (calPaddingEnd(inputDim, outputDim, stride) > 0) { - return true; - } + if (is1x1 && std::all_of(deconvAttrs.paddingR.begin(), deconvAttrs.paddingR.end(), isZero) && + std::all_of(deconvAttrs.paddingL.begin(), deconvAttrs.paddingL.end(), isZero) && + std::all_of(deconvAttrs.outputPadding.begin(), deconvAttrs.outputPadding.end(), isZero)) { + auto calPaddingEnd = [](int64_t i, int64_t o, int64_t s) -> int64_t { + // Accoriding to https://pytorch.org/docs/stable/generated/torch.nn.ConvTranspose2d.html, + // output[i] = (input[i] -1) * stride[i] - 2 x padding[i] + dilation[i] x (kernel_size[i] - 1) + + // output_padding[i] + 1. When kernel_size[i] = 1, output_padding = 0, output[i] = (input[i] -1) * stride[i] + // - 2 x padding[i] + 1. implicit padding end = 2 x padding[i] = (input[i] -1) * stride[i] + 1 - output[i] + return (i - 1) * s + 1 - o; + }; + for (size_t i = 0; i < spatialRank; i++) { + int64_t inputDim = static_cast(inputDims[i + 2]); + int64_t outputDim = static_cast(lastOutputSpatialDims[i]); + int64_t stride = static_cast(deconvAttrs.stride[i]); + if (calPaddingEnd(inputDim, outputDim, stride) > 0) { + return true; } + } } return false; } @@ -855,8 +882,10 @@ void Deconvolution::prepareParams() { dstMemoryDescs.push_back(getChildEdgeAt(i)->getMemory().getDescWithType()); } - execPtrDeconvACL = selected_pd->getExecutorFactoryAs()->makeExecutor(deconvAttrs, srcMemoryDescs, - dstMemoryDescs, *attr); + execPtrDeconvACL = selected_pd->getExecutorFactoryAs()->makeExecutor(deconvAttrs, + srcMemoryDescs, + dstMemoryDescs, + *attr); selected_pd->setImplementationType(execPtrDeconvACL->getImplType()); return; } @@ -892,7 +921,7 @@ void Deconvolution::prepareParams() { OPENVINO_THROW("Bias memory memory is undefined."); biasDesc = biasMemPtr->getDescWithType(); } - bool is1x1PaddingAsymmetric = false; + bool is1x1PaddingAsymmetric = false; if (externOutShape && (!isConstOutShape || isDynamicNode())) { // Check implicit asymmetric padding case for dynamic case and runtime output shape. is1x1PaddingAsymmetric = isImplicit1x1PaddingAsymmetric(getSrcMemoryAtPort(0)->getShape().getStaticDims()); @@ -918,34 +947,41 @@ void Deconvolution::prepareParams() { dnnl::memory::desc dnnlBiasDesc; const auto& weiDims = key.inp1->getShape().getStaticDims(); const auto srcDataType = key.inp0->getDataType(); - const auto weiDataType = (one_of(srcDataType, memory::data_type::s8, memory::data_type::u8)) ? 
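// A worked instance of the calPaddingEnd check above, assuming kernel_size = 1 and
// output_padding = 0 as in the comment (the helper name and the numbers are illustrative):
#include <cstdint>
// With a 1x1 kernel, ConvTranspose gives output = (input - 1) * stride - 2 * padding + 1,
// so the total implicit end padding is (input - 1) * stride + 1 - output.
inline int64_t implicit_padding_end(int64_t input, int64_t output, int64_t stride) {
    return (input - 1) * stride + 1 - output;
}
// e.g. input = 5, stride = 2: with zero padding the deconvolution would produce
// (5 - 1) * 2 + 1 = 9 elements; if the requested output_shape is 8,
// implicit_padding_end(5, 8, 2) == 1 > 0, so the case is flagged as implicit asymmetric padding.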
- memory::data_type::s8 : srcDataType; + const auto weiDataType = + (one_of(srcDataType, memory::data_type::s8, memory::data_type::u8)) ? memory::data_type::s8 : srcDataType; auto wghDescAny = - dnnl::memory::desc(DnnlExtensionUtils::convertToDnnlDims(weiDims), - weiDataType, - memory::format_tag::any); + dnnl::memory::desc(DnnlExtensionUtils::convertToDnnlDims(weiDims), weiDataType, memory::format_tag::any); if (key.bias) dnnlBiasDesc = key.bias->getDnnlDesc(); - desc = createDescriptorInternal(key.inp0->getDnnlDesc(), wghDescAny, dnnlBiasDesc, key.out->getDnnlDesc(), - key.bias != nullptr, key.stride, key.dilation, key.paddingL, key.paddingR, key.attr, engine); + desc = createDescriptorInternal(key.inp0->getDnnlDesc(), + wghDescAny, + dnnlBiasDesc, + key.out->getDnnlDesc(), + key.bias != nullptr, + key.stride, + key.dilation, + key.paddingL, + key.paddingR, + key.attr, + engine); primitive_desc_iterator itpd = desc; executorPtr execPtr = nullptr; while (static_cast(itpd)) { impl_desc_type impl_type = parse_impl_name(itpd.impl_info_str()); - //Skip the brgemm implemenation for asymmetric padding case because of the accuracy issue. + // Skip the brgemm implemenation for asymmetric padding case because of the accuracy issue. if (key.isImplicit1x1PaddingAsymmetric && (impl_type & impl_desc_type::brgconv)) continue; if (impl_type == key.implType) { auto prim_desc = deconvolution_forward::primitive_desc(itpd.get()); execPtr = std::make_shared(prim_desc, - key.inp0->getDnnlDesc(), - key.inp1->getDnnlDesc(), - key.out->getDnnlDesc(), - engine, - key.constWeight); + key.inp0->getDnnlDesc(), + key.inp1->getDnnlDesc(), + key.out->getDnnlDesc(), + engine, + key.constWeight); break; } @@ -955,16 +991,27 @@ void Deconvolution::prepareParams() { } if (!execPtr) { - auto inDesc = dnnl::memory::desc(DnnlExtensionUtils::convertToDnnlDims(key.inp0->getShape().getStaticDims()), - key.inp0->getDataType(), - memory::format_tag::any); - auto outDesc = dnnl::memory::desc(DnnlExtensionUtils::convertToDnnlDims(key.out->getShape().getStaticDims()), - key.out->getDataType(), - memory::format_tag::any); + auto inDesc = + dnnl::memory::desc(DnnlExtensionUtils::convertToDnnlDims(key.inp0->getShape().getStaticDims()), + key.inp0->getDataType(), + memory::format_tag::any); + auto outDesc = + dnnl::memory::desc(DnnlExtensionUtils::convertToDnnlDims(key.out->getShape().getStaticDims()), + key.out->getDataType(), + memory::format_tag::any); dnnl::primitive_desc anyDeconvDesc; - anyDeconvDesc = createDescriptorInternal(inDesc, wghDescAny, dnnlBiasDesc, outDesc, key.bias != nullptr, - key.stride, key.dilation, key.paddingL, key.paddingR, key.attr, engine); + anyDeconvDesc = createDescriptorInternal(inDesc, + wghDescAny, + dnnlBiasDesc, + outDesc, + key.bias != nullptr, + key.stride, + key.dilation, + key.paddingL, + key.paddingR, + key.attr, + engine); if (anyDeconvDesc) { auto prim_desc = deconvolution_forward::primitive_desc(anyDeconvDesc.get()); execPtr = std::make_shared(prim_desc, @@ -984,13 +1031,12 @@ void Deconvolution::prepareParams() { auto cache = context->getParamsCache(); auto result = cache->getOrCreate(key, builder); - execPtr = result.first; if (!execPtr) OPENVINO_THROW("Primitive descriptor was not found for node ", getName(), "."); primArgs[DNNL_ARG_SRC] = srcMemPtr->getPrimitive(); - primArgs[DNNL_ARG_DST]= dstMemPtr->getPrimitive(); + primArgs[DNNL_ARG_DST] = dstMemPtr->getPrimitive(); if (weightIsConst) { // const weight preparation/reordering needs to be done once at next execution // when the input 
weight data is guaranteed to be ready (considering possible const-folding @@ -1018,8 +1064,8 @@ void Deconvolution::prepareParams() { #endif } -void Deconvolution::createDescriptor(const std::vector &inputDesc, - const std::vector &outputDesc) { +void Deconvolution::createDescriptor(const std::vector& inputDesc, + const std::vector& outputDesc) { auto inDesc = inputDesc[0]->isDefined() ? inputDesc[0] : inputDesc[0]->cloneWithNewDims(inShape.getStaticDims()); auto dnnlInDesc = MemoryDescUtils::convertToDnnlBlockedMemoryDesc(*inDesc); const auto& in_candidate = dnnlInDesc.getDnnlDesc(); @@ -1040,26 +1086,38 @@ void Deconvolution::createDescriptor(const std::vector &inputDesc AttrPtr attr = initPrimitiveAttr(); if (withBiases) { memory::data_type bdt = memory::data_type::f32; - bias_candidate = dnnl::memory::desc(DnnlExtensionUtils::convertToDnnlDims(expectedBiasDims), bdt, memory::format_tag::any); + bias_candidate = + dnnl::memory::desc(DnnlExtensionUtils::convertToDnnlDims(expectedBiasDims), bdt, memory::format_tag::any); } - dnnl::memory::desc wgh_candidate(DnnlExtensionUtils::convertToDnnlDims(dnnlCompatibleWeiDims), isInt8 ? memory::data_type::s8 : dnnlInDesc.getDataType(), - memory::format_tag::any); - descs.emplace_back(createDescriptorInternal(in_candidate, wgh_candidate, bias_candidate, - out_candidate, withBiases, deconvAttrs.stride, deconvAttrs.dilation, - deconvAttrs.paddingL, deconvAttrs.paddingR, *attr, getEngine())); + dnnl::memory::desc wgh_candidate(DnnlExtensionUtils::convertToDnnlDims(dnnlCompatibleWeiDims), + isInt8 ? memory::data_type::s8 : dnnlInDesc.getDataType(), + memory::format_tag::any); + descs.emplace_back(createDescriptorInternal(in_candidate, + wgh_candidate, + bias_candidate, + out_candidate, + withBiases, + deconvAttrs.stride, + deconvAttrs.dilation, + deconvAttrs.paddingL, + deconvAttrs.paddingR, + *attr, + getEngine())); } -std::shared_ptr Deconvolution::getSrcMemDesc(const dnnl::primitive_desc &prim_desc, size_t idx) const { +std::shared_ptr Deconvolution::getSrcMemDesc(const dnnl::primitive_desc& prim_desc, size_t idx) const { if (idx == 2 && !withBiases) { - //Expected dest shape; + // Expected dest shape; return std::make_shared(ov::element::i32, Shape(getInputShapeAtPort(2).getStaticDims())); } else if (idx > 0) { // weight and bias are exposed with the planar layout. 
// we need to store 'weight' input as edge, - // because at this moment we can't simple replace internal blob with input, since we need to save weight data as is, but with different order - return std::make_shared(getOriginalInputPrecisionAtPort(idx), Shape(getInputShapeAtPort(idx).getStaticDims())); + // because at this moment we can't simple replace internal blob with input, since we need to save weight data as + // is, but with different order + return std::make_shared(getOriginalInputPrecisionAtPort(idx), + Shape(getInputShapeAtPort(idx).getStaticDims())); } - //idx =0 case + // idx =0 case auto desc = prim_desc.src_desc(idx); if (getInputShapeAtPort(idx).isDynamic()) { return DnnlExtensionUtils::makeUndefinedDesc(desc, getInputShapeAtPort(idx)); @@ -1067,8 +1125,8 @@ std::shared_ptr Deconvolution::getSrcMemDesc(const dnnl::primitive_d return DnnlExtensionUtils::makeDescriptor(desc); } -std::shared_ptr Deconvolution::getDstMemDesc(const dnnl::primitive_desc &prim_desc, size_t idx) const { - auto desc = prim_desc.dst_desc(idx); +std::shared_ptr Deconvolution::getDstMemDesc(const dnnl::primitive_desc& prim_desc, size_t idx) const { + auto desc = prim_desc.dst_desc(idx); if (getOutputShapeAtPort(idx).isDynamic()) { return DnnlExtensionUtils::makeUndefinedDesc(desc, getOutputShapeAtPort(idx)); } @@ -1082,7 +1140,8 @@ ov::element::Type Deconvolution::getRuntimePrecision() const { for (size_t i = 0; i < std::min(getParentEdges().size(), inputsNumLimit); i++) { auto parentEdge = getParentEdgeAt(i); if (parentEdge && parentEdge->getStatus() == Edge::Status::Validated) { - inputPrecisions.emplace_back(DnnlExtensionUtils::DataTypeToElementType((parentEdge->getMemoryPtr()->getDataType()))); + inputPrecisions.emplace_back( + DnnlExtensionUtils::DataTypeToElementType((parentEdge->getMemoryPtr()->getDataType()))); } } @@ -1090,11 +1149,12 @@ ov::element::Type Deconvolution::getRuntimePrecision() const { } Deconvolution::DeconvDNNLExecutor::DeconvDNNLExecutor(const dnnl::deconvolution_forward::primitive_desc& pd, - const dnnl::memory::desc& inMemDesc, - const dnnl::memory::desc& weightMemDesc, - const dnnl::memory::desc& outMemDesc, - const dnnl::engine& engine, - bool constWeight) : DnnlExecutor(pd) { + const dnnl::memory::desc& inMemDesc, + const dnnl::memory::desc& weightMemDesc, + const dnnl::memory::desc& outMemDesc, + const dnnl::engine& engine, + bool constWeight) + : DnnlExecutor(pd) { if (inMemDesc != getDnnlSrcDesc()) { inputReorders.insert({DNNL_ARG_SRC, IntermReorder(inMemDesc, getDnnlSrcDesc(), engine)}); } @@ -1112,7 +1172,7 @@ std::vector Deconvolution::readOutputSpatialDims() const { if (getParentEdges().size() < 3) { OPENVINO_THROW("Can't get output spatial dims. Inputs number = ", getParentEdges().size()); } - const auto &shapeMemPtr = getSrcMemoryAtPort(2); + const auto& shapeMemPtr = getSrcMemoryAtPort(2); if (!shapeMemPtr || !shapeMemPtr->isDefined()) { OPENVINO_THROW("'output_shape' input memory is undefined."); } @@ -1120,20 +1180,20 @@ std::vector Deconvolution::readOutputSpatialDims() const { if (shapeMemPtr->getStaticDims()[0] != spDimsNum) { OPENVINO_THROW("Can't read output spatial dims, beause 'output_shape' input has incorrect number of elements"); } - const int32_t *outShapePtr = shapeMemPtr->getDataAs(); + const int32_t* outShapePtr = shapeMemPtr->getDataAs(); std::vector outSpDims(outShapePtr, outShapePtr + shapeMemPtr->getStaticDims()[0]); return outSpDims; } bool Deconvolution::canFuseBias() const { - //ONEDNN deconvolution_fwd_t primitive can support bias fusing. 
but has different implementations. - //For the brgdeconv implementation in the deconv list, bias is implemented via JIT kernel. - //For the fall back ref implementation entry(previous conv_backward_data), bias is implemented via reference post-ops. - //It is difficult to recognize whether the deconv will run with brg or fall back to backwards data implementation on the fusing - //transformation stage. In the end, all the deconv should run with brg implement. - //And in model zoo only limited deconv has bias or other post-ops in IR. - //Based on above, enable the bias fusing for all deconv implementations. - return (externOutShape ? getParentEdges().size() == 3 : getParentEdges().size() == 2); + // ONEDNN deconvolution_fwd_t primitive can support bias fusing. but has different implementations. + // For the brgdeconv implementation in the deconv list, bias is implemented via JIT kernel. + // For the fall back ref implementation entry(previous conv_backward_data), bias is implemented via reference + // post-ops. It is difficult to recognize whether the deconv will run with brg or fall back to backwards data + // implementation on the fusing transformation stage. In the end, all the deconv should run with brg implement. And + // in model zoo only limited deconv has bias or other post-ops in IR. Based on above, enable the bias fusing for all + // deconv implementations. + return (externOutShape ? getParentEdges().size() == 3 : getParentEdges().size() == 2); } void Deconvolution::initSupportedPrimitiveDescriptors() { @@ -1144,7 +1204,7 @@ void Deconvolution::initSupportedPrimitiveDescriptors() { VectorDims inDims, outDims; std::tie(inDims, outDims) = makeDummyInOutShape(); - auto tmpInShape = Shape(inDims); + auto tmpInShape = Shape(inDims); auto tmpOutShape = Shape(outDims); initPaddingR(tmpInShape, tmpOutShape); @@ -1155,18 +1215,19 @@ void Deconvolution::initSupportedPrimitiveDescriptors() { config.outConfs.resize(getOriginalOutputsNumber()); config.inConfs[0].setMemDesc( - creatorsMap.at(format)->createSharedDesc(getOriginalInputPrecisionAtPort(0), getInputShapeAtPort(0))); + creatorsMap.at(format)->createSharedDesc(getOriginalInputPrecisionAtPort(0), getInputShapeAtPort(0))); config.inConfs[1].setMemDesc( - creatorsMap.at(weights_format)->createSharedDesc(getOriginalInputPrecisionAtPort(0), getInputShapeAtPort(1))); + creatorsMap.at(weights_format) + ->createSharedDesc(getOriginalInputPrecisionAtPort(0), getInputShapeAtPort(1))); for (size_t i = 2; i < getParentEdges().size(); ++i) { config.inConfs[i].setMemDesc( - creatorsMap.at(format)->createSharedDesc(getOriginalInputPrecisionAtPort(0), getInputShapeAtPort(i))); + creatorsMap.at(format)->createSharedDesc(getOriginalInputPrecisionAtPort(0), getInputShapeAtPort(i))); } for (size_t i = 0; i < config.outConfs.size(); ++i) { config.outConfs[i].setMemDesc( - creatorsMap.at(format)->createSharedDesc(getOriginalOutputPrecisionAtPort(0), getOutputShapeAtPort(i))); + creatorsMap.at(format)->createSharedDesc(getOriginalOutputPrecisionAtPort(0), getOutputShapeAtPort(i))); } std::vector srcMemoryDescs; @@ -1180,8 +1241,11 @@ void Deconvolution::initSupportedPrimitiveDescriptors() { dstMemoryDescs.push_back(config.outConfs[i].getMemDesc()->clone()); } - auto factory = std::make_shared(deconvAttrs, srcMemoryDescs, dstMemoryDescs, - std::make_shared(context, getImplPriority())); + auto factory = + std::make_shared(deconvAttrs, + srcMemoryDescs, + dstMemoryDescs, + std::make_shared(context, getImplPriority())); 
supportedPrimitiveDescriptors.emplace_back(config, impl_desc_type::gemm_acl, factory); }; @@ -1189,7 +1253,6 @@ void Deconvolution::initSupportedPrimitiveDescriptors() { pushDesc(LayoutType::ncsp); } - -} // namespace node -} // namespace intel_cpu -} // namespace ov +} // namespace node +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/deconv.h b/src/plugins/intel_cpu/src/nodes/deconv.h index d94bcd8bcaca13..1c3e1fe8978918 100644 --- a/src/plugins/intel_cpu/src/nodes/deconv.h +++ b/src/plugins/intel_cpu/src/nodes/deconv.h @@ -29,27 +29,32 @@ class Deconvolution : public Node { return static_cast(getParentEdges().size()); } - std::shared_ptr getSrcMemDesc(const dnnl::primitive_desc &prim_desc, size_t idx) const override; - std::shared_ptr getDstMemDesc(const dnnl::primitive_desc &prim_desc, size_t idx) const override; + std::shared_ptr getSrcMemDesc(const dnnl::primitive_desc& prim_desc, size_t idx) const override; + std::shared_ptr getDstMemDesc(const dnnl::primitive_desc& prim_desc, size_t idx) const override; ov::element::Type getRuntimePrecision() const override; static bool isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept; bool canFuse(const NodePtr& node) const override; - const VectorDims& getWeightDims() const { return getInputShapeAtPort(1).getStaticDims(); } - const std::vector& getStride() const { return deconvAttrs.stride; } + const VectorDims& getWeightDims() const { + return getInputShapeAtPort(1).getStaticDims(); + } + const std::vector& getStride() const { + return deconvAttrs.stride; + } void prepareParams() override; void execute(dnnl::stream strm) override; - void executeDynamicImpl(dnnl::stream strm) override { execute(strm); } + void executeDynamicImpl(dnnl::stream strm) override { + execute(strm); + } bool needShapeInfer() const override; bool canFuseBias() const; bool canBeExecutedInInt8() const override; const std::vector& getDefaultImplPriority() override; - protected: AttrPtr initPrimitiveAttr() override; AttrPtr makePrimitiveAttr(const VectorDims& dims); @@ -60,13 +65,13 @@ class Deconvolution : public Node { using executorPtr = std::shared_ptr; executorPtr execPtr = nullptr; class DeconvDNNLExecutor : public DnnlExecutor { - public: - DeconvDNNLExecutor(const dnnl::deconvolution_forward::primitive_desc& pd, - const dnnl::memory::desc& inMemDesc, - const dnnl::memory::desc& weightMemDesc, - const dnnl::memory::desc& outMemDesc, - const dnnl::engine& engine, - bool constWeight); + public: + DeconvDNNLExecutor(const dnnl::deconvolution_forward::primitive_desc& pd, + const dnnl::memory::desc& inMemDesc, + const dnnl::memory::desc& weightMemDesc, + const dnnl::memory::desc& outMemDesc, + const dnnl::engine& engine, + bool constWeight); }; bool isImplicit1x1PaddingAsymmetric(const VectorDims& inputDims); @@ -79,8 +84,8 @@ class Deconvolution : public Node { size_t IC = 0; size_t OC = 0; std::vector lastOutputSpatialDims; - VectorDims dnnlCompatibleWeiDims {}; - VectorDims expectedBiasDims {}; + VectorDims dnnlCompatibleWeiDims{}; + VectorDims expectedBiasDims{}; bool useACL = false; DeconvAttrs deconvAttrs; @@ -93,9 +98,9 @@ class Deconvolution : public Node { MemoryPtr dnnlCompatibleWeights = nullptr; std::shared_ptr attr; - void setPostOps(dnnl::primitive_attr &attr, const VectorDims &dims); - VectorDims shapeInferInternal(const VectorDims &inDims, std::vector outSpDims) const; - void initPaddingR(const Shape &inShape, const Shape &outShape); + void setPostOps(dnnl::primitive_attr& attr, 
const VectorDims& dims); + VectorDims shapeInferInternal(const VectorDims& inDims, std::vector outSpDims) const; + void initPaddingR(const Shape& inShape, const Shape& outShape); std::vector readOutputSpatialDims() const; std::pair makeDummyInOutShape(); bool withBiases = false; @@ -110,6 +115,6 @@ class Deconvolution : public Node { bool isConstOutShape = false; }; -} // namespace node -} // namespace intel_cpu -} // namespace ov +} // namespace node +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/def_conv.cpp b/src/plugins/intel_cpu/src/nodes/def_conv.cpp index 0167a18673c444..7c5427d0def045 100644 --- a/src/plugins/intel_cpu/src/nodes/def_conv.cpp +++ b/src/plugins/intel_cpu/src/nodes/def_conv.cpp @@ -4,21 +4,20 @@ #include "def_conv.h" -#include +#include +#include +#include #include #include -#include -#include "openvino/core/parallel.hpp" -#include "memory_desc/dnnl_blocked_memory_desc.h" #include "common/primitive_hashing_utils.hpp" -#include "openvino/util/pp.hpp" - -#include "dnnl_types.h" -#include "dnnl_extension_utils.h" #include "cpu/x64/jit_generator.hpp" -#include +#include "dnnl_extension_utils.h" +#include "dnnl_types.h" +#include "memory_desc/dnnl_blocked_memory_desc.h" +#include "openvino/core/parallel.hpp" +#include "openvino/util/pp.hpp" using namespace dnnl; using namespace dnnl::impl; @@ -30,7 +29,7 @@ namespace ov { namespace intel_cpu { namespace node { #if defined(OPENVINO_ARCH_X86_64) -#define GET_OFF(field) offsetof(jit_def_conv_call_args, field) +# define GET_OFF(field) offsetof(jit_def_conv_call_args, field) template struct jit_uni_def_conv_kernel_f32 : public jit_uni_def_conv_kernel, public jit_generator { @@ -38,7 +37,9 @@ struct jit_uni_def_conv_kernel_f32 : public jit_uni_def_conv_kernel, public jit_ constexpr static int sampledPointsPerPixel = DeformableConvolution::sampledPointsPerPixel; - explicit jit_uni_def_conv_kernel_f32(const jit_def_conv_params& jcp) : jit_uni_def_conv_kernel(jcp), jit_generator(jit_name()) {} + explicit jit_uni_def_conv_kernel_f32(const jit_def_conv_params& jcp) + : jit_uni_def_conv_kernel(jcp), + jit_generator(jit_name()) {} void create_ker() override { jit_generator::create_kernel(); @@ -72,8 +73,8 @@ struct jit_uni_def_conv_kernel_f32 : public jit_uni_def_conv_kernel, public jit_ } private: - using Vmm = typename conditional3::type; + using Vmm = + typename conditional3::type; const int vlen = cpu_isa_traits::vlen; using Ymm = const Xbyak::Ymm; @@ -113,18 +114,29 @@ struct jit_uni_def_conv_kernel_f32 : public jit_uni_def_conv_kernel, public jit_ Xbyak::Opmask ktail_mask = Xbyak::Opmask(2); - inline Xbyak::Address table_val(int index) - { return ptr[reg_table + index * vlen]; } + inline Xbyak::Address table_val(int index) { + return ptr[reg_table + index * vlen]; + } - inline Vmm get_vmm_ker(int idx) { return Vmm(idx + 0); } - inline Vmm get_vmm_src(int idx) { return Vmm(idx + 1); } - inline Vmm get_vmm_acc(int idx) { return Vmm(idx + jcp_.ur_w + 1); } - inline Ymm get_ymm_acc(int idx) { return Ymm(idx + jcp_.ur_w + 1); } - inline Xmm get_xmm_acc(int idx) { return Xmm(idx + jcp_.ur_w + 1); } + inline Vmm get_vmm_ker(int idx) { + return Vmm(idx + 0); + } + inline Vmm get_vmm_src(int idx) { + return Vmm(idx + 1); + } + inline Vmm get_vmm_acc(int idx) { + return Vmm(idx + jcp_.ur_w + 1); + } + inline Ymm get_ymm_acc(int idx) { + return Ymm(idx + jcp_.ur_w + 1); + } + inline Xmm get_xmm_acc(int idx) { + return Xmm(idx + jcp_.ur_w + 1); + } Xbyak::Label l_table; - inline void 
checkZeroWei(const Xbyak::Xmm &x1, Label &nullifyLabel) { + inline void checkZeroWei(const Xbyak::Xmm& x1, Label& nullifyLabel) { ptest(x1, x1); jz(nullifyLabel); } @@ -135,13 +147,16 @@ struct jit_uni_def_conv_kernel_f32 : public jit_uni_def_conv_kernel, public jit_ mov(reg_ow_pos, 0); - L(ow_loop_main); { + L(ow_loop_main); + { cmp(reg_ow_pos, jcp_.ow - jcp_.ur_w); jg(ow_tail, T_NEAR); oc_loop(jcp_.ur_w); - add(reg_sampled_wei, jcp_.ur_w * jcp_.kh * jcp_.kw * sampledPointsPerPixel * jcp_.typesize_sampled_wei); // type = float - add(reg_sampled_offs, jcp_.ur_w * jcp_.kh * jcp_.kw * sampledPointsPerPixel * jcp_.typesize_sampled_offsets); // type = int + add(reg_sampled_wei, + jcp_.ur_w * jcp_.kh * jcp_.kw * sampledPointsPerPixel * jcp_.typesize_sampled_wei); // type = float + add(reg_sampled_offs, + jcp_.ur_w * jcp_.kh * jcp_.kw * sampledPointsPerPixel * jcp_.typesize_sampled_offsets); // type = int add(reg_output, jcp_.ur_w * jcp_.oc * jcp_.typesize_out); @@ -149,7 +164,8 @@ struct jit_uni_def_conv_kernel_f32 : public jit_uni_def_conv_kernel, public jit_ jmp(ow_loop_main, T_NEAR); } - L(ow_tail); { + L(ow_tail); + { if (jcp_.ow % jcp_.ur_w != 0) oc_loop(jcp_.ow % jcp_.ur_w); } @@ -191,7 +207,8 @@ struct jit_uni_def_conv_kernel_f32 : public jit_uni_def_conv_kernel, public jit_ for (int ic = 0; ic < ic_step; ic++) { for (int ow = 0; ow < ow_step; ow++) { Vmm vmm_src = get_vmm_src(ow); - size_t inp_off = (size_t) ow * jcp_.kh * jcp_.kw * jcp_.ic + kh * jcp_.kw * jcp_.ic + kw * jcp_.ic + ic; + size_t inp_off = + (size_t)ow * jcp_.kh * jcp_.kw * jcp_.ic + kh * jcp_.kw * jcp_.ic + kw * jcp_.ic + ic; uni_vbroadcastss(vmm_src, ptr[aux2_reg_input_buffer + inp_off * jcp_.typesize_in]); } @@ -199,10 +216,10 @@ struct jit_uni_def_conv_kernel_f32 : public jit_uni_def_conv_kernel, public jit_ for (int r = 0; r < repeats; r++) { for (int ocb = 0; ocb < oc_blocks_step; ocb++) { Vmm vmm_ker = get_vmm_ker(0); - size_t ker_off = (size_t) ocb * jcp_.nb_ic * jcp_.kh * jcp_.kw * jcp_.ic_block * jcp_.oc_block + - kh * jcp_.kw * jcp_.ic_block * jcp_.oc_block + - kw * jcp_.ic_block * jcp_.oc_block + - ic * jcp_.oc_block + r * jcp_.oc_block / 2; + size_t ker_off = + (size_t)ocb * jcp_.nb_ic * jcp_.kh * jcp_.kw * jcp_.ic_block * jcp_.oc_block + + kh * jcp_.kw * jcp_.ic_block * jcp_.oc_block + kw * jcp_.ic_block * jcp_.oc_block + + ic * jcp_.oc_block + r * jcp_.oc_block / 2; uni_vmovups(vmm_ker, ptr[aux2_reg_kernel + ker_off * jcp_.typesize_in]); for (int ow = 0; ow < ow_step; ow++) { @@ -248,7 +265,8 @@ struct jit_uni_def_conv_kernel_f32 : public jit_uni_def_conv_kernel, public jit_ init_accums(ow_step, oc_blocks_step, oc_step); - L(ic_main_loop); { + L(ic_main_loop); + { cmp(reg_ic_iter, jcp_.ic_block); jl(ic_tail, T_NEAR); @@ -259,7 +277,8 @@ struct jit_uni_def_conv_kernel_f32 : public jit_uni_def_conv_kernel, public jit_ jmp(ic_main_loop, T_NEAR); } - L(ic_tail); { + L(ic_tail); + { if (jcp_.ic % jcp_.ic_block != 0) { apply_filter(ow_step, oc_blocks_step, oc_step, jcp_.ic % jcp_.ic_block); } @@ -283,7 +302,8 @@ struct jit_uni_def_conv_kernel_f32 : public jit_uni_def_conv_kernel, public jit_ xor_(reg_dg_iter, reg_dg_iter); const int ic_per_def_group = jcp_.ic / jcp_.dg; - L(dg_loop); { + L(dg_loop); + { cmp(reg_dg_iter, jcp_.dg); jge(dg_loop_end, T_NEAR); @@ -326,7 +346,8 @@ struct jit_uni_def_conv_kernel_f32 : public jit_uni_def_conv_kernel, public jit_ Xmm xmm_w4 = Xmm(5); Xmm xmm_v1 = Xmm(2); - Xmm xmm_v2 = Xmm(3);; + Xmm xmm_v2 = Xmm(3); + ; Xmm xmm_v3 = Xmm(6); Xmm xmm_v4 = Xmm(7); @@ -341,7 +362,8 @@ struct 
jit_uni_def_conv_kernel_f32 : public jit_uni_def_conv_kernel, public jit_ Vmm vmm_v4 = Vmm(xmm_v4.getIdx()); // offsets computation - size_t ind_off_hh = sampledPointsPerPixel * (((size_t) kh * jcp_.kw + kw) + ow * (jcp_.kh * jcp_.kw)); + size_t ind_off_hh = + sampledPointsPerPixel * (((size_t)kh * jcp_.kw + kw) + ow * (jcp_.kh * jcp_.kw)); size_t ind_off_hl = ind_off_hh + 1; size_t ind_off_lh = ind_off_hl + 1; size_t ind_off_ll = ind_off_lh + 1; @@ -366,12 +388,16 @@ struct jit_uni_def_conv_kernel_f32 : public jit_uni_def_conv_kernel, public jit_ jl(ic_loop_tail, T_NEAR); // check zero markers - uni_vbroadcastss(xmm_v1, dword[aux_reg_sampled_wei + ind_off_ll * jcp_.typesize_sampled_wei]); - uni_vbroadcastss(xmm_v2, dword[aux_reg_sampled_wei + ind_off_hl * jcp_.typesize_sampled_wei]); - uni_vbroadcastss(xmm_v3, dword[aux_reg_sampled_wei + ind_off_lh * jcp_.typesize_sampled_wei]); - uni_vbroadcastss(xmm_v4, dword[aux_reg_sampled_wei + ind_off_hh * jcp_.typesize_sampled_wei]); + uni_vbroadcastss(xmm_v1, + dword[aux_reg_sampled_wei + ind_off_ll * jcp_.typesize_sampled_wei]); + uni_vbroadcastss(xmm_v2, + dword[aux_reg_sampled_wei + ind_off_hl * jcp_.typesize_sampled_wei]); + uni_vbroadcastss(xmm_v3, + dword[aux_reg_sampled_wei + ind_off_lh * jcp_.typesize_sampled_wei]); + uni_vbroadcastss(xmm_v4, + dword[aux_reg_sampled_wei + ind_off_hh * jcp_.typesize_sampled_wei]); - size_t input_buffer_off = (size_t) kh * jcp_.kw * jcp_.ic + kw * jcp_.ic; + size_t input_buffer_off = (size_t)kh * jcp_.kw * jcp_.ic + kw * jcp_.ic; uni_vpmovsxdq(xmm_v1_off, xmm_v1_off); uni_vmovq(reg_tmp_64, xmm_v1_off); @@ -382,9 +408,7 @@ struct jit_uni_def_conv_kernel_f32 : public jit_uni_def_conv_kernel, public jit_ uni_vmulps(vmm_v1, vmm_v1, vmm_w1); jmp(nullify_v1_end, T_NEAR); L(nullify_v1); - { - uni_vpxor(vmm_v1, vmm_v1, vmm_v1); - } + { uni_vpxor(vmm_v1, vmm_v1, vmm_v1); } L(nullify_v1_end); uni_vpmovsxdq(xmm_v2_off, xmm_v2_off); @@ -396,9 +420,7 @@ struct jit_uni_def_conv_kernel_f32 : public jit_uni_def_conv_kernel, public jit_ uni_vmulps(vmm_v2, vmm_v2, vmm_w2); jmp(nullify_v2_end, T_NEAR); L(nullify_v2); - { - uni_vpxor(vmm_v2, vmm_v2, vmm_v2); - } + { uni_vpxor(vmm_v2, vmm_v2, vmm_v2); } L(nullify_v2_end); uni_vpmovsxdq(xmm_v3_off, xmm_v3_off); @@ -410,9 +432,7 @@ struct jit_uni_def_conv_kernel_f32 : public jit_uni_def_conv_kernel, public jit_ uni_vmulps(vmm_v3, vmm_v3, vmm_w3); jmp(nullify_v3_end, T_NEAR); L(nullify_v3); - { - uni_vpxor(vmm_v3, vmm_v3, vmm_v3); - } + { uni_vpxor(vmm_v3, vmm_v3, vmm_v3); } L(nullify_v3_end); uni_vpmovsxdq(xmm_v4_off, xmm_v4_off); @@ -424,9 +444,7 @@ struct jit_uni_def_conv_kernel_f32 : public jit_uni_def_conv_kernel, public jit_ uni_vmulps(vmm_v4, vmm_v4, vmm_w4); jmp(nullify_v4_end, T_NEAR); L(nullify_v4); - { - uni_vpxor(vmm_v4, vmm_v4, vmm_v4); - } + { uni_vpxor(vmm_v4, vmm_v4, vmm_v4); } L(nullify_v4_end); uni_vaddps(vmm_v1, vmm_v1, vmm_v2); @@ -446,12 +464,16 @@ struct jit_uni_def_conv_kernel_f32 : public jit_uni_def_conv_kernel, public jit_ jl(loop_end, T_NEAR); // check zero markers - uni_vbroadcastss(xmm_v1, dword[aux_reg_sampled_wei + ind_off_ll * jcp_.typesize_sampled_wei]); - uni_vbroadcastss(xmm_v2, dword[aux_reg_sampled_wei + ind_off_hl * jcp_.typesize_sampled_wei]); - uni_vbroadcastss(xmm_v3, dword[aux_reg_sampled_wei + ind_off_lh * jcp_.typesize_sampled_wei]); - uni_vbroadcastss(xmm_v4, dword[aux_reg_sampled_wei + ind_off_hh * jcp_.typesize_sampled_wei]); - - size_t input_buffer_off = (size_t) kh * jcp_.kw * jcp_.ic + kw * jcp_.ic; + uni_vbroadcastss(xmm_v1, + 
dword[aux_reg_sampled_wei + ind_off_ll * jcp_.typesize_sampled_wei]); + uni_vbroadcastss(xmm_v2, + dword[aux_reg_sampled_wei + ind_off_hl * jcp_.typesize_sampled_wei]); + uni_vbroadcastss(xmm_v3, + dword[aux_reg_sampled_wei + ind_off_lh * jcp_.typesize_sampled_wei]); + uni_vbroadcastss(xmm_v4, + dword[aux_reg_sampled_wei + ind_off_hh * jcp_.typesize_sampled_wei]); + + size_t input_buffer_off = (size_t)kh * jcp_.kw * jcp_.ic + kw * jcp_.ic; uni_vpmovsxdq(xmm_v1_off, xmm_v1_off); uni_vmovq(reg_tmp_64, xmm_v1_off); imul(reg_tmp_64, reg_tmp_64, jcp_.ic * jcp_.typesize_in); @@ -461,9 +483,7 @@ struct jit_uni_def_conv_kernel_f32 : public jit_uni_def_conv_kernel, public jit_ uni_vmulss(xmm_v1, xmm_v1, xmm_w1); jmp(nullify_v1_end_tail, T_NEAR); L(nullify_v1_tail); - { - uni_vpxor(xmm_v1, xmm_v1, xmm_v1); - } + { uni_vpxor(xmm_v1, xmm_v1, xmm_v1); } L(nullify_v1_end_tail); uni_vpmovsxdq(xmm_v2_off, xmm_v2_off); @@ -475,9 +495,7 @@ struct jit_uni_def_conv_kernel_f32 : public jit_uni_def_conv_kernel, public jit_ uni_vmulss(xmm_v2, xmm_v2, xmm_w2); jmp(nullify_v2_end_tail, T_NEAR); L(nullify_v2_tail); - { - uni_vpxor(xmm_v2, xmm_v2, xmm_v2); - } + { uni_vpxor(xmm_v2, xmm_v2, xmm_v2); } L(nullify_v2_end_tail); uni_vpmovsxdq(xmm_v3_off, xmm_v3_off); @@ -489,9 +507,7 @@ struct jit_uni_def_conv_kernel_f32 : public jit_uni_def_conv_kernel, public jit_ uni_vmulss(xmm_v3, xmm_v3, xmm_w3); jmp(nullify_v3_end_tail, T_NEAR); L(nullify_v3_tail); - { - uni_vpxor(xmm_v3, xmm_v3, xmm_v3); - } + { uni_vpxor(xmm_v3, xmm_v3, xmm_v3); } L(nullify_v3_end_tail); uni_vpmovsxdq(xmm_v4_off, xmm_v4_off); @@ -503,9 +519,7 @@ struct jit_uni_def_conv_kernel_f32 : public jit_uni_def_conv_kernel, public jit_ uni_vmulss(xmm_v4, xmm_v4, xmm_w4); jmp(nullify_v4_end_tail, T_NEAR); L(nullify_v4_tail); - { - uni_vpxor(xmm_v4, xmm_v4, xmm_v4); - } + { uni_vpxor(xmm_v4, xmm_v4, xmm_v4); } L(nullify_v4_end_tail); uni_vaddss(xmm_v1, xmm_v1, xmm_v2); @@ -524,8 +538,10 @@ struct jit_uni_def_conv_kernel_f32 : public jit_uni_def_conv_kernel, public jit_ } } - add(aux_reg_sampled_wei, sampledPointsPerPixel * jcp_.kh * jcp_.kw * jcp_.oh * jcp_.ow * jcp_.typesize_sampled_wei); - add(aux_reg_sampled_offs, sampledPointsPerPixel * jcp_.kh * jcp_.kw * jcp_.oh * jcp_.ow * jcp_.typesize_sampled_offsets); + add(aux_reg_sampled_wei, + sampledPointsPerPixel * jcp_.kh * jcp_.kw * jcp_.oh * jcp_.ow * jcp_.typesize_sampled_wei); + add(aux_reg_sampled_offs, + sampledPointsPerPixel * jcp_.kh * jcp_.kw * jcp_.oh * jcp_.ow * jcp_.typesize_sampled_offsets); add(aux_reg_input, ic_per_def_group * jcp_.typesize_in); add(aux2_reg_input_buffer, ic_per_def_group * jcp_.typesize_in); inc(reg_dg_iter); @@ -542,7 +558,7 @@ struct jit_uni_def_conv_kernel_f32 : public jit_uni_def_conv_kernel, public jit_ if (jcp_.with_bias) { for (int r = 0; r < repeats; r++) { for (int ocb = 0; ocb < oc_blocks_step; ocb++) { - size_t bias_off = (size_t) ocb * jcp_.oc_block + r * jcp_.oc_block / 2; + size_t bias_off = (size_t)ocb * jcp_.oc_block + r * jcp_.oc_block / 2; uni_vmovups(Vmm(0), ptr[aux_reg_bias + bias_off * jcp_.typesize_bia]); for (int ow = 0; ow < ow_step; ow++) { @@ -560,7 +576,8 @@ struct jit_uni_def_conv_kernel_f32 : public jit_uni_def_conv_kernel, public jit_ } for (int r = 0; r < repeats; r++) { - int tail_size = isa == cpu::x64::sse41 ? std::min(jcp_.oc_block / 2, oc_step - r * jcp_.oc_block / 2) : oc_step; + int tail_size = + isa == cpu::x64::sse41 ? std::min(jcp_.oc_block / 2, oc_step - r * jcp_.oc_block / 2) : oc_step; bool is_scalar_store = isa == cpu::x64::sse41 ? 
tail_size < jcp_.oc_block / 2 : tail_size < jcp_.oc_block; if (is_scalar_store) { for (int ow = 0; ow < ow_step; ow++) { @@ -568,11 +585,11 @@ struct jit_uni_def_conv_kernel_f32 : public jit_uni_def_conv_kernel, public jit_ Xmm xmm_dst = get_xmm_acc(r * jcp_.ur_w * jcp_.nb_oc_blocking + ow); if (isa == avx512_core) { - size_t out_off = (size_t) ow * jcp_.oc; + size_t out_off = (size_t)ow * jcp_.oc; uni_vmovups(ptr[aux_reg_output + out_off * jcp_.typesize_out], vmm_dst | ktail_mask); } else { for (int oc = 0; oc < tail_size; oc++) { - size_t out_off = (size_t) ow * jcp_.oc + oc + r * (jcp_.oc_block / 2); + size_t out_off = (size_t)ow * jcp_.oc + oc + r * (jcp_.oc_block / 2); uni_vmovq(reg_tmp_64, xmm_dst); mov(ptr[aux_reg_output + out_off * jcp_.typesize_out], reg_tmp_32); @@ -593,7 +610,8 @@ struct jit_uni_def_conv_kernel_f32 : public jit_uni_def_conv_kernel, public jit_ for (int ocb = 0; ocb < oc_blocks_step; ocb++) { for (int ow = 0; ow < ow_step; ow++) { Vmm vmm_acc = get_vmm_acc(r * jcp_.ur_w * jcp_.nb_oc_blocking + ocb * ow_step + ow); - size_t out_off = (size_t) ow * jcp_.oc * jcp_.ngroups + ocb * jcp_.oc_block + r * (jcp_.oc_block / 2); + size_t out_off = + (size_t)ow * jcp_.oc * jcp_.ngroups + ocb * jcp_.oc_block + r * (jcp_.oc_block / 2); uni_vmovups(ptr[aux_reg_output + out_off * jcp_.typesize_out], vmm_acc); } } @@ -629,14 +647,17 @@ struct jit_uni_def_conv_kernel_f32 : public jit_uni_def_conv_kernel, public jit_ mov(aux_reg_bias, reg_bias); mov(reg_oc_work, jcp_.oc); - L(oc_unrolled_loop); { + L(oc_unrolled_loop); + { cmp(reg_oc_work, jcp_.nb_oc_blocking * jcp_.oc_block); jl(oc_main_loop, T_NEAR); ic_loop(ow_step, jcp_.nb_oc_blocking, jcp_.oc_block); store_output(ow_step, jcp_.nb_oc_blocking, jcp_.oc_block); - add(aux_reg_kernel, jcp_.nb_oc_blocking * jcp_.nb_ic * jcp_.kh * jcp_.kw * jcp_.ic_block * jcp_.oc_block * jcp_.typesize_in); + add(aux_reg_kernel, + jcp_.nb_oc_blocking * jcp_.nb_ic * jcp_.kh * jcp_.kw * jcp_.ic_block * jcp_.oc_block * + jcp_.typesize_in); add(aux_reg_output, jcp_.nb_oc_blocking * jcp_.oc_block * jcp_.typesize_out); add(aux_reg_bias, jcp_.nb_oc_blocking * jcp_.oc_block * jcp_.typesize_bia); sub(reg_oc_work, jcp_.nb_oc_blocking * jcp_.oc_block); @@ -644,7 +665,8 @@ struct jit_uni_def_conv_kernel_f32 : public jit_uni_def_conv_kernel, public jit_ jmp(oc_unrolled_loop, T_NEAR); } - L(oc_main_loop); { + L(oc_main_loop); + { cmp(reg_oc_work, jcp_.oc_block); jl(oc_tail, T_NEAR); @@ -659,7 +681,8 @@ struct jit_uni_def_conv_kernel_f32 : public jit_uni_def_conv_kernel, public jit_ jmp(oc_main_loop, T_NEAR); } - L(oc_tail); { + L(oc_tail); + { if (jcp_.oc % jcp_.oc_block != 0) { ic_loop(ow_step, 1, jcp_.oc % jcp_.oc_block); store_output(ow_step, 1, jcp_.oc % jcp_.oc_block); @@ -672,11 +695,12 @@ struct jit_uni_def_conv_kernel_f32 : public jit_uni_def_conv_kernel, public jit_ } }; #endif -bool DeformableConvolution::isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept { +bool DeformableConvolution::isSupportedOperation(const std::shared_ptr& op, + std::string& errorMessage) noexcept { try { if (!one_of(op->get_type_info(), - ov::op::v1::DeformableConvolution::get_type_info_static(), - ov::op::v8::DeformableConvolution::get_type_info_static())) { + ov::op::v1::DeformableConvolution::get_type_info_static(), + ov::op::v8::DeformableConvolution::get_type_info_static())) { errorMessage = "Node is not an instance of DeformableConvolution form the operation set v1 or v8."; return false; } @@ -721,16 +745,16 @@ size_t DefConvKey::hash() const 
{ return seed; } -bool DefConvKey::operator==(const DefConvKey &rhs) const { +bool DefConvKey::operator==(const DefConvKey& rhs) const { bool retVal = true; for (size_t i = 0; i < descVector.size(); i++) { if (descVector[i] != rhs.descVector[i]) { retVal = retVal && descVector[i] && rhs.descVector[i] && - descVector[i]->getBlockDims() == rhs.descVector[i]->getBlockDims() && - descVector[i]->getStrides() == rhs.descVector[i]->getStrides() && - descVector[i]->getOrder() == rhs.descVector[i]->getOrder() && - descVector[i]->getOffsetPaddingToData() == rhs.descVector[i]->getOffsetPaddingToData() && - descVector[i]->getOffsetPadding() == rhs.descVector[i]->getOffsetPadding(); + descVector[i]->getBlockDims() == rhs.descVector[i]->getBlockDims() && + descVector[i]->getStrides() == rhs.descVector[i]->getStrides() && + descVector[i]->getOrder() == rhs.descVector[i]->getOrder() && + descVector[i]->getOffsetPaddingToData() == rhs.descVector[i]->getOffsetPaddingToData() && + descVector[i]->getOffsetPadding() == rhs.descVector[i]->getOffsetPadding(); } } @@ -742,7 +766,7 @@ bool DefConvKey::operator==(const DefConvKey &rhs) const { return retVal; } -} // namespace +} // namespace DeformableConvolution::DeformableConvolution(const std::shared_ptr& op, const GraphContext::CPtr context) : Node(op, context, NgraphShapeInferFactory(op)) { @@ -825,13 +849,14 @@ void DeformableConvolution::initSupportedPrimitiveDescriptors() { impl_desc_type impl_type; const int simd_w = mayiuse(cpu::x64::avx512_core) ? 16 : 8; - auto &weiDims = getInputShapeAtPort(WEI_ID).getDims(); + auto& weiDims = getInputShapeAtPort(WEI_ID).getDims(); if (weiDims[1] == Shape::UNDEFINED_DIM || weiDims[0] == Shape::UNDEFINED_DIM || // 1. strict fallback, until devising of multigroup handling in common case defConvAttr.group != 1 || // 2. common fallback, except specific n_group / n_channel combinations - (defConvAttr.group != 1 && ((weiDims[1] % simd_w != 0) // in_channels_per_gr !% simd_w - || ((weiDims[0] / defConvAttr.group) % simd_w != 0)))) { // out_channels_per_gr !% simd_w + (defConvAttr.group != 1 && + ((weiDims[1] % simd_w != 0) // in_channels_per_gr !% simd_w + || ((weiDims[0] / defConvAttr.group) % simd_w != 0)))) { // out_channels_per_gr !% simd_w enforceRef = true; } else { enforceRef = false; @@ -854,41 +879,48 @@ void DeformableConvolution::initSupportedPrimitiveDescriptors() { auto dataFormat = memory::format_tag::nhwc; auto offFormat = memory::format_tag::nchw; auto weiFormat = mayiuse(avx512_core) ? 
memory::format_tag::OIhw16i16o : memory::format_tag::OIhw8i8o; - config.inConfs[DATA_ID].setMemDesc(std::make_shared(getInputShapeAtPort(DATA_ID), - memory::data_type::f32, dataFormat)); - config.inConfs[OFF_ID].setMemDesc(std::make_shared(getInputShapeAtPort(OFF_ID), - memory::data_type::f32, offFormat)); + config.inConfs[DATA_ID].setMemDesc( + std::make_shared(getInputShapeAtPort(DATA_ID), memory::data_type::f32, dataFormat)); + config.inConfs[OFF_ID].setMemDesc( + std::make_shared(getInputShapeAtPort(OFF_ID), memory::data_type::f32, offFormat)); - config.inConfs[WEI_ID].setMemDesc(std::make_shared(getInputShapeAtPort(WEI_ID), - memory::data_type::f32, weiFormat)); + config.inConfs[WEI_ID].setMemDesc( + std::make_shared(getInputShapeAtPort(WEI_ID), memory::data_type::f32, weiFormat)); if (inputsNumber > 3) { config.inConfs[MOD_ID].setMemDesc(std::make_shared(getInputShapeAtPort(MOD_ID), - memory::data_type::f32, memory::format_tag::nchw)); + memory::data_type::f32, + memory::format_tag::nchw)); } - config.outConfs[0].setMemDesc(std::make_shared(getOutputShapeAtPort(DATA_ID), - memory::data_type::f32, dataFormat)); + config.outConfs[0].setMemDesc( + std::make_shared(getOutputShapeAtPort(DATA_ID), memory::data_type::f32, dataFormat)); supportedPrimitiveDescriptors.push_back({config, impl_type}); } else { // reference implementation - config.inConfs[DATA_ID].setMemDesc(std::make_shared(getInputShapeAtPort(DATA_ID), memory::data_type::f32, + config.inConfs[DATA_ID].setMemDesc(std::make_shared(getInputShapeAtPort(DATA_ID), + memory::data_type::f32, memory::format_tag::nchw)); - config.inConfs[OFF_ID].setMemDesc(std::make_shared(getInputShapeAtPort(OFF_ID), memory::data_type::f32, + config.inConfs[OFF_ID].setMemDesc(std::make_shared(getInputShapeAtPort(OFF_ID), + memory::data_type::f32, memory::format_tag::nchw)); - config.inConfs[WEI_ID].setMemDesc(std::make_shared(getInputShapeAtPort(WEI_ID), memory::data_type::f32, + config.inConfs[WEI_ID].setMemDesc(std::make_shared(getInputShapeAtPort(WEI_ID), + memory::data_type::f32, memory::format_tag::oihw)); if (inputsNumber > 3) { - config.inConfs[MOD_ID].setMemDesc(std::make_shared(getInputShapeAtPort(MOD_ID), memory::data_type::f32, + config.inConfs[MOD_ID].setMemDesc(std::make_shared(getInputShapeAtPort(MOD_ID), + memory::data_type::f32, memory::format_tag::nchw)); } - config.outConfs[0].setMemDesc(std::make_shared(getOutputShapeAtPort(DATA_ID), memory::data_type::f32, + config.outConfs[0].setMemDesc(std::make_shared(getOutputShapeAtPort(DATA_ID), + memory::data_type::f32, memory::format_tag::nchw)); supportedPrimitiveDescriptors.push_back({config, impl_type}); } } -void DeformableConvolution::DefConvExecutor::prepareSamplingWeights( - const float* offsets, const float* modulation, bool enforceRef) { +void DeformableConvolution::DefConvExecutor::prepareSamplingWeights(const float* offsets, + const float* modulation, + bool enforceRef) { const int MB = jcp.mb; const int OH = jcp.oh; const int OW = jcp.ow; @@ -918,45 +950,45 @@ void DeformableConvolution::DefConvExecutor::prepareSamplingWeights( const int h_in = oh * KSH - padT; const int w_in = ow * KSW - padL; - const float *data_offset_ptr = offsets + mb * offStrides[0] + (dg * 2 * KH * KW) * offStrides[1]; - const float *modulation_offset_ptr = nullptr; + const float* data_offset_ptr = offsets + mb * offStrides[0] + (dg * 2 * KH * KW) * offStrides[1]; + const float* modulation_offset_ptr = nullptr; if (modulation != nullptr) { modulation_offset_ptr = modulation + mb * modStrides[0] + (dg * 
ker_size) * modStrides[1]; } for (int kh = 0; kh < KH; kh++) { for (int kw = 0; kw < KW; kw++) { - const size_t data_offset_h_index = 2 * ((size_t) kh * KW + kw) * offStrides[1] + oh * offStrides[2] + ow * offStrides[3]; - const size_t data_offset_w_index = (2 * ((size_t) kh * KW + kw) + 1) * offStrides[1] + oh * offStrides[2] + ow * offStrides[3]; + const size_t data_offset_h_index = + 2 * ((size_t)kh * KW + kw) * offStrides[1] + oh * offStrides[2] + ow * offStrides[3]; + const size_t data_offset_w_index = + (2 * ((size_t)kh * KW + kw) + 1) * offStrides[1] + oh * offStrides[2] + ow * offStrides[3]; const float offset_h = data_offset_ptr[data_offset_h_index]; const float offset_w = data_offset_ptr[data_offset_w_index]; float map_h = h_in + kh * (KDH + 1) + offset_h; float map_w = w_in + kw * (KDW + 1) + offset_w; bool skip_compute; if (with_bi_pad) { - skip_compute = !(static_cast(map_w) > -1 && - static_cast(map_w) < IW && - static_cast(map_h) > -1 && - static_cast(map_h) < IH); + skip_compute = !(static_cast(map_w) > -1 && static_cast(map_w) < IW && + static_cast(map_h) > -1 && static_cast(map_h) < IH); } else { - skip_compute = !(map_w >= 0 && map_w < IW && - map_h >= 0 && map_h < IH); + skip_compute = !(map_w >= 0 && map_w < IW && map_h >= 0 && map_h < IH); } if (!skip_compute) { // modulations precomp. float modulation_scalar = 1.0f; if (modulation_offset_ptr != nullptr) { - size_t modulation_index = (kh * KW + kw) * modStrides[1] + oh * modStrides[2] + ow * modStrides[3]; + size_t modulation_index = + (kh * KW + kw) * modStrides[1] + oh * modStrides[2] + ow * modStrides[3]; modulation_scalar = modulation_offset_ptr[modulation_index]; } // interpolation precomp. const int cur_h_end = IH; const int cur_w_end = IW; - int h_low = with_bi_pad ? static_cast(floorf(map_h)) : - std::max(static_cast(floorf(map_h)), 0); - int w_low = with_bi_pad ? static_cast(floorf(map_w)) : - std::max(static_cast(floorf(map_w)), 0); + int h_low = + with_bi_pad ? static_cast(floorf(map_h)) : std::max(static_cast(floorf(map_h)), 0); + int w_low = + with_bi_pad ? static_cast(floorf(map_w)) : std::max(static_cast(floorf(map_w)), 0); int h_high = with_bi_pad ? h_low + 1 : std::min(static_cast(ceilf(map_h)), cur_h_end - 1); int w_high = with_bi_pad ? 
w_low + 1 : std::min(static_cast(ceilf(map_w)), cur_w_end - 1); @@ -976,7 +1008,7 @@ void DeformableConvolution::DefConvExecutor::prepareSamplingWeights( const int h_off_low = h_ind_low * (srcStrides[2] / srcStrides[3]); const int h_off_high = h_ind_high * (srcStrides[2] / srcStrides[3]); - const int w_off_low = w_ind_low; + const int w_off_low = w_ind_low; const int w_off_high = w_ind_high; pSampledCoordsVector[sampledCoordIndex] = h_off_high + w_off_high; pSampledCoordsVector[sampledCoordIndex + 1] = h_off_high + w_off_low; @@ -984,7 +1016,7 @@ void DeformableConvolution::DefConvExecutor::prepareSamplingWeights( pSampledCoordsVector[sampledCoordIndex + 3] = h_off_low + w_off_low; float w22 = hh * hw * modulation_scalar, w21 = hh * lw * modulation_scalar, - w12 = lh * hw * modulation_scalar, w11 = lh * lw * modulation_scalar; + w12 = lh * hw * modulation_scalar, w11 = lh * lw * modulation_scalar; pInterpWeightsVector[sampledCoordIndex] = w11; pInterpWeightsVector[sampledCoordIndex + 1] = w12; @@ -1007,15 +1039,16 @@ void DeformableConvolution::DefConvExecutor::prepareSamplingWeights( }); } -DeformableConvolution::DefConvExecutor::DefConvExecutor(const DefConvAttr &defConvAttr, - const std::vector> &descVector) { +DeformableConvolution::DefConvExecutor::DefConvExecutor( + const DefConvAttr& defConvAttr, + const std::vector>& descVector) { if (descVector.size() != 4 && descVector.size() != 5) { OPENVINO_THROW("Deformable Convolution executor got incorrect desc's count (", descVector.size(), ")"); } bool withModulation = descVector.size() == 5; - auto &srcDesc = descVector[DATA_ID]; - auto &dstDesc = descVector[descVector.size() - 1]; + auto& srcDesc = descVector[DATA_ID]; + auto& dstDesc = descVector[descVector.size() - 1]; srcStrides = std::vector(srcDesc->getStrides().size()); offStrides = descVector[OFF_ID]->getStrides(); weiStrides = descVector[WEI_ID]->getStrides(); @@ -1085,9 +1118,10 @@ DeformableConvolution::DefConvExecutor::DefConvExecutor(const DefConvAttr &defCo jcp.nthr = dnnl_get_max_threads(); } -DeformableConvolution::DefConvJitExecutor::DefConvJitExecutor(const DefConvAttr &defConvAttr, - const std::vector> &descVector) : - DefConvExecutor(defConvAttr, descVector) { +DeformableConvolution::DefConvJitExecutor::DefConvJitExecutor( + const DefConvAttr& defConvAttr, + const std::vector>& descVector) + : DefConvExecutor(defConvAttr, descVector) { #if defined(OPENVINO_ARCH_X86_64) if (mayiuse(cpu::x64::avx512_core)) { def_conv_kernel.reset(new jit_uni_def_conv_kernel_f32(jcp)); @@ -1106,9 +1140,13 @@ DeformableConvolution::DefConvJitExecutor::DefConvJitExecutor(const DefConvAttr #endif } -void DeformableConvolution::DefConvRefExecutor::exec(const float* src, const float* offsets, - const float* weights, const float* modulation, float* dst, - int *pSampledCoordsVector, float *pInterpWeightsVector) { +void DeformableConvolution::DefConvRefExecutor::exec(const float* src, + const float* offsets, + const float* weights, + const float* modulation, + float* dst, + int* pSampledCoordsVector, + float* pInterpWeightsVector) { this->pSampledCoordsVector = pSampledCoordsVector; this->pInterpWeightsVector = pInterpWeightsVector; prepareSamplingWeights(offsets, modulation, true); @@ -1133,17 +1171,18 @@ void DeformableConvolution::DefConvRefExecutor::exec(const float* src, const flo auto compKer = [OV_CAPTURE_CPY_AND_THIS](int g, int mb, int oc, int oh, int ow) { float d = 0; for (int ic = 0; ic < IC; ic++) { - const float *data_im_ptr = src + mb * srcStrides[0] + (g * IC + ic) * 
srcStrides[1]; + const float* data_im_ptr = src + mb * srcStrides[0] + (g * IC + ic) * srcStrides[1]; const int deformable_group_index = (IC * g + ic) / channel_per_deformable_group; - int sampledCoordIndex = (mb * DGHW + deformable_group_index * HW + oh * OW + ow) * ker_size * sampledPointsPerPixel; - size_t weiIndex = (size_t) g * group_wei_stride + oc * weiStrides[0] + ic * weiStrides[1]; + int sampledCoordIndex = + (mb * DGHW + deformable_group_index * HW + oh * OW + ow) * ker_size * sampledPointsPerPixel; + size_t weiIndex = (size_t)g * group_wei_stride + oc * weiStrides[0] + ic * weiStrides[1]; for (size_t kh_off = 0; kh_off < KH * weiStrides[2]; kh_off += weiStrides[2]) { for (size_t kw_off = 0; kw_off < KW * weiStrides[3]; kw_off += weiStrides[3]) { // check if current addendum marked as equal zero if (pSampledCoordsVector[sampledCoordIndex] != -1) { const int v11 = pSampledCoordsVector[sampledCoordIndex]; const int v12 = pSampledCoordsVector[sampledCoordIndex + 1]; - const int v21 = pSampledCoordsVector[sampledCoordIndex + 2]; + const int v21 = pSampledCoordsVector[sampledCoordIndex + 2]; const int v22 = pSampledCoordsVector[sampledCoordIndex + 3]; float val = 0; @@ -1174,8 +1213,9 @@ void DeformableConvolution::DefConvRefExecutor::exec(const float* src, const flo }; parallel_nd(G, MB, OC, OH, OW, [&](dnnl_dim_t g, dnnl_dim_t mb, dnnl_dim_t oc, dnnl_dim_t oh, dnnl_dim_t ow) { - dst[mb * dstStrides[0] + (g * OC + oc) * dstStrides[1] + oh * dstStrides[2] + ow * dstStrides[3]] = compKer(g, mb, oc, oh, ow); - }); + dst[mb * dstStrides[0] + (g * OC + oc) * dstStrides[1] + oh * dstStrides[2] + ow * dstStrides[3]] = + compKer(g, mb, oc, oh, ow); + }); } void DeformableConvolution::prepareParams() { @@ -1208,22 +1248,17 @@ void DeformableConvolution::prepareParams() { updatePadding(); - std::vector> descVector { + std::vector> descVector{ getParentEdgeAt(DATA_ID)->getMemory().getDescWithType(), getParentEdgeAt(OFF_ID)->getMemory().getDescWithType(), - getParentEdgeAt(WEI_ID)->getMemory().getDescWithType() - }; + getParentEdgeAt(WEI_ID)->getMemory().getDescWithType()}; if (withModulation) { descVector.push_back(getParentEdgeAt(MOD_ID)->getMemory().getDescWithType()); } descVector.push_back(getChildEdgeAt(0)->getMemory().getDescWithType()); - DefConvKey key = { - descVector, - defConvAttr, - getSelectedPrimitiveDescriptor()->getImplementationType() - }; + DefConvKey key = {descVector, defConvAttr, getSelectedPrimitiveDescriptor()->getImplementationType()}; const int MB = getParentEdgeAt(DATA_ID)->getMemory().getStaticDims()[0]; const int OH = getChildEdgeAt(0)->getMemory().getStaticDims()[2]; @@ -1241,7 +1276,7 @@ void DeformableConvolution::prepareParams() { execPtr = nullptr; auto cache = context->getParamsCache(); - auto result = cache->getOrCreate(key, [] (const DefConvKey& key) -> std::shared_ptr { + auto result = cache->getOrCreate(key, [](const DefConvKey& key) -> std::shared_ptr { if (key.implType == impl_desc_type::ref) { return std::make_shared(key.defConvAttr, key.descVector); } @@ -1258,9 +1293,13 @@ void DeformableConvolution::executeDynamicImpl(dnnl::stream strm) { execute(strm); } -void DeformableConvolution::DefConvJitExecutor::exec(const float* src, const float* offsets, - const float* weights, const float* modulation, float* dst, - int *pSampledCoordsVector, float *pInterpWeightsVector) { +void DeformableConvolution::DefConvJitExecutor::exec(const float* src, + const float* offsets, + const float* weights, + const float* modulation, + float* dst, + int* 
pSampledCoordsVector, + float* pInterpWeightsVector) { this->pSampledCoordsVector = pSampledCoordsVector; this->pInterpWeightsVector = pInterpWeightsVector; prepareSamplingWeights(offsets, modulation, false); @@ -1276,9 +1315,11 @@ void DeformableConvolution::DefConvJitExecutor::exec(const float* src, const flo const size_t _oc = g * jcp.nb_oc; const size_t _ic = g * jcp.nb_ic; - par_conv.src = &src[n * srcStrides[0] + _ic*jcp.ic_block * srcStrides[1]]; - par_conv.sampledWei = &(pInterpWeightsVector[(n * jcp.dg * jcp.oh + oh) * jcp.kh * jcp.kw * jcp.ow * sampledPointsPerPixel]); - par_conv.sampledCoords = &(pSampledCoordsVector[(n * jcp.dg * jcp.oh + oh) * jcp.kh * jcp.kw * jcp.ow * sampledPointsPerPixel]); + par_conv.src = &src[n * srcStrides[0] + _ic * jcp.ic_block * srcStrides[1]]; + par_conv.sampledWei = + &(pInterpWeightsVector[(n * jcp.dg * jcp.oh + oh) * jcp.kh * jcp.kw * jcp.ow * sampledPointsPerPixel]); + par_conv.sampledCoords = + &(pSampledCoordsVector[(n * jcp.dg * jcp.oh + oh) * jcp.kh * jcp.kw * jcp.ow * sampledPointsPerPixel]); par_conv.filt = &weights[g * jcp.nb_oc * jcp.nb_ic * jcp.kh * jcp.kw * jcp.ic_block * jcp.oc_block]; par_conv.dst = &dst[n * dstStrides[0] + _oc * jcp.oc_block * dstStrides[1] + oh * dstStrides[2]]; par_conv.buf = input_buffer_ptr + ithr * jcp.ur_w * jcp.kh * jcp.kw * jcp.ic; @@ -1292,20 +1333,20 @@ void DeformableConvolution::DefConvJitExecutor::exec(const float* src, const flo void DeformableConvolution::execute(dnnl::stream strm) { const size_t inputsNumber = getOriginalInputsNumber(); - auto &srcMemory0 = getParentEdgeAt(0)->getMemory(); - auto &srcMemory1 = getParentEdgeAt(1)->getMemory(); - auto &srcMemory2 = getParentEdgeAt(2)->getMemory(); - auto &dstMemory = getChildEdgeAt(0)->getMemory(); + auto& srcMemory0 = getParentEdgeAt(0)->getMemory(); + auto& srcMemory1 = getParentEdgeAt(1)->getMemory(); + auto& srcMemory2 = getParentEdgeAt(2)->getMemory(); + auto& dstMemory = getChildEdgeAt(0)->getMemory(); - const auto *src = srcMemory0.getDataAs(); - const auto *offsets = srcMemory1.getDataAs(); - const auto *weights = srcMemory2.getDataAs(); + const auto* src = srcMemory0.getDataAs(); + const auto* offsets = srcMemory1.getDataAs(); + const auto* weights = srcMemory2.getDataAs(); float* modulation = nullptr; if (inputsNumber > 3) { modulation = getSrcDataAtPortAs(3); } - float *dst = dstMemory.getDataAs(); + float* dst = dstMemory.getDataAs(); auto selectedPrimitiveDescriptor = getSelectedPrimitiveDescriptor(); if (!selectedPrimitiveDescriptor) @@ -1333,6 +1374,6 @@ ov::element::Type DeformableConvolution::getRuntimePrecision() const { return getMaxPrecision(getInputPrecisions()); } -} // namespace node -} // namespace intel_cpu -} // namespace ov +} // namespace node +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/def_conv.h b/src/plugins/intel_cpu/src/nodes/def_conv.h index 127fd00eb2bf00..ed5800a19a0e84 100644 --- a/src/plugins/intel_cpu/src/nodes/def_conv.h +++ b/src/plugins/intel_cpu/src/nodes/def_conv.h @@ -5,6 +5,7 @@ #pragma once #include + #include #include #include @@ -43,20 +44,20 @@ struct jit_def_conv_params { }; struct jit_def_conv_call_args { - const void *src; - const void *sampledWei; - const void *sampledCoords; - const void *filt; - const void *bias; - const void *dst; - const void *buf; + const void* src; + const void* sampledWei; + const void* sampledCoords; + const void* filt; + const void* bias; + const void* dst; + const void* buf; size_t oh_pos; }; struct jit_uni_def_conv_kernel { - 
void (*ker_)(const jit_def_conv_call_args *); + void (*ker_)(const jit_def_conv_call_args*); - void operator()(const jit_def_conv_call_args *args) { + void operator()(const jit_def_conv_call_args* args) { assert(ker_); ker_(args); } @@ -109,53 +110,66 @@ class DeformableConvolution : public Node { static constexpr size_t MOD_ID = 3; std::string errorPrefix; class DefConvExecutor { - public: - DefConvExecutor(const DefConvAttr &defConvAttr, - const std::vector> &descVector); - - virtual void exec(const float* src, const float* offsets, - const float* weights, const float* modulation, float* dst, - int *pSampledCoordsVector, float *pInterpWeightsVector) = 0; - virtual ~DefConvExecutor() = default; - - protected: - void prepareSamplingWeights(const float* offsets, const float* modulation = nullptr, bool enforceRef = false); - jit_def_conv_params jcp = {}; - VectorDims srcStrides; - VectorDims offStrides; - VectorDims weiStrides; - VectorDims modStrides; - VectorDims dstStrides; - int *pSampledCoordsVector; - float *pInterpWeightsVector; + public: + DefConvExecutor(const DefConvAttr& defConvAttr, + const std::vector>& descVector); + + virtual void exec(const float* src, + const float* offsets, + const float* weights, + const float* modulation, + float* dst, + int* pSampledCoordsVector, + float* pInterpWeightsVector) = 0; + virtual ~DefConvExecutor() = default; + + protected: + void prepareSamplingWeights(const float* offsets, const float* modulation = nullptr, bool enforceRef = false); + jit_def_conv_params jcp = {}; + VectorDims srcStrides; + VectorDims offStrides; + VectorDims weiStrides; + VectorDims modStrides; + VectorDims dstStrides; + int* pSampledCoordsVector; + float* pInterpWeightsVector; }; class DefConvRefExecutor : public DefConvExecutor { - public: - DefConvRefExecutor(const DefConvAttr &defConvAttr, - const std::vector> &descVector) : - DefConvExecutor(defConvAttr, descVector) {} - - void exec(const float* src, const float* offsets, - const float* weights, const float* modulation, float* dst, - int *pSampledCoordsVector, float *pInterpWeightsVector) override; + public: + DefConvRefExecutor(const DefConvAttr& defConvAttr, + const std::vector>& descVector) + : DefConvExecutor(defConvAttr, descVector) {} + + void exec(const float* src, + const float* offsets, + const float* weights, + const float* modulation, + float* dst, + int* pSampledCoordsVector, + float* pInterpWeightsVector) override; }; class DefConvJitExecutor : public DefConvExecutor { - std::shared_ptr def_conv_kernel = nullptr; - public: - DefConvJitExecutor(const DefConvAttr &defConvAttr, - const std::vector> &descVector); - - void exec(const float* src, const float* offsets, - const float* weights, const float* modulation, float* dst, - int *pSampledCoordsVector, float *pInterpWeightsVector) override; + std::shared_ptr def_conv_kernel = nullptr; + + public: + DefConvJitExecutor(const DefConvAttr& defConvAttr, + const std::vector>& descVector); + + void exec(const float* src, + const float* offsets, + const float* weights, + const float* modulation, + float* dst, + int* pSampledCoordsVector, + float* pInterpWeightsVector) override; }; std::shared_ptr execPtr = nullptr; bool autoPadding = false; }; -} // namespace node -} // namespace intel_cpu -} // namespace ov +} // namespace node +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/depth_to_space.cpp b/src/plugins/intel_cpu/src/nodes/depth_to_space.cpp index eb3789068adca1..a8629ce2592d76 100644 --- 
a/src/plugins/intel_cpu/src/nodes/depth_to_space.cpp +++ b/src/plugins/intel_cpu/src/nodes/depth_to_space.cpp @@ -4,16 +4,15 @@ #include "depth_to_space.h" -#include "dnnl_extension_utils.h" -#include "utils/general_utils.h" - #include -#include "common/primitive_hashing_utils.hpp" -#include "cpu/x64/jit_generator.hpp" -#include "openvino/opsets/opset1.hpp" #include #include "common/blocked_desc_creator.h" +#include "common/primitive_hashing_utils.hpp" +#include "cpu/x64/jit_generator.hpp" +#include "dnnl_extension_utils.h" +#include "openvino/opsets/opset1.hpp" +#include "utils/general_utils.h" #define THROW_ERROR(...) OPENVINO_THROW("DepthToSpace layer with name '", getName(), "' ", __VA_ARGS__) @@ -40,9 +39,8 @@ size_t DepthToSpace::DepthToSpaceAttrs::hash() const { } bool DepthToSpace::DepthToSpaceAttrs::operator==(const DepthToSpaceAttrs& rhs) const { - bool result = layoutType == rhs.layoutType && mode == rhs.mode && - blockSize == rhs.blockSize && blockStep == rhs.blockStep && - dataSize == rhs.dataSize && nSpatialDims == rhs.nSpatialDims && + bool result = layoutType == rhs.layoutType && mode == rhs.mode && blockSize == rhs.blockSize && + blockStep == rhs.blockStep && dataSize == rhs.dataSize && nSpatialDims == rhs.nSpatialDims && srcBlockedDims == rhs.srcBlockedDims; return result; @@ -56,7 +54,9 @@ bool DepthToSpace::isSupportedOperation(const std::shared_ptr& o return false; } const auto mode = depthToSpace->get_mode(); - if (!one_of(mode, ov::op::v0::DepthToSpace::DepthToSpaceMode::BLOCKS_FIRST, ov::op::v0::DepthToSpace::DepthToSpaceMode::DEPTH_FIRST)) { + if (!one_of(mode, + ov::op::v0::DepthToSpace::DepthToSpaceMode::BLOCKS_FIRST, + ov::op::v0::DepthToSpace::DepthToSpaceMode::DEPTH_FIRST)) { errorMessage = "Does not support mode: " + ov::as_string(mode); return false; } @@ -138,7 +138,8 @@ void DepthToSpace::initSupportedPrimitiveDescriptors() { if (inputDataShape.getRank() > 2) { const auto& srcDims = inputDataShape.getDims(); auto canUseBlocked = [OV_CAPTURE_CPY_AND_THIS](const size_t block) { - return srcDims[1] != Shape::UNDEFINED_DIM && srcDims[1] % block == 0 && (srcDims[1] / block) % attrs.blockStep == 0 && + return srcDims[1] != Shape::UNDEFINED_DIM && srcDims[1] % block == 0 && + (srcDims[1] / block) % attrs.blockStep == 0 && (attrs.mode == Mode::DEPTH_FIRST ? block % attrs.blockStep == 0 : true); }; @@ -172,9 +173,10 @@ void DepthToSpace::createPrimitive() { const auto& memoryDesc = srcMemPtr->getDesc(); attrs.dataSize = memoryDesc.getPrecision().size(); attrs.nSpatialDims = memoryDesc.getShape().getRank() - 2; - attrs.layoutType = memoryDesc.hasLayoutType(LayoutType::nCsp16c) ? LayoutType::nCsp16c : - memoryDesc.hasLayoutType(LayoutType::nCsp8c) ? LayoutType::nCsp8c : - memoryDesc.hasLayoutType(LayoutType::nspc) ? LayoutType::nspc : LayoutType::ncsp; + attrs.layoutType = memoryDesc.hasLayoutType(LayoutType::nCsp16c) ? LayoutType::nCsp16c + : memoryDesc.hasLayoutType(LayoutType::nCsp8c) ? LayoutType::nCsp8c + : memoryDesc.hasLayoutType(LayoutType::nspc) ? 
LayoutType::nspc + : LayoutType::ncsp; if (inputShapesDefined()) { if (needPrepareParams()) @@ -205,7 +207,8 @@ DepthToSpace::DepthToSpaceExecutor::DepthToSpaceExecutor(const DepthToSpaceAttrs const bool isBlocked = one_of(attrs.layoutType, LayoutType::nCsp16c, LayoutType::nCsp8c); const bool isChannelsFirst = attrs.layoutType == LayoutType::nspc; const size_t nDims = attrs.srcBlockedDims.size(); - const size_t reshapedRank = nDims + attrs.nSpatialDims + static_cast(isBlocked && attrs.mode == Mode::DEPTH_FIRST); + const size_t reshapedRank = + nDims + attrs.nSpatialDims + static_cast(isBlocked && attrs.mode == Mode::DEPTH_FIRST); const size_t lastIdx = reshapedRank - 1; size_t firstSpatialOrder = 2; @@ -219,21 +222,24 @@ DepthToSpace::DepthToSpaceExecutor::DepthToSpaceExecutor(const DepthToSpaceAttrs params.src_block_dims[0] = attrs.srcBlockedDims[0]; // reshaping of src dimensions and creating the permutation order for each layout: - // new shape: mode = blocks_first [N, block_size, block_size, ..., block_size, C / (block_size ^ K), D1, D2, ..., DK] - // mode = depth_first [N, C / (block_size ^ K), block_size, block_size, ..., block_size, D1, D2, ..., DK] + // new shape: mode = blocks_first [N, block_size, block_size, ..., block_size, C / (block_size ^ K), D1, D2, ..., + // DK] + // mode = depth_first [N, C / (block_size ^ K), block_size, block_size, ..., block_size, D1, D2, ..., + // DK] // order : mode = blocks_first : [0, K + 1, K + 2, 1, K + 3, 2, K + 4, 3, ..., K + (K + 1), K] // mode = depth_first : [0, 1, K + 2, 2, K + 3, 3, K + 4, 4, ..., K + (K + 1), K + 1] // where `k` is number of spatial dimensions - auto reshapeAndSetPermOrder = [&](const size_t idx1, const size_t idx2, const size_t shift, const VectorDims& dims) { - for (size_t i = 0; i < attrs.nSpatialDims; i++) { - params.order[i * 2 + shift] = i + idx1; - params.order[i * 2 + shift + 1] = i + idx2; + auto reshapeAndSetPermOrder = + [&](const size_t idx1, const size_t idx2, const size_t shift, const VectorDims& dims) { + for (size_t i = 0; i < attrs.nSpatialDims; i++) { + params.order[i * 2 + shift] = i + idx1; + params.order[i * 2 + shift + 1] = i + idx2; - params.src_block_dims[params.order[i * 2 + shift]] = dims[i + shift]; - params.src_block_dims[params.order[i * 2 + shift + 1]] = attrs.blockSize; - } - }; + params.src_block_dims[params.order[i * 2 + shift]] = dims[i + shift]; + params.src_block_dims[params.order[i * 2 + shift + 1]] = attrs.blockSize; + } + }; if (isBlocked) { size_t orderShiftForBlocks, orderShiftForDims; @@ -314,6 +320,6 @@ bool DepthToSpace::created() const { return getType() == Type::DepthToSpace; } -} // namespace node -} // namespace intel_cpu -} // namespace ov +} // namespace node +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/depth_to_space.h b/src/plugins/intel_cpu/src/nodes/depth_to_space.h index 2eda39f60394af..21eca73f97318c 100644 --- a/src/plugins/intel_cpu/src/nodes/depth_to_space.h +++ b/src/plugins/intel_cpu/src/nodes/depth_to_space.h @@ -54,6 +54,6 @@ class DepthToSpace : public Node { executorPtr execPtr = nullptr; }; -} // namespace node -} // namespace intel_cpu -} // namespace ov +} // namespace node +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/detection_output.cpp b/src/plugins/intel_cpu/src/nodes/detection_output.cpp index 99702780b83034..9cf52e7042c6ba 100644 --- a/src/plugins/intel_cpu/src/nodes/detection_output.cpp +++ b/src/plugins/intel_cpu/src/nodes/detection_output.cpp @@ -16,8 +16,7 
@@ namespace node { namespace { template -bool SortScorePairDescend(const std::pair& pair1, - const std::pair& pair2) { +bool SortScorePairDescend(const std::pair& pair1, const std::pair& pair2) { return (pair1.first > pair2.first) || (pair1.first == pair2.first && pair1.second < pair2.second); } @@ -27,9 +26,10 @@ bool SortScorePairDescend>(const std::pair pair2.first) || (pair1.first == pair2.first && pair1.second.second < pair2.second.second); } -} // namespace +} // namespace -bool DetectionOutput::isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept { +bool DetectionOutput::isSupportedOperation(const std::shared_ptr& op, + std::string& errorMessage) noexcept { try { const auto doOp = ov::as_type_ptr(op); if (!doOp) { @@ -58,7 +58,7 @@ DetectionOutput::DetectionOutput(const std::shared_ptr& op, const Grap errorPrefix = "DetectionOutput node with name '" + getName() + "' "; if (getOriginalInputsNumber() != 3 && getOriginalInputsNumber() != 5) - OPENVINO_THROW(errorPrefix, "has incorrect number of input edges."); + OPENVINO_THROW(errorPrefix, "has incorrect number of input edges."); if (getOriginalOutputsNumber() != 1) OPENVINO_THROW(errorPrefix, "has incorrect number of output edges."); @@ -93,7 +93,7 @@ DetectionOutput::DetectionOutput(const std::shared_ptr& op, const Grap void DetectionOutput::prepareParams() { const auto& idPriorDims = getParentEdgeAt(ID_PRIOR)->getMemory().getShape().getStaticDims(); - const auto &idConfDims = getParentEdgeAt(ID_CONF)->getMemory().getShape().getStaticDims(); + const auto& idConfDims = getParentEdgeAt(ID_CONF)->getMemory().getShape().getStaticDims(); priorsNum = static_cast(idPriorDims.back() / priorSize); isPriorsPerImg = idPriorDims.front() != 1; classesNum = static_cast(idConfDims.back() / priorsNum); @@ -130,9 +130,8 @@ void DetectionOutput::prepareParams() { // --> g_topk(vector<>(all detections) --> indices per class)) // MXNet: max conf for prior within img, filter(indices) --> topk_img(buffer) --> nms_cls(indices) // --> g_topk(vector<>(all detections) --> indices per class)) - isSparsityWorthwhile = - (confidenceThreshold > sparsityThreshold) && - ((classesNum * priorsNum * sizeof(float) * 2) > static_cast(cacheSizeL3)); + isSparsityWorthwhile = (confidenceThreshold > sparsityThreshold) && + ((classesNum * priorsNum * sizeof(float) * 2) > static_cast(cacheSizeL3)); confInfoLen = (!decreaseClassId && isSparsityWorthwhile) ? 
(2 * priorsNum + 1) : priorsNum; reorderedConf.resize(imgNum * classesNum * confInfoLen); @@ -149,17 +148,17 @@ void DetectionOutput::initSupportedPrimitiveDescriptors() { for (size_t i = 0; i < inputShapes.size(); ++i) inDataConf.emplace_back(LayoutType::ncsp, ov::element::f32); - addSupportedPrimDesc(inDataConf, - {{LayoutType::ncsp, ov::element::f32}}, - impl_desc_type::ref_any); + addSupportedPrimDesc(inDataConf, {{LayoutType::ncsp, ov::element::f32}}, impl_desc_type::ref_any); } struct ConfidenceComparatorDO { explicit ConfidenceComparatorDO(const float* confDataIn) : confData(confDataIn) {} bool operator()(int idx1, int idx2) { - if (confData[idx1] > confData[idx2]) return true; - if (confData[idx1] < confData[idx2]) return false; + if (confData[idx1] > confData[idx2]) + return true; + if (confData[idx1] < confData[idx2]) + return false; return idx1 < idx2; } @@ -171,31 +170,29 @@ void DetectionOutput::executeDynamicImpl(dnnl::stream strm) { } void DetectionOutput::execute(dnnl::stream strm) { - float *dstData = getDstDataAtPortAs(0); + float* dstData = getDstDataAtPortAs(0); - const float *locData = getSrcDataAtPortAs(ID_LOC); - const float *confData = getSrcDataAtPortAs(ID_CONF); - const float *priorData = getSrcDataAtPortAs(ID_PRIOR); - const float *ARMConfData = inputShapes.size() > 3 ? - getSrcDataAtPortAs(ID_ARM_CONF) : nullptr; - const float *ARMLocData = inputShapes.size() > 4 ? - getSrcDataAtPortAs(ID_ARM_LOC) : nullptr; + const float* locData = getSrcDataAtPortAs(ID_LOC); + const float* confData = getSrcDataAtPortAs(ID_CONF); + const float* priorData = getSrcDataAtPortAs(ID_PRIOR); + const float* ARMConfData = inputShapes.size() > 3 ? getSrcDataAtPortAs(ID_ARM_CONF) : nullptr; + const float* ARMLocData = inputShapes.size() > 4 ? getSrcDataAtPortAs(ID_ARM_LOC) : nullptr; - float *reorderedConfData = reorderedConf.data(); - int *reorderedConfDataIndices = reinterpret_cast(reorderedConf.data()); + float* reorderedConfData = reorderedConf.data(); + int* reorderedConfDataIndices = reinterpret_cast(reorderedConf.data()); - float *decodedBboxesData = decodedBboxes.data(); - float *bboxSizesData = bboxSizes.data(); - int *indicesData = indices.data(); - int *indicesBufData = indicesBuffer.data(); - int *detectionsData = detectionsCount.data(); + float* decodedBboxesData = decodedBboxes.data(); + float* bboxSizesData = bboxSizes.data(); + int* indicesData = indices.data(); + int* indicesBufData = indicesBuffer.data(); + int* detectionsData = detectionsCount.data(); memset(detectionsData, 0, imgNum * classesNum * sizeof(int)); int priorsBatch = isPriorsPerImg ? imgNum : 1; - int *numPriorsActualdata = numPriorsActual.data(); + int* numPriorsActualdata = numPriorsActual.data(); for (int n = 0; n < priorsBatch; ++n) { - const float *ppriors = priorData; + const float* ppriors = priorData; ppriors += varianceEncodedInTarget ? 
(n * priorsNum * priorSize) : (2 * n * priorsNum * priorSize); getActualPriorNum(ppriors, numPriorsActualdata, n); } @@ -204,21 +201,32 @@ void DetectionOutput::execute(dnnl::stream strm) { if (!isSparsityWorthwhile) { confReorderDense(confData, ARMConfData, reorderedConfData); - } else { // sparsity + } else { // sparsity if (!decreaseClassId) { - confReorderAndFilterSparsityCF(confData, ARMConfData, reorderedConfData, indicesData, indicesBufData, detectionsData); + confReorderAndFilterSparsityCF(confData, + ARMConfData, + reorderedConfData, + indicesData, + indicesBufData, + detectionsData); } else { - confReorderAndFilterSparsityMX(confData, ARMConfData, reorderedConfData, indicesData, indicesBufData, detectionsData); + confReorderAndFilterSparsityMX(confData, + ARMConfData, + reorderedConfData, + indicesData, + indicesBufData, + detectionsData); } } - int *confInfoV = confInfoForPrior.data(); + int* confInfoV = confInfoForPrior.data(); for (int n = 0; n < imgNum; ++n) { - const float *ppriors = priorData; - const float *priorVariances = priorData + priorsNum * priorSize; + const float* ppriors = priorData; + const float* priorVariances = priorData + priorsNum * priorSize; if (isPriorsPerImg) { - int priorSizePerImg = varianceEncodedInTarget ? (n * priorsNum * priorSize) : (2 * n * priorsNum * priorSize); + int priorSizePerImg = + varianceEncodedInTarget ? (n * priorsNum * priorSize) : (2 * n * priorsNum * priorSize); ppriors += priorSizePerImg; priorVariances += varianceEncodedInTarget ? 0 : priorSizePerImg; } @@ -226,17 +234,50 @@ void DetectionOutput::execute(dnnl::stream strm) { if (isShareLoc) { int locShift = n * priorsNum; int coordShift = locShift * 4; - const float *ploc = locData + coordShift; - float *pboxes = decodedBboxesData + coordShift; - float *psizes = bboxSizesData + locShift; - int *confInfoVB = confInfoV + locShift; + const float* ploc = locData + coordShift; + float* pboxes = decodedBboxesData + coordShift; + float* psizes = bboxSizesData + locShift; + int* confInfoVB = confInfoV + locShift; if (withAddBoxPred) { - const float *pARMLoc = ARMLocData + coordShift; - decodeBBoxes(ppriors, pARMLoc, priorVariances, pboxes, psizes, numPriorsActualdata, n, coordOffset, priorSize, true, nullptr, confInfoVB); - decodeBBoxes(pboxes, ploc, priorVariances, pboxes, psizes, numPriorsActualdata, n, 0, 4, false, nullptr, confInfoVB); + const float* pARMLoc = ARMLocData + coordShift; + decodeBBoxes(ppriors, + pARMLoc, + priorVariances, + pboxes, + psizes, + numPriorsActualdata, + n, + coordOffset, + priorSize, + true, + nullptr, + confInfoVB); + decodeBBoxes(pboxes, + ploc, + priorVariances, + pboxes, + psizes, + numPriorsActualdata, + n, + 0, + 4, + false, + nullptr, + confInfoVB); } else { - decodeBBoxes(ppriors, ploc, priorVariances, pboxes, psizes, numPriorsActualdata, n, coordOffset, priorSize, true, nullptr, confInfoVB); + decodeBBoxes(ppriors, + ploc, + priorVariances, + pboxes, + psizes, + numPriorsActualdata, + n, + coordOffset, + priorSize, + true, + nullptr, + confInfoVB); } } else { for (int c = 0; c < locNumForClasses; ++c) { @@ -245,16 +286,46 @@ void DetectionOutput::execute(dnnl::stream strm) { } int locShift = n * priorsNum * locNumForClasses; int coordShift = locShift * 4; - const float *ploc = locData + coordShift + c * 4; - float *pboxes = decodedBboxesData + coordShift + c * 4 * priorsNum; - float *psizes = bboxSizesData + locShift + c * priorsNum; - int *confInfoHBC = reorderedConfDataIndices + n * confInfoLen * classesNum + c*confInfoLen; + const float* ploc 
= locData + coordShift + c * 4; + float* pboxes = decodedBboxesData + coordShift + c * 4 * priorsNum; + float* psizes = bboxSizesData + locShift + c * priorsNum; + int* confInfoHBC = reorderedConfDataIndices + n * confInfoLen * classesNum + c * confInfoLen; if (withAddBoxPred) { - const float *pARMLoc = ARMLocData + n * 4 * locNumForClasses * priorsNum + c * 4; - decodeBBoxes(ppriors, pARMLoc, priorVariances, pboxes, psizes, numPriorsActualdata, n, coordOffset, priorSize, true, confInfoHBC); - decodeBBoxes(pboxes, ploc, priorVariances, pboxes, psizes, numPriorsActualdata, n, 0, 4, false, confInfoHBC); + const float* pARMLoc = ARMLocData + n * 4 * locNumForClasses * priorsNum + c * 4; + decodeBBoxes(ppriors, + pARMLoc, + priorVariances, + pboxes, + psizes, + numPriorsActualdata, + n, + coordOffset, + priorSize, + true, + confInfoHBC); + decodeBBoxes(pboxes, + ploc, + priorVariances, + pboxes, + psizes, + numPriorsActualdata, + n, + 0, + 4, + false, + confInfoHBC); } else { - decodeBBoxes(ppriors, ploc, priorVariances, pboxes, psizes, numPriorsActualdata, n, coordOffset, priorSize, true, confInfoHBC); + decodeBBoxes(ppriors, + ploc, + priorVariances, + pboxes, + psizes, + numPriorsActualdata, + n, + coordOffset, + priorSize, + true, + confInfoHBC); } } } @@ -267,16 +338,16 @@ void DetectionOutput::execute(dnnl::stream strm) { parallel_for(classesNum, [&](int c) { if (c != backgroundClassId) { // Ignore background class const int off = n * priorsNum * classesNum + c * priorsNum; - const float *pconfReorder = reorderedConfData + off; - int *pindices = indicesData + off; - int *pbuffer = indicesBufData + off; - int *pdetections = detectionsData + n * classesNum + c; + const float* pconfReorder = reorderedConfData + off; + int* pindices = indicesData + off; + int* pbuffer = indicesBufData + off; + int* pdetections = detectionsData + n * classesNum + c; if (!isSparsityWorthwhile) confFilterCF(pconfReorder, pindices, pbuffer, pdetections, n); - const float *pboxes; - const float *psizes; + const float* pboxes; + const float* psizes; if (isShareLoc) { pboxes = decodedBboxesData + n * 4 * priorsNum; psizes = bboxSizesData + n * priorsNum; @@ -291,23 +362,23 @@ void DetectionOutput::execute(dnnl::stream strm) { } else { // MXNet style const int offImg = n * priorsNum * classesNum; - const float *pconf = confData + offImg; - float *pconfReorder = reorderedConfData + offImg; - int *pbuffer = indicesBufData + offImg; - int *pindices = indicesData + offImg; - int *pdetections = detectionsData + n * classesNum; + const float* pconf = confData + offImg; + float* pconfReorder = reorderedConfData + offImg; + int* pbuffer = indicesBufData + offImg; + int* pindices = indicesData + offImg; + int* pdetections = detectionsData + n * classesNum; if (!isSparsityWorthwhile) confFilterMX(pconf, ARMConfData, pconfReorder, pindices, pbuffer, pdetections, n); - const float *pboxes = decodedBboxesData + n * 4 * locNumForClasses * priorsNum; - const float *psizes = bboxSizesData + n * locNumForClasses * priorsNum; + const float* pboxes = decodedBboxesData + n * 4 * locNumForClasses * priorsNum; + const float* psizes = bboxSizesData + n * locNumForClasses * priorsNum; NMSMX(pbuffer, pdetections, pindices, pboxes, psizes); } int detectionsTotal = 0; - detectionsTotal = parallel_sum(classesNum, detectionsTotal, [&](size_t c)->int { + detectionsTotal = parallel_sum(classesNum, detectionsTotal, [&](size_t c) -> int { return detectionsData[n * classesNum + c]; }); @@ -318,9 +389,9 @@ void DetectionOutput::execute(dnnl::stream 
strm) { std::mutex mtx; parallel_for(classesNum, [&](int c) { const int detections = detectionsData[n * classesNum + c]; - int *pindices = indicesData + n * classesNum * priorsNum + c * priorsNum; + int* pindices = indicesData + n * classesNum * priorsNum + c * priorsNum; - float *pconf = reorderedConfData + n * classesNum * confInfoLen + c * confInfoLen; + float* pconf = reorderedConfData + n * classesNum * confInfoLen + c * confInfoLen; for (int i = 0; i < detections; ++i) { int pr = pindices[i]; @@ -330,7 +401,8 @@ void DetectionOutput::execute(dnnl::stream strm) { } }); - std::sort(confIndicesClassMap.begin(), confIndicesClassMap.end(), + std::sort(confIndicesClassMap.begin(), + confIndicesClassMap.end(), SortScorePairDescend>); confIndicesClassMap.resize(keepTopK); @@ -340,7 +412,7 @@ void DetectionOutput::execute(dnnl::stream strm) { for (size_t j = 0; j < confIndicesClassMap.size(); ++j) { const int cls = confIndicesClassMap[j].second.first; const int pr = confIndicesClassMap[j].second.second; - int *pindices = indicesData + n * classesNum * priorsNum + cls * priorsNum; + int* pindices = indicesData + n * classesNum * priorsNum + cls * priorsNum; pindices[detectionsData[n * classesNum + cls]] = pr; detectionsData[n * classesNum + cls]++; } @@ -351,7 +423,11 @@ void DetectionOutput::execute(dnnl::stream strm) { generateOutput(reorderedConfData, indicesData, detectionsData, decodedBboxesData, dstData); } -inline void DetectionOutput::confFilterCF(const float* pconf, int* pindices, int* pbuffer, int* detectionsData, const int& n) { +inline void DetectionOutput::confFilterCF(const float* pconf, + int* pindices, + int* pbuffer, + int* detectionsData, + const int& n) { // in: reorderedConf // out: pindices count int count = 0; @@ -371,21 +447,27 @@ inline void DetectionOutput::confFilterCF(const float* pconf, int* pindices, int // MX filter is per image filter, max output is prior num(select max for all class within this prior) // NMS is per class, keep topk is per image, final output is per class -inline void DetectionOutput::confFilterMX(const float* confData, const float* ARMConfData, float* reorderedConfData, - int* indicesData, int* indicesBufData, int* detectionsData, const int& n) { +inline void DetectionOutput::confFilterMX(const float* confData, + const float* ARMConfData, + float* reorderedConfData, + int* indicesData, + int* indicesBufData, + int* detectionsData, + const int& n) { std::mutex mtx; parallel_for(numPriorsActual[n], [&](size_t p) { // in: origin conf // out: pindices, detectionCount // intentionally code branch from higher level if (withAddBoxPred) { - const bool isARMPrior = ARMConfData[n*priorsNum*2 + p * 2 + 1] < objScore; + const bool isARMPrior = ARMConfData[n * priorsNum * 2 + p * 2 + 1] < objScore; float maxConf = -1; int maxCIdx = 0; for (int c = 1; c < classesNum; ++c) { float conf = confData[p * classesNum + c]; if (isARMPrior) - conf = (c == backgroundClassId) ? 1.0f : 0.0f; // still need refresh conf due to read from origin conf + conf = + (c == backgroundClassId) ? 1.0f : 0.0f; // still need refresh conf due to read from origin conf if (conf >= confidenceThreshold && conf > maxConf) { maxConf = conf; maxCIdx = c; @@ -394,7 +476,7 @@ inline void DetectionOutput::confFilterMX(const float* confData, const float* AR if (maxCIdx > 0) { // include this prior mtx.lock(); - indicesData[detectionsData[0]] = maxCIdx*priorsNum + p; // de-refer to get prior and class id. 
+ indicesData[detectionsData[0]] = maxCIdx * priorsNum + p; // de-refer to get prior and class id. detectionsData[0]++; mtx.unlock(); } @@ -411,7 +493,7 @@ inline void DetectionOutput::confFilterMX(const float* confData, const float* AR if (maxCIdx > 0) { // include this prior and class with max conf mtx.lock(); - indicesData[detectionsData[0]] = maxCIdx*priorsNum + p; // de-refer to get prior and class id. + indicesData[detectionsData[0]] = maxCIdx * priorsNum + p; // de-refer to get prior and class id. detectionsData[0]++; mtx.unlock(); } @@ -423,14 +505,14 @@ inline void DetectionOutput::confFilterMX(const float* confData, const float* AR int count = detectionsData[0]; int k = (topK == -1 ? count : (std::min)(topK, count)); - const float *pconf = reorderedConfData; + const float* pconf = reorderedConfData; // int *indices = indicesData; // int *pbuffer = indicesBufData; topk(indicesData, indicesBufData, pconf, count, k); detectionsData[0] = k; } -inline void DetectionOutput::getActualPriorNum(const float *priorData, int* numPriorsActual, int n) { +inline void DetectionOutput::getActualPriorNum(const float* priorData, int* numPriorsActual, int n) { numPriorsActual[n] = priorsNum; if (!normalized) { int num = 0; @@ -444,16 +526,20 @@ inline void DetectionOutput::getActualPriorNum(const float *priorData, int* numP } } -inline void DetectionOutput::confReorderDense(const float *confData, const float *ARMConfData, float *reorderedConfData) { +inline void DetectionOutput::confReorderDense(const float* confData, + const float* ARMConfData, + float* reorderedConfData) { if (withAddBoxPred) { parallel_for2d(imgNum, priorsNum, [&](size_t n, size_t p) { if (ARMConfData[n * priorsNum * 2 + p * 2 + 1] < objScore) { for (int c = 0; c < classesNum; ++c) { - reorderedConfData[n * priorsNum * classesNum + c * priorsNum + p] = c == backgroundClassId ? 1.0f : 0.0f; + reorderedConfData[n * priorsNum * classesNum + c * priorsNum + p] = + c == backgroundClassId ? 
1.0f : 0.0f; } } else { for (int c = 0; c < classesNum; ++c) { - reorderedConfData[n * priorsNum * classesNum + c * priorsNum + p] = confData[n * priorsNum * classesNum + p * classesNum + c]; + reorderedConfData[n * priorsNum * classesNum + c * priorsNum + p] = + confData[n * priorsNum * classesNum + p * classesNum + c]; } } }); @@ -463,20 +549,23 @@ inline void DetectionOutput::confReorderDense(const float *confData, const float parallel_for2d(imgNum, classesNum, [&](size_t n, size_t c) { const int offset = n * priorsNum * classesNum; for (int p = 0; p < priorsNum; ++p) { - reorderedConfData[offset + c * priorsNum + p] = - confData[offset + p * classesNum + c]; + reorderedConfData[offset + c * priorsNum + p] = confData[offset + p * classesNum + c]; } }); } -inline void DetectionOutput::confReorderAndFilterSparsityCF(const float* confData, const float* ARMConfData, float* reorderedConfData, - int* indicesData, int* indicesBufData, int* detectionsData) { +inline void DetectionOutput::confReorderAndFilterSparsityCF(const float* confData, + const float* ARMConfData, + float* reorderedConfData, + int* indicesData, + int* indicesBufData, + int* detectionsData) { int* reorderedConfDataIndices = reinterpret_cast(reorderedConfData); for (int n = 0; n < imgNum; ++n) { const int off = n * priorsNum * classesNum; const int offV = n * priorsNum; // vertical info - const int offH = n * confInfoLen * classesNum; // horizontal info + const int offH = n * confInfoLen * classesNum; // horizontal info // reset count parallel_for(classesNum, [&](size_t c) { const int countIdx = offH + c * confInfoLen + priorsNum; @@ -506,7 +595,7 @@ inline void DetectionOutput::confReorderAndFilterSparsityCF(const float* confDat // vertical info for isShareLoc(flag to decode for each prior) if (!priorStatusSet && isShareLoc) { - confInfoForPrior[offV + p] = 1; // 1 for decode + confInfoForPrior[offV + p] = 1; // 1 for decode } } } @@ -542,9 +631,9 @@ inline void DetectionOutput::confReorderAndFilterSparsityCF(const float* confDat const int count = reorderedConfDataIndices[countIdx]; const int k = (topK == -1 ? 
count : (std::min)(topK, count)); - int *reorderedConfIndices = reorderedConfDataIndices + countIdx + 1; - int *pbuffer = indicesBufData + off + c * priorsNum; - const float *pconf = reorderedConfData + offH + c * confInfoLen; + int* reorderedConfIndices = reorderedConfDataIndices + countIdx + 1; + int* pbuffer = indicesBufData + off + c * priorsNum; + const float* pconf = reorderedConfData + offH + c * confInfoLen; topk(reorderedConfIndices, pbuffer, pconf, count, k); detectionsData[n * classesNum + c] = k; @@ -552,8 +641,12 @@ inline void DetectionOutput::confReorderAndFilterSparsityCF(const float* confDat } } -inline void DetectionOutput::confReorderAndFilterSparsityMX(const float* confData, const float* ARMConfData, float* reorderedConfData, - int* indicesData, int* indicesBufData, int* detectionsData) { +inline void DetectionOutput::confReorderAndFilterSparsityMX(const float* confData, + const float* ARMConfData, + float* reorderedConfData, + int* indicesData, + int* indicesBufData, + int* detectionsData) { for (int n = 0; n < imgNum; ++n) { const int off = n * priorsNum * classesNum; const int offV = n * priorsNum; // vertical info @@ -579,7 +672,7 @@ inline void DetectionOutput::confReorderAndFilterSparsityMX(const float* confDat // vertical info for isShareLoc(flag to decode for each prior) if (!priorStatusSet && isShareLoc) { - confInfoForPrior[offV + p] = 1; // 1 for decode + confInfoForPrior[offV + p] = 1; // 1 for decode } // vertical info for MXNet style(max conf for each prior) if (c != 0) { @@ -593,7 +686,8 @@ inline void DetectionOutput::confReorderAndFilterSparsityMX(const float* confDat // MXNet statistic, indices and detectionCount is for each image if (maxCIdx > 0) { mtx.lock(); - indicesData[off + detectionsData[n * classesNum]] = maxCIdx * priorsNum + p; // de-refer to get prior and class id. + indicesData[off + detectionsData[n * classesNum]] = + maxCIdx * priorsNum + p; // de-refer to get prior and class id. detectionsData[n * classesNum]++; mtx.unlock(); } @@ -604,27 +698,27 @@ inline void DetectionOutput::confReorderAndFilterSparsityMX(const float* confDat const int count = detectionsData[n * classesNum]; const int k = (topK == -1 ? 
count : (std::min)(topK, count)); - const float *pconf = reorderedConfData + off; - int *indices = indicesData + off; - int *pbuffer = indicesBufData + off; + const float* pconf = reorderedConfData + off; + int* indices = indicesData + off; + int* pbuffer = indicesBufData + off; topk(indices, pbuffer, pconf, count, k); detectionsData[n * classesNum] = k; } } // apply locData(offset) to priordata, generate decodedBox -inline void DetectionOutput::decodeBBoxes(const float *priorData, - const float *locData, - const float *varianceData, - float *decodedBboxes, - float *decodedBboxSizes, - int* numPriorsActual, - int n, - const int& offs, - const int& priorSize, - bool decodeType, - const int *confInfoH, - const int *confInfoV) { +inline void DetectionOutput::decodeBBoxes(const float* priorData, + const float* locData, + const float* varianceData, + float* decodedBboxes, + float* decodedBboxSizes, + int* numPriorsActual, + int n, + const int& offs, + const int& priorSize, + bool decodeType, + const int* confInfoH, + const int* confInfoV) { int prNum = numPriorsActual[n]; if (!decodeType) { prNum = priorsNum; @@ -672,8 +766,8 @@ inline void DetectionOutput::decodeBBoxes(const float *priorData, newYMax = priorYMax + varianceData[p * 4 + 3] * locYMax; } } else if (codeType == CodeType::CENTER_SIZE) { - float priorWidth = priorXMax - priorXMin; - float priorHeight = priorYMax - priorYMin; + float priorWidth = priorXMax - priorXMin; + float priorHeight = priorYMax - priorYMin; float priorCenterX = (priorXMin + priorXMax) / 2.0f; float priorCenterY = (priorYMin + priorYMax) / 2.0f; @@ -682,21 +776,21 @@ inline void DetectionOutput::decodeBBoxes(const float *priorData, if (varianceEncodedInTarget) { // variance is encoded in target, we simply need to restore the offset predictions. - decodeBboxCenterX = locXMin * priorWidth + priorCenterX; + decodeBboxCenterX = locXMin * priorWidth + priorCenterX; decodeBboxCenterY = locYMin * priorHeight + priorCenterY; - decodeBboxWidth = std::exp(locXMax) * priorWidth; + decodeBboxWidth = std::exp(locXMax) * priorWidth; decodeBboxHeight = std::exp(locYMax) * priorHeight; } else { // variance is encoded in bbox, we need to scale the offset accordingly. 
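
For readers skimming the reformatted hunk above: the CENTER_SIZE branch converts the prior box to a center/size representation, applies the location offsets (scaled by the per-prior variances when they are not encoded in the target), and converts back to corner coordinates. Below is a minimal standalone sketch of that decode for one prior/loc pair, written under the same math as the hunk; the names are illustrative and not taken from the node itself.

// Illustrative sketch, not part of the diff: CENTER_SIZE decoding with variances
// encoded in the bbox, for a single prior and its predicted offsets.
#include <array>
#include <cmath>

std::array<float, 4> decode_center_size(const std::array<float, 4>& prior,     // {xmin, ymin, xmax, ymax}
                                        const std::array<float, 4>& loc,       // {dx, dy, dlog_w, dlog_h}
                                        const std::array<float, 4>& variance) {
    const float priorW  = prior[2] - prior[0];
    const float priorH  = prior[3] - prior[1];
    const float priorCx = (prior[0] + prior[2]) / 2.0f;
    const float priorCy = (prior[1] + prior[3]) / 2.0f;

    // Variance scales the predicted offsets before they are applied to the prior.
    const float cx = variance[0] * loc[0] * priorW + priorCx;
    const float cy = variance[1] * loc[1] * priorH + priorCy;
    const float w  = std::exp(variance[2] * loc[2]) * priorW;
    const float h  = std::exp(variance[3] * loc[3]) * priorH;

    // Back to corner representation.
    return {cx - w / 2.0f, cy - h / 2.0f, cx + w / 2.0f, cy + h / 2.0f};
}
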
- decodeBboxCenterX = varianceData[p*4 + 0] * locXMin * priorWidth + priorCenterX; - decodeBboxCenterY = varianceData[p*4 + 1] * locYMin * priorHeight + priorCenterY; - decodeBboxWidth = std::exp(varianceData[p*4 + 2] * locXMax) * priorWidth; - decodeBboxHeight = std::exp(varianceData[p*4 + 3] * locYMax) * priorHeight; + decodeBboxCenterX = varianceData[p * 4 + 0] * locXMin * priorWidth + priorCenterX; + decodeBboxCenterY = varianceData[p * 4 + 1] * locYMin * priorHeight + priorCenterY; + decodeBboxWidth = std::exp(varianceData[p * 4 + 2] * locXMax) * priorWidth; + decodeBboxHeight = std::exp(varianceData[p * 4 + 3] * locYMax) * priorHeight; } - newXMin = decodeBboxCenterX - decodeBboxWidth / 2.0f; + newXMin = decodeBboxCenterX - decodeBboxWidth / 2.0f; newYMin = decodeBboxCenterY - decodeBboxHeight / 2.0f; - newXMax = decodeBboxCenterX + decodeBboxWidth / 2.0f; + newXMax = decodeBboxCenterX + decodeBboxWidth / 2.0f; newYMax = decodeBboxCenterY + decodeBboxHeight / 2.0f; } @@ -707,25 +801,20 @@ inline void DetectionOutput::decodeBBoxes(const float *priorData, newYMax = (std::max)(0.0f, (std::min)(1.0f, newYMax)); } - decodedBboxes[p*4 + 0] = newXMin; - decodedBboxes[p*4 + 1] = newYMin; - decodedBboxes[p*4 + 2] = newXMax; - decodedBboxes[p*4 + 3] = newYMax; + decodedBboxes[p * 4 + 0] = newXMin; + decodedBboxes[p * 4 + 1] = newYMin; + decodedBboxes[p * 4 + 2] = newXMax; + decodedBboxes[p * 4 + 3] = newYMax; decodedBboxSizes[p] = (newXMax - newXMin) * (newYMax - newYMin); }); } -inline void DetectionOutput::topk(const int *indicesIn, int *indicesOut, const float *conf, int n, int k) { - std::partial_sort_copy(indicesIn, indicesIn + n, - indicesOut, indicesOut + k, - ConfidenceComparatorDO(conf)); +inline void DetectionOutput::topk(const int* indicesIn, int* indicesOut, const float* conf, int n, int k) { + std::partial_sort_copy(indicesIn, indicesIn + n, indicesOut, indicesOut + k, ConfidenceComparatorDO(conf)); } -static inline float JaccardOverlap(const float *decodedBbox, - const float *bboxSizes, - const int idx1, - const int idx2) { +static inline float JaccardOverlap(const float* decodedBbox, const float* bboxSizes, const int idx1, const int idx2) { const float xmin1 = decodedBbox[idx1 * 4 + 0]; const float ymin1 = decodedBbox[idx1 * 4 + 1]; const float xmax1 = decodedBbox[idx1 * 4 + 2]; @@ -745,7 +834,7 @@ static inline float JaccardOverlap(const float *decodedBbox, float intersectXMax = (std::min)(xmax1, xmax2); float intersectYMax = (std::min)(ymax1, ymax2); - float intersectWidth = intersectXMax - intersectXMin; + float intersectWidth = intersectXMax - intersectXMin; float intersectHeight = intersectYMax - intersectYMin; if (intersectWidth <= 0 || intersectHeight <= 0) { @@ -760,10 +849,10 @@ static inline float JaccardOverlap(const float *decodedBbox, } inline void DetectionOutput::NMSCF(int* indicesIn, - int& detections, - int* indicesOut, - const float* bboxes, - const float* boxSizes) { + int& detections, + int* indicesOut, + const float* bboxes, + const float* boxSizes) { // nms for this class int countIn = detections; detections = 0; @@ -787,10 +876,10 @@ inline void DetectionOutput::NMSCF(int* indicesIn, } inline void DetectionOutput::NMSMX(int* indicesIn, - int* detections, - int* indicesOut, - const float* bboxes, - const float* sizes) { + int* detections, + int* indicesOut, + const float* bboxes, + const float* sizes) { // Input is candidate for image, output is candidate for each class within image int countIn = detections[0]; detections[0] = 0; @@ -801,8 +890,8 @@ inline 
void DetectionOutput::NMSMX(int* indicesIn, const int prior = idx % priorsNum; // nms within this class - int &ndetection = detections[cls]; - int *pindices = indicesOut + cls * priorsNum; + int& ndetection = detections[cls]; + int* pindices = indicesOut + cls * priorsNum; bool keep = true; for (int k = 0; k < ndetection; ++k) { @@ -825,8 +914,11 @@ inline void DetectionOutput::NMSMX(int* indicesIn, } } -inline void DetectionOutput::generateOutput(float* reorderedConfData, int* indicesData, int* detectionsData, float* decodedBboxesData, - float* dstData) { +inline void DetectionOutput::generateOutput(float* reorderedConfData, + int* indicesData, + int* detectionsData, + float* decodedBboxesData, + float* dstData) { const auto& outDims = getChildEdgeAt(0)->getMemory().getStaticDims(); const int numResults = outDims[2]; const int DETECTION_SIZE = outDims[3]; @@ -850,26 +942,22 @@ inline void DetectionOutput::generateOutput(float* reorderedConfData, int* indic // set final detection result to output blob int count = 0; for (int n = 0; n < imgNum; ++n) { - const float *pconf = reorderedConfData + n * confInfoLen * classesNum; - const float *pboxes = decodedBboxesData + n * priorsNum * 4 * locNumForClasses; - const int *pindices = indicesData + n * classesNum * priorsNum; + const float* pconf = reorderedConfData + n * confInfoLen * classesNum; + const float* pboxes = decodedBboxesData + n * priorsNum * 4 * locNumForClasses; + const int* pindices = indicesData + n * classesNum * priorsNum; for (int c = 0; c < classesNum; ++c) { for (int i = 0; i < detectionsData[n * classesNum + c]; ++i) { int prIdx = pindices[c * priorsNum + i]; dstData[count * DETECTION_SIZE + 0] = static_cast(n); - dstData[count * DETECTION_SIZE + 1] = static_cast(decreaseClassId ? c-1 : c); + dstData[count * DETECTION_SIZE + 1] = static_cast(decreaseClassId ? c - 1 : c); dstData[count * DETECTION_SIZE + 2] = pconf[c * confInfoLen + prIdx]; - float xmin = isShareLoc ? pboxes[prIdx * 4 + 0] : - pboxes[c * 4 * priorsNum + prIdx * 4 + 0]; - float ymin = isShareLoc ? pboxes[prIdx * 4 + 1] : - pboxes[c * 4 * priorsNum + prIdx * 4 + 1]; - float xmax = isShareLoc ? pboxes[prIdx * 4 + 2] : - pboxes[c * 4 * priorsNum + prIdx * 4 + 2]; - float ymax = isShareLoc ? pboxes[prIdx * 4 + 3] : - pboxes[c * 4 * priorsNum + prIdx * 4 + 3]; + float xmin = isShareLoc ? pboxes[prIdx * 4 + 0] : pboxes[c * 4 * priorsNum + prIdx * 4 + 0]; + float ymin = isShareLoc ? pboxes[prIdx * 4 + 1] : pboxes[c * 4 * priorsNum + prIdx * 4 + 1]; + float xmax = isShareLoc ? pboxes[prIdx * 4 + 2] : pboxes[c * 4 * priorsNum + prIdx * 4 + 2]; + float ymax = isShareLoc ? 
pboxes[prIdx * 4 + 3] : pboxes[c * 4 * priorsNum + prIdx * 4 + 3]; if (clipAfterNMS) { xmin = (std::max)(0.0f, (std::min)(1.0f, xmin)); @@ -898,6 +986,6 @@ bool DetectionOutput::created() const { return getType() == Type::DetectionOutput; } -} // namespace node -} // namespace intel_cpu -} // namespace ov +} // namespace node +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/detection_output.h b/src/plugins/intel_cpu/src/nodes/detection_output.h index 418898f011f313..1a42bfa9b2980a 100644 --- a/src/plugins/intel_cpu/src/nodes/detection_output.h +++ b/src/plugins/intel_cpu/src/nodes/detection_output.h @@ -15,7 +15,7 @@ class DetectionOutput : public Node { public: DetectionOutput(const std::shared_ptr& op, const GraphContext::CPtr context); - void getSupportedDescriptors() override {}; + void getSupportedDescriptors() override{}; void initSupportedPrimitiveDescriptors() override; void execute(dnnl::stream strm) override; bool created() const override; @@ -47,8 +47,8 @@ class DetectionOutput : public Node { float sparsityThreshold = 0.03f; int topK = 0; float NMSThreshold = 0.0f; - bool clipBeforeNMS = false; - bool clipAfterNMS = false; + bool clipBeforeNMS = false; + bool clipAfterNMS = false; int backgroundClassId = 0; bool decreaseClassId = false; int keepTopK = 0; @@ -75,28 +75,52 @@ class DetectionOutput : public Node { inline void confFilterCF(const float* pConf, int* pindices, int* pbuffer, int* detectionsData, const int& n); - inline void confFilterMX(const float* confData, const float* ARMConfData, float* reorderedConfData, - int* indicesData, int* indicesBufData, int* detectionsData, const int& n); - - inline void confReorderAndFilterSparsityCF(const float* confData, const float* ARMConfData, float* reorderedConfData, - int* indicesData, int* indicesBufData, int* detectionsData); - - inline void confReorderAndFilterSparsityMX(const float* confData, const float* ARMConfData, float* reorderedConfData, - int* indicesData, int* indicesBufData, int* detectionsData); - - inline void decodeBBoxes(const float* prior_data, const float* loc_data, const float* variance_data, - float* decoded_bboxes, float* decoded_bbox_sizes, int* num_priors_actual, int n, const int& offs, const int& pr_size, - bool decodeType = true, const int* conf_info_h = nullptr, const int* conf_info_v = nullptr); // decodeType is false after ARM - - inline void NMSCF(int* indicesIn, int& detections, int* indicesOut, - const float* bboxes, const float* boxSizes); - - inline void NMSMX(int* indicesIn, int* detections, int* indicesOut, - const float* bboxes, const float* sizes); + inline void confFilterMX(const float* confData, + const float* ARMConfData, + float* reorderedConfData, + int* indicesData, + int* indicesBufData, + int* detectionsData, + const int& n); + + inline void confReorderAndFilterSparsityCF(const float* confData, + const float* ARMConfData, + float* reorderedConfData, + int* indicesData, + int* indicesBufData, + int* detectionsData); + + inline void confReorderAndFilterSparsityMX(const float* confData, + const float* ARMConfData, + float* reorderedConfData, + int* indicesData, + int* indicesBufData, + int* detectionsData); + + inline void decodeBBoxes(const float* prior_data, + const float* loc_data, + const float* variance_data, + float* decoded_bboxes, + float* decoded_bbox_sizes, + int* num_priors_actual, + int n, + const int& offs, + const int& pr_size, + bool decodeType = true, + const int* conf_info_h = nullptr, + const int* conf_info_v = nullptr); // 
decodeType is false after ARM + + inline void NMSCF(int* indicesIn, int& detections, int* indicesOut, const float* bboxes, const float* boxSizes); + + inline void NMSMX(int* indicesIn, int* detections, int* indicesOut, const float* bboxes, const float* sizes); inline void topk(const int* indicesIn, int* indicesOut, const float* conf, int n, int k); - inline void generateOutput(float* reorderedConfData, int* indicesData, int* detectionsData, float* decodedBboxesData, float* dstData); + inline void generateOutput(float* reorderedConfData, + int* indicesData, + int* detectionsData, + float* decodedBboxesData, + float* dstData); std::vector decodedBboxes; std::vector indicesBuffer; @@ -110,6 +134,6 @@ class DetectionOutput : public Node { std::string errorPrefix; }; -} // namespace node -} // namespace intel_cpu -} // namespace ov +} // namespace node +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/dft.cpp b/src/plugins/intel_cpu/src/nodes/dft.cpp index 76ecbbb36617f5..5fa8053d7024d7 100644 --- a/src/plugins/intel_cpu/src/nodes/dft.cpp +++ b/src/plugins/intel_cpu/src/nodes/dft.cpp @@ -4,17 +4,17 @@ #include "dft.h" +#include +#include #include #include -#include -#include "dnnl_extension_utils.h" -#include "openvino/core/parallel.hpp" +#include "common/cpu_memcpy.h" +#include "dnnl_extension_utils.h" #include "onednn/dnnl.h" +#include "openvino/core/parallel.hpp" #include "utils/general_utils.h" -#include "common/cpu_memcpy.h" #include "utils/ngraph_utils.hpp" -#include using namespace dnnl::impl; using namespace dnnl::impl::cpu::x64; @@ -104,10 +104,10 @@ void DFT::initSupportedPrimitiveDescriptors() { } } - std::vector inDataConfigurators({{LayoutType::ncsp, ov::element::f32}, - {LayoutType::ncsp, ov::element::i32}}); + std::vector inDataConfigurators( + {{LayoutType::ncsp, ov::element::f32}, {LayoutType::ncsp, ov::element::i32}}); if (inputShapes.size() > SIGNAL_SIZE_INDEX) - inDataConfigurators.push_back({LayoutType::ncsp, ov::element::i32}); + inDataConfigurators.push_back({LayoutType::ncsp, ov::element::i32}); addSupportedPrimDesc(inDataConfigurators, {{LayoutType::ncsp, ov::element::f32}}, impl_desc_type::ref_any); } @@ -172,8 +172,12 @@ size_t calculateOffsetFromStrides(const std::vector& coords, const std:: return offset; } -void gatherToBufferND(float* buffer, const float* data, size_t axis, const std::vector& dimIndexes, - const std::vector& shape, const std::vector& strides) { +void gatherToBufferND(float* buffer, + const float* data, + size_t axis, + const std::vector& dimIndexes, + const std::vector& shape, + const std::vector& strides) { size_t numberOfComplex = shape[axis]; size_t offset = calculateOffsetFromStrides(dimIndexes, strides); @@ -184,8 +188,12 @@ void gatherToBufferND(float* buffer, const float* data, size_t axis, const std:: } } -void applyBufferND(const float* buffer, float* output, size_t axis, const std::vector& dimIndexes, - const std::vector& shape, const std::vector& strides) { +void applyBufferND(const float* buffer, + float* output, + size_t axis, + const std::vector& dimIndexes, + const std::vector& shape, + const std::vector& strides) { size_t numberOfComplex = shape[axis]; size_t offset = calculateOffsetFromStrides(dimIndexes, strides); @@ -196,8 +204,12 @@ void applyBufferND(const float* buffer, float* output, size_t axis, const std::v } } -void copyDataToOutputWithSignalSize(const float* input, const std::vector& inputShape, const std::vector& inputStrides, - float* output, const std::vector& outputShape, const 
std::vector& outputStrides) { +void copyDataToOutputWithSignalSize(const float* input, + const std::vector& inputShape, + const std::vector& inputStrides, + float* output, + const std::vector& outputShape, + const std::vector& outputStrides) { auto totalInput = std::accumulate(inputShape.begin(), inputShape.end(), size_t(1), std::multiplies()); auto totalOutput = std::accumulate(outputShape.begin(), outputShape.end(), size_t(1), std::multiplies()); std::fill_n(output, totalOutput, 0.f); @@ -221,7 +233,10 @@ void copyDataToOutputWithSignalSize(const float* input, const std::vector inputStridesRange(inputStrides.begin(), inputStrides.begin() + iterationRange.size()); const std::vector outputStridesRange(outputStrides.begin(), outputStrides.begin() + iterationRange.size()); - const size_t blockSize = std::accumulate(inputShape.begin() + lastChangedDim + 1, inputShape.end(), size_t(1), std::multiplies()); + const size_t blockSize = std::accumulate(inputShape.begin() + lastChangedDim + 1, + inputShape.end(), + size_t(1), + std::multiplies()); const size_t blockSizeBytes = blockSize * sizeof(float); std::vector iterationCounter(iterationRange.size(), 0); do { @@ -231,7 +246,7 @@ void copyDataToOutputWithSignalSize(const float* input, const std::vectorgetMemory().getStaticDims(); @@ -269,7 +284,8 @@ void DFT::execute(dnnl::stream strm) { if (inputShape != outputShape) { copyDataToOutputWithSignalSize(src, inputShape, inputStrides, dst, outputShape, outputStrides); } else { - auto totalElements = std::accumulate(inputShape.begin(), inputShape.end(), size_t(1), std::multiplies()); + auto totalElements = + std::accumulate(inputShape.begin(), inputShape.end(), size_t(1), std::multiplies()); cpu_memcpy(dst, src, totalElements * sizeof(float)); } @@ -315,17 +331,32 @@ void DFT::dftNd(float* output, std::vector gatheredData(outputLen * 2); auto parallelIterationCounter = iterationCounter; parallelIterationCounter[parallelDimIndex] = dim; - gatherToBufferND(gatheredData.data(), output, currentAxis, parallelIterationCounter, outputShape, outputStrides); + gatherToBufferND(gatheredData.data(), + output, + currentAxis, + parallelIterationCounter, + outputShape, + outputStrides); const float* resultBufPtr; fft(gatheredData.data(), gatheredData.data() + outputLen, outputLen, inverse, false, &resultBufPtr); - applyBufferND(resultBufPtr, output, currentAxis, parallelIterationCounter, outputShape, outputStrides); + applyBufferND(resultBufPtr, + output, + currentAxis, + parallelIterationCounter, + outputShape, + outputStrides); }); iterationCounter[parallelDimIndex] = iterationRange[parallelDimIndex] - 1; } while (nextIterationStep(iterationCounter, iterationRange, currentAxis)); } else { std::vector gatheredData(outputLen); do { - gatherToBufferND(gatheredData.data(), output, currentAxis, iterationCounter, outputShape, outputStrides); + gatherToBufferND(gatheredData.data(), + output, + currentAxis, + iterationCounter, + outputShape, + outputStrides); naiveDFT(gatheredData.data(), outputLen, inverse); applyBufferND(gatheredData.data(), output, currentAxis, iterationCounter, outputShape, outputStrides); } while (nextIterationStep(iterationCounter, iterationRange, currentAxis)); @@ -585,6 +616,6 @@ void DFT::createJITKernels(bool hasDFT, bool hasFFT) { } #endif } -} // namespace node -} // namespace intel_cpu -} // namespace ov +} // namespace node +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/dft.h b/src/plugins/intel_cpu/src/nodes/dft.h index 
82b6ea3b33a618..eef5e2ea529066 100644 --- a/src/plugins/intel_cpu/src/nodes/dft.h +++ b/src/plugins/intel_cpu/src/nodes/dft.h @@ -63,6 +63,6 @@ class DFT : public Node { bool lastInverse; }; -} // namespace node -} // namespace intel_cpu -} // namespace ov +} // namespace node +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/eltwise.cpp b/src/plugins/intel_cpu/src/nodes/eltwise.cpp index 54cf435009059d..5daefa01eddfab 100644 --- a/src/plugins/intel_cpu/src/nodes/eltwise.cpp +++ b/src/plugins/intel_cpu/src/nodes/eltwise.cpp @@ -3,6 +3,18 @@ // #include "eltwise.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + #include "common/cpu_convert.h" #include "common/float16.hpp" #include "common/primitive_hashing_utils.hpp" @@ -10,6 +22,10 @@ #include "cpu/ref_eltwise.hpp" #include "cpu_types.h" #include "dnnl_extension_utils.h" +#include "emitters/plugin/x64/jit_bf16_emitters.hpp" +#include "emitters/plugin/x64/jit_dnnl_emitters.hpp" +#include "emitters/plugin/x64/jit_eltwise_emitters.hpp" +#include "emitters/plugin/x64/jit_emitter.hpp" #include "fake_quantize.h" #include "input.h" #include "memory_desc/dnnl_blocked_memory_desc.h" @@ -17,13 +33,13 @@ #include "onednn/dnnl.h" #include "openvino/core/except.hpp" #include "openvino/core/parallel.hpp" -#include "openvino/opsets/opset1.hpp" #include "openvino/op/bitwise_and.hpp" #include "openvino/op/bitwise_left_shift.hpp" #include "openvino/op/bitwise_not.hpp" #include "openvino/op/bitwise_or.hpp" #include "openvino/op/bitwise_right_shift.hpp" #include "openvino/op/bitwise_xor.hpp" +#include "openvino/opsets/opset1.hpp" #include "pooling.h" #include "selective_build.h" #include "shape_inference/custom/eltwise.hpp" @@ -35,27 +51,10 @@ #include "utils/general_utils.h" #include "utils/ngraph_utils.hpp" -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include - -#include "emitters/plugin/x64/jit_emitter.hpp" -#include "emitters/plugin/x64/jit_eltwise_emitters.hpp" -#include "emitters/plugin/x64/jit_dnnl_emitters.hpp" -#include "emitters/plugin/x64/jit_bf16_emitters.hpp" - #if defined(OPENVINO_ARCH_ARM64) -#include "cpu/aarch64/cpu_isa_traits.hpp" -#include "kernels/aarch64/jit_uni_eltwise_generic.hpp" -#include "executors/aarch64/jit_eltwise.hpp" +# include "cpu/aarch64/cpu_isa_traits.hpp" +# include "executors/aarch64/jit_eltwise.hpp" +# include "kernels/aarch64/jit_uni_eltwise_generic.hpp" #endif using namespace dnnl::impl::utils; @@ -92,60 +91,72 @@ bool jitIsSupported(const Node* node, beta, gamma); } -} // namespace +} // namespace #endif #if defined(OPENVINO_ARCH_X86_64) -template +template struct SupportedPrecisions { - void operator()(std::set> &precisions) { + void operator()(std::set>& precisions) { precisions = T::get_supported_precisions(); } }; struct EltwiseEmitterContext { std::shared_ptr emitter; - jit_generator *host; + jit_generator* host; cpu_isa_t host_isa; const EltwiseData& opData; ov::element::Type exec_prc; }; -template +template struct EltwiseEmitter { - void operator()(EltwiseEmitterContext & ctx) { + void operator()(EltwiseEmitterContext& ctx) { ctx.emitter = std::make_shared(ctx.host, ctx.host_isa, ctx.exec_prc); } }; -template<> +template <> struct EltwiseEmitter { - void operator()(EltwiseEmitterContext & ctx) { + void operator()(EltwiseEmitterContext& ctx) { auto algKind = static_cast(ctx.opData.onednnAlgorithm); - ctx.emitter = std::make_shared(ctx.host, ctx.host_isa, algKind, - 
ctx.opData.alpha, ctx.opData.beta, ctx.exec_prc); + ctx.emitter = std::make_shared(ctx.host, + ctx.host_isa, + algKind, + ctx.opData.alpha, + ctx.opData.beta, + ctx.exec_prc); } }; -template<> +template <> struct EltwiseEmitter { - void operator()(EltwiseEmitterContext & ctx) { - ctx.emitter = std::make_shared(ctx.host, ctx.host_isa, ctx.opData.alpha, - ctx.opData.beta, ctx.opData.gamma, ctx.exec_prc); + void operator()(EltwiseEmitterContext& ctx) { + ctx.emitter = std::make_shared(ctx.host, + ctx.host_isa, + ctx.opData.alpha, + ctx.opData.beta, + ctx.opData.gamma, + ctx.exec_prc); } }; -template<> +template <> struct EltwiseEmitter { - void operator()(EltwiseEmitterContext & ctx) { - ctx.emitter = std::make_shared(ctx.host, ctx.host_isa, ctx.exec_prc, ctx.opData.alpha, ctx.opData.beta); + void operator()(EltwiseEmitterContext& ctx) { + ctx.emitter = std::make_shared(ctx.host, + ctx.host_isa, + ctx.exec_prc, + ctx.opData.alpha, + ctx.opData.beta); } }; static void set_intersection(const std::set>& precisions1, - const std::set>& precisions2, - std::set>& intersection) { + const std::set>& precisions2, + std::set>& intersection) { std::map intersection_types; for (auto it1 = precisions1.begin(); it1 != precisions1.end(); ++it1) { @@ -195,15 +206,8 @@ ov::element::Type eltwise_precision_helper::get_precision(const size_t inputs_nu supported_precision_intersection = prcs_intersect; } - static const element::Type exec_precisions_priority[] = { - element::u8, - element::i8, - element::u16, - element::i16, - element::bf16, - element::i32, - element::f32 - }; + static const element::Type exec_precisions_priority[] = + {element::u8, element::i8, element::u16, element::i16, element::bf16, element::i32, element::f32}; for (const auto prc : exec_precisions_priority) { if (std::any_of(supported_precision_intersection.begin(), @@ -234,59 +238,62 @@ ov::element::Type eltwise_precision_helper::get_precision(const size_t inputs_nu std::set> eltwise_precision_helper::get_supported_precisions(const Algorithm& algo) { std::set> precisions; - OV_SWITCH(intel_cpu, SupportedPrecisions, precisions, algo, - OV_CASE(Algorithm::EltwiseRelu, jit_dnnl_aux_emitter), - OV_CASE(Algorithm::EltwiseGeluErf, jit_dnnl_aux_emitter), - OV_CASE(Algorithm::EltwiseGeluTanh, jit_dnnl_aux_emitter), - OV_CASE(Algorithm::EltwiseElu, jit_dnnl_aux_emitter), - OV_CASE(Algorithm::EltwiseTanh, jit_dnnl_aux_emitter), - OV_CASE(Algorithm::EltwiseSigmoid, jit_dnnl_aux_emitter), - OV_CASE(Algorithm::EltwiseAbs, jit_dnnl_aux_emitter), - OV_CASE(Algorithm::EltwiseSqrt, jit_dnnl_aux_emitter), - OV_CASE(Algorithm::EltwiseSoftRelu, jit_dnnl_aux_emitter), - OV_CASE(Algorithm::EltwiseClamp, jit_dnnl_aux_emitter), - OV_CASE(Algorithm::EltwiseSwish, jit_dnnl_aux_emitter), - OV_CASE(Algorithm::EltwiseHswish, jit_dnnl_aux_emitter), - OV_CASE(Algorithm::EltwiseMish, jit_dnnl_aux_emitter), - OV_CASE(Algorithm::EltwiseHsigmoid, jit_dnnl_aux_emitter), - OV_CASE(Algorithm::EltwiseRoundHalfToEven, jit_dnnl_aux_emitter), - OV_CASE(Algorithm::EltwiseRoundHalfAwayFromZero, jit_dnnl_aux_emitter), - OV_CASE(Algorithm::EltwiseAdd, jit_add_emitter), - OV_CASE(Algorithm::EltwiseMulAdd, jit_mul_add_emitter), - OV_CASE(Algorithm::EltwiseSubtract, jit_subtract_emitter), - OV_CASE(Algorithm::EltwiseMultiply, jit_multiply_emitter), - OV_CASE(Algorithm::EltwiseDivide, jit_divide_emitter), - OV_CASE(Algorithm::EltwiseFloor, jit_floor_emitter), - OV_CASE(Algorithm::EltwiseCeiling, jit_ceiling_emitter), - OV_CASE(Algorithm::EltwiseFloorMod, jit_floor_mod_emitter), - 
OV_CASE(Algorithm::EltwiseMod, jit_mod_emitter), - OV_CASE(Algorithm::EltwiseMaximum, jit_maximum_emitter), - OV_CASE(Algorithm::EltwiseMinimum, jit_minimum_emitter), - OV_CASE(Algorithm::EltwiseExp, jit_exp_emitter), - OV_CASE(Algorithm::EltwiseSquaredDifference, jit_squared_difference_emitter), - OV_CASE(Algorithm::EltwisePowerDynamic, jit_power_dynamic_emitter), - OV_CASE(Algorithm::EltwiseEqual, jit_equal_emitter), - OV_CASE(Algorithm::EltwiseNotEqual, jit_not_equal_emitter), - OV_CASE(Algorithm::EltwiseGreater, jit_greater_emitter), - OV_CASE(Algorithm::EltwiseGreaterEqual, jit_greater_equal_emitter), - OV_CASE(Algorithm::EltwiseLess, jit_less_emitter), - OV_CASE(Algorithm::EltwiseLessEqual, jit_less_equal_emitter), - OV_CASE(Algorithm::EltwiseLogicalAnd, jit_logical_and_emitter), - OV_CASE(Algorithm::EltwiseLogicalOr, jit_logical_or_emitter), - OV_CASE(Algorithm::EltwiseLogicalXor, jit_logical_xor_emitter), - OV_CASE(Algorithm::EltwiseLogicalNot, jit_logical_not_emitter), - OV_CASE(Algorithm::EltwisePowerStatic, jit_power_static_emitter), - OV_CASE(Algorithm::EltwisePrelu, jit_prelu_emitter), - OV_CASE(Algorithm::EltwiseErf, jit_erf_emitter), - OV_CASE(Algorithm::EltwiseSoftSign, jit_soft_sign_emitter), - OV_CASE(Algorithm::EltwiseIsFinite, jit_is_finite_emitter), - OV_CASE(Algorithm::EltwiseIsInf, jit_is_inf_emitter), - OV_CASE(Algorithm::EltwiseIsNaN, jit_is_nan_emitter), - OV_CASE(Algorithm::EltwiseSelect, jit_select_emitter), - OV_CASE(Algorithm::EltwiseBitwiseAnd, jit_bitwise_and_emitter), - OV_CASE(Algorithm::EltwiseBitwiseNot, jit_bitwise_not_emitter), - OV_CASE(Algorithm::EltwiseBitwiseOr, jit_bitwise_or_emitter), - OV_CASE(Algorithm::EltwiseBitwiseXor, jit_bitwise_xor_emitter)); + OV_SWITCH(intel_cpu, + SupportedPrecisions, + precisions, + algo, + OV_CASE(Algorithm::EltwiseRelu, jit_dnnl_aux_emitter), + OV_CASE(Algorithm::EltwiseGeluErf, jit_dnnl_aux_emitter), + OV_CASE(Algorithm::EltwiseGeluTanh, jit_dnnl_aux_emitter), + OV_CASE(Algorithm::EltwiseElu, jit_dnnl_aux_emitter), + OV_CASE(Algorithm::EltwiseTanh, jit_dnnl_aux_emitter), + OV_CASE(Algorithm::EltwiseSigmoid, jit_dnnl_aux_emitter), + OV_CASE(Algorithm::EltwiseAbs, jit_dnnl_aux_emitter), + OV_CASE(Algorithm::EltwiseSqrt, jit_dnnl_aux_emitter), + OV_CASE(Algorithm::EltwiseSoftRelu, jit_dnnl_aux_emitter), + OV_CASE(Algorithm::EltwiseClamp, jit_dnnl_aux_emitter), + OV_CASE(Algorithm::EltwiseSwish, jit_dnnl_aux_emitter), + OV_CASE(Algorithm::EltwiseHswish, jit_dnnl_aux_emitter), + OV_CASE(Algorithm::EltwiseMish, jit_dnnl_aux_emitter), + OV_CASE(Algorithm::EltwiseHsigmoid, jit_dnnl_aux_emitter), + OV_CASE(Algorithm::EltwiseRoundHalfToEven, jit_dnnl_aux_emitter), + OV_CASE(Algorithm::EltwiseRoundHalfAwayFromZero, jit_dnnl_aux_emitter), + OV_CASE(Algorithm::EltwiseAdd, jit_add_emitter), + OV_CASE(Algorithm::EltwiseMulAdd, jit_mul_add_emitter), + OV_CASE(Algorithm::EltwiseSubtract, jit_subtract_emitter), + OV_CASE(Algorithm::EltwiseMultiply, jit_multiply_emitter), + OV_CASE(Algorithm::EltwiseDivide, jit_divide_emitter), + OV_CASE(Algorithm::EltwiseFloor, jit_floor_emitter), + OV_CASE(Algorithm::EltwiseCeiling, jit_ceiling_emitter), + OV_CASE(Algorithm::EltwiseFloorMod, jit_floor_mod_emitter), + OV_CASE(Algorithm::EltwiseMod, jit_mod_emitter), + OV_CASE(Algorithm::EltwiseMaximum, jit_maximum_emitter), + OV_CASE(Algorithm::EltwiseMinimum, jit_minimum_emitter), + OV_CASE(Algorithm::EltwiseExp, jit_exp_emitter), + OV_CASE(Algorithm::EltwiseSquaredDifference, jit_squared_difference_emitter), + 
OV_CASE(Algorithm::EltwisePowerDynamic, jit_power_dynamic_emitter), + OV_CASE(Algorithm::EltwiseEqual, jit_equal_emitter), + OV_CASE(Algorithm::EltwiseNotEqual, jit_not_equal_emitter), + OV_CASE(Algorithm::EltwiseGreater, jit_greater_emitter), + OV_CASE(Algorithm::EltwiseGreaterEqual, jit_greater_equal_emitter), + OV_CASE(Algorithm::EltwiseLess, jit_less_emitter), + OV_CASE(Algorithm::EltwiseLessEqual, jit_less_equal_emitter), + OV_CASE(Algorithm::EltwiseLogicalAnd, jit_logical_and_emitter), + OV_CASE(Algorithm::EltwiseLogicalOr, jit_logical_or_emitter), + OV_CASE(Algorithm::EltwiseLogicalXor, jit_logical_xor_emitter), + OV_CASE(Algorithm::EltwiseLogicalNot, jit_logical_not_emitter), + OV_CASE(Algorithm::EltwisePowerStatic, jit_power_static_emitter), + OV_CASE(Algorithm::EltwisePrelu, jit_prelu_emitter), + OV_CASE(Algorithm::EltwiseErf, jit_erf_emitter), + OV_CASE(Algorithm::EltwiseSoftSign, jit_soft_sign_emitter), + OV_CASE(Algorithm::EltwiseIsFinite, jit_is_finite_emitter), + OV_CASE(Algorithm::EltwiseIsInf, jit_is_inf_emitter), + OV_CASE(Algorithm::EltwiseIsNaN, jit_is_nan_emitter), + OV_CASE(Algorithm::EltwiseSelect, jit_select_emitter), + OV_CASE(Algorithm::EltwiseBitwiseAnd, jit_bitwise_and_emitter), + OV_CASE(Algorithm::EltwiseBitwiseNot, jit_bitwise_not_emitter), + OV_CASE(Algorithm::EltwiseBitwiseOr, jit_bitwise_or_emitter), + OV_CASE(Algorithm::EltwiseBitwiseXor, jit_bitwise_xor_emitter)); if (precisions.empty()) OPENVINO_THROW("Unsupported operation type for Eltwise emitter"); @@ -302,7 +309,11 @@ struct jit_uni_eltwise_generic : public jit_uni_eltwise_kernel, public jit_gener const std::vector& eltwise_data, const std::vector& ops_list, const dnnl::post_ops& post_ops) - : jit_uni_eltwise_kernel(jep), jit_generator(jit_name()), eltwise_data_(eltwise_data), ops_list_(ops_list), post_ops_(post_ops) {} + : jit_uni_eltwise_kernel(jep), + jit_generator(jit_name()), + eltwise_data_(eltwise_data), + ops_list_(ops_list), + post_ops_(post_ops) {} void create_ker() override { jit_generator::create_kernel(); @@ -322,14 +333,18 @@ struct jit_uni_eltwise_generic : public jit_uni_eltwise_kernel, public jit_gener if (!p->entry_[i].is_quantization()) { OPENVINO_THROW("Eltwise jitter error. 
Unsupported post op detected"); } - quantization_injectors.push_back(std::make_shared>( - this, p->entry_[i], vmm_d_weights, vmm_d_bias, reg_d_weights, reg_d_bias)); + quantization_injectors.push_back(std::make_shared>(this, + p->entry_[i], + vmm_d_weights, + vmm_d_bias, + reg_d_weights, + reg_d_bias)); } if (mayiuse(avx512_core) || mayiuse(avx2_vnni_2)) uni_vcvtneps2bf16.reset(new jit_uni_vcvtneps2bf16(this, isa)); - const auto &jep = jep_; + const auto& jep = jep_; this->preamble(); @@ -435,7 +450,11 @@ struct jit_uni_eltwise_generic : public jit_uni_eltwise_kernel, public jit_gener for (size_t j = 0; j < min_src_size / vec_step; j++) { for (size_t i = 0; i < jep.inputs_number; i++) { if (jep.src_size[i] != 1) - load_vector(get_vmm_reg(i), ptr[get_src_reg(i) + j * vec_step * jep.src_prc[i].size()], jep.src_prc[i], exec_prc, false); + load_vector(get_vmm_reg(i), + ptr[get_src_reg(i) + j * vec_step * jep.src_prc[i].size()], + jep.src_prc[i], + exec_prc, + false); } compute_eltwise_op(); @@ -449,7 +468,10 @@ struct jit_uni_eltwise_generic : public jit_uni_eltwise_kernel, public jit_gener for (size_t j = tail_start; j < min_src_size; j++) { for (size_t i = 0; i < jep.inputs_number; i++) { if (jep.src_size[i] != 1) - load_scalar(get_xmm_reg(i), ptr[get_src_reg(i) + j * jep.src_prc[i].size()], jep.src_prc[i], exec_prc); + load_scalar(get_xmm_reg(i), + ptr[get_src_reg(i) + j * jep.src_prc[i].size()], + jep.src_prc[i], + exec_prc); } compute_eltwise_op(); @@ -571,7 +593,7 @@ struct jit_uni_eltwise_generic : public jit_uni_eltwise_kernel, public jit_gener } Reg64 reg_post_op_ptrs = rax; - Reg64 start_to_offsets = reg_post_op_ptrs; // rax + Reg64 start_to_offsets = reg_post_op_ptrs; // rax Reg64 reg_dst = rbx; Reg64 reg_work_amount = rdx; @@ -606,67 +628,64 @@ struct jit_uni_eltwise_generic : public jit_uni_eltwise_kernel, public jit_gener const dnnl::post_ops& post_ops_; std::shared_ptr create_eltwise_emitter(const EltwiseData& data, ov::element::Type exec_prec) { - EltwiseEmitterContext ctx = { - nullptr, - this, - isa, - data, - exec_prec - }; - - OV_SWITCH(intel_cpu, EltwiseEmitter, ctx, data.algo, - OV_CASE(Algorithm::EltwiseRelu, jit_dnnl_aux_emitter), - OV_CASE(Algorithm::EltwiseGeluErf, jit_dnnl_aux_emitter), - OV_CASE(Algorithm::EltwiseGeluTanh, jit_dnnl_aux_emitter), - OV_CASE(Algorithm::EltwiseElu, jit_dnnl_aux_emitter), - OV_CASE(Algorithm::EltwiseTanh, jit_dnnl_aux_emitter), - OV_CASE(Algorithm::EltwiseSigmoid, jit_dnnl_aux_emitter), - OV_CASE(Algorithm::EltwiseAbs, jit_dnnl_aux_emitter), - OV_CASE(Algorithm::EltwiseSqrt, jit_dnnl_aux_emitter), - OV_CASE(Algorithm::EltwiseSoftRelu, jit_dnnl_aux_emitter), - OV_CASE(Algorithm::EltwiseClamp, jit_dnnl_aux_emitter), - OV_CASE(Algorithm::EltwiseSwish, jit_dnnl_aux_emitter), - OV_CASE(Algorithm::EltwiseHswish, jit_dnnl_aux_emitter), - OV_CASE(Algorithm::EltwiseMish, jit_dnnl_aux_emitter), - OV_CASE(Algorithm::EltwiseHsigmoid, jit_dnnl_aux_emitter), - OV_CASE(Algorithm::EltwiseRoundHalfToEven, jit_dnnl_aux_emitter), - OV_CASE(Algorithm::EltwiseRoundHalfAwayFromZero, jit_dnnl_aux_emitter), - OV_CASE(Algorithm::EltwiseAdd, jit_add_emitter), - OV_CASE(Algorithm::EltwiseMulAdd, jit_mul_add_emitter), - OV_CASE(Algorithm::EltwiseSubtract, jit_subtract_emitter), - OV_CASE(Algorithm::EltwiseMultiply, jit_multiply_emitter), - OV_CASE(Algorithm::EltwiseDivide, jit_divide_emitter), - OV_CASE(Algorithm::EltwiseFloor, jit_floor_emitter), - OV_CASE(Algorithm::EltwiseCeiling, jit_ceiling_emitter), - OV_CASE(Algorithm::EltwiseFloorMod, 
jit_floor_mod_emitter), - OV_CASE(Algorithm::EltwiseMod, jit_mod_emitter), - OV_CASE(Algorithm::EltwiseMaximum, jit_maximum_emitter), - OV_CASE(Algorithm::EltwiseMinimum, jit_minimum_emitter), - OV_CASE(Algorithm::EltwiseExp, jit_exp_emitter), - OV_CASE(Algorithm::EltwiseSquaredDifference, jit_squared_difference_emitter), - OV_CASE(Algorithm::EltwisePowerDynamic, jit_power_dynamic_emitter), - OV_CASE(Algorithm::EltwiseEqual, jit_equal_emitter), - OV_CASE(Algorithm::EltwiseNotEqual, jit_not_equal_emitter), - OV_CASE(Algorithm::EltwiseGreater, jit_greater_emitter), - OV_CASE(Algorithm::EltwiseGreaterEqual, jit_greater_equal_emitter), - OV_CASE(Algorithm::EltwiseLess, jit_less_emitter), - OV_CASE(Algorithm::EltwiseLessEqual, jit_less_equal_emitter), - OV_CASE(Algorithm::EltwiseLogicalAnd, jit_logical_and_emitter), - OV_CASE(Algorithm::EltwiseLogicalOr, jit_logical_or_emitter), - OV_CASE(Algorithm::EltwiseLogicalXor, jit_logical_xor_emitter), - OV_CASE(Algorithm::EltwiseLogicalNot, jit_logical_not_emitter), - OV_CASE(Algorithm::EltwisePowerStatic, jit_power_static_emitter), - OV_CASE(Algorithm::EltwisePrelu, jit_prelu_emitter), - OV_CASE(Algorithm::EltwiseErf, jit_erf_emitter), - OV_CASE(Algorithm::EltwiseSoftSign, jit_soft_sign_emitter), - OV_CASE(Algorithm::EltwiseIsFinite, jit_is_finite_emitter), - OV_CASE(Algorithm::EltwiseIsInf, jit_is_inf_emitter), - OV_CASE(Algorithm::EltwiseIsNaN, jit_is_nan_emitter), - OV_CASE(Algorithm::EltwiseSelect, jit_select_emitter), - OV_CASE(Algorithm::EltwiseBitwiseAnd, jit_bitwise_and_emitter), - OV_CASE(Algorithm::EltwiseBitwiseNot, jit_bitwise_not_emitter), - OV_CASE(Algorithm::EltwiseBitwiseOr, jit_bitwise_or_emitter), - OV_CASE(Algorithm::EltwiseBitwiseXor, jit_bitwise_xor_emitter)); + EltwiseEmitterContext ctx = {nullptr, this, isa, data, exec_prec}; + + OV_SWITCH(intel_cpu, + EltwiseEmitter, + ctx, + data.algo, + OV_CASE(Algorithm::EltwiseRelu, jit_dnnl_aux_emitter), + OV_CASE(Algorithm::EltwiseGeluErf, jit_dnnl_aux_emitter), + OV_CASE(Algorithm::EltwiseGeluTanh, jit_dnnl_aux_emitter), + OV_CASE(Algorithm::EltwiseElu, jit_dnnl_aux_emitter), + OV_CASE(Algorithm::EltwiseTanh, jit_dnnl_aux_emitter), + OV_CASE(Algorithm::EltwiseSigmoid, jit_dnnl_aux_emitter), + OV_CASE(Algorithm::EltwiseAbs, jit_dnnl_aux_emitter), + OV_CASE(Algorithm::EltwiseSqrt, jit_dnnl_aux_emitter), + OV_CASE(Algorithm::EltwiseSoftRelu, jit_dnnl_aux_emitter), + OV_CASE(Algorithm::EltwiseClamp, jit_dnnl_aux_emitter), + OV_CASE(Algorithm::EltwiseSwish, jit_dnnl_aux_emitter), + OV_CASE(Algorithm::EltwiseHswish, jit_dnnl_aux_emitter), + OV_CASE(Algorithm::EltwiseMish, jit_dnnl_aux_emitter), + OV_CASE(Algorithm::EltwiseHsigmoid, jit_dnnl_aux_emitter), + OV_CASE(Algorithm::EltwiseRoundHalfToEven, jit_dnnl_aux_emitter), + OV_CASE(Algorithm::EltwiseRoundHalfAwayFromZero, jit_dnnl_aux_emitter), + OV_CASE(Algorithm::EltwiseAdd, jit_add_emitter), + OV_CASE(Algorithm::EltwiseMulAdd, jit_mul_add_emitter), + OV_CASE(Algorithm::EltwiseSubtract, jit_subtract_emitter), + OV_CASE(Algorithm::EltwiseMultiply, jit_multiply_emitter), + OV_CASE(Algorithm::EltwiseDivide, jit_divide_emitter), + OV_CASE(Algorithm::EltwiseFloor, jit_floor_emitter), + OV_CASE(Algorithm::EltwiseCeiling, jit_ceiling_emitter), + OV_CASE(Algorithm::EltwiseFloorMod, jit_floor_mod_emitter), + OV_CASE(Algorithm::EltwiseMod, jit_mod_emitter), + OV_CASE(Algorithm::EltwiseMaximum, jit_maximum_emitter), + OV_CASE(Algorithm::EltwiseMinimum, jit_minimum_emitter), + OV_CASE(Algorithm::EltwiseExp, jit_exp_emitter), + 
OV_CASE(Algorithm::EltwiseSquaredDifference, jit_squared_difference_emitter), + OV_CASE(Algorithm::EltwisePowerDynamic, jit_power_dynamic_emitter), + OV_CASE(Algorithm::EltwiseEqual, jit_equal_emitter), + OV_CASE(Algorithm::EltwiseNotEqual, jit_not_equal_emitter), + OV_CASE(Algorithm::EltwiseGreater, jit_greater_emitter), + OV_CASE(Algorithm::EltwiseGreaterEqual, jit_greater_equal_emitter), + OV_CASE(Algorithm::EltwiseLess, jit_less_emitter), + OV_CASE(Algorithm::EltwiseLessEqual, jit_less_equal_emitter), + OV_CASE(Algorithm::EltwiseLogicalAnd, jit_logical_and_emitter), + OV_CASE(Algorithm::EltwiseLogicalOr, jit_logical_or_emitter), + OV_CASE(Algorithm::EltwiseLogicalXor, jit_logical_xor_emitter), + OV_CASE(Algorithm::EltwiseLogicalNot, jit_logical_not_emitter), + OV_CASE(Algorithm::EltwisePowerStatic, jit_power_static_emitter), + OV_CASE(Algorithm::EltwisePrelu, jit_prelu_emitter), + OV_CASE(Algorithm::EltwiseErf, jit_erf_emitter), + OV_CASE(Algorithm::EltwiseSoftSign, jit_soft_sign_emitter), + OV_CASE(Algorithm::EltwiseIsFinite, jit_is_finite_emitter), + OV_CASE(Algorithm::EltwiseIsInf, jit_is_inf_emitter), + OV_CASE(Algorithm::EltwiseIsNaN, jit_is_nan_emitter), + OV_CASE(Algorithm::EltwiseSelect, jit_select_emitter), + OV_CASE(Algorithm::EltwiseBitwiseAnd, jit_bitwise_and_emitter), + OV_CASE(Algorithm::EltwiseBitwiseNot, jit_bitwise_not_emitter), + OV_CASE(Algorithm::EltwiseBitwiseOr, jit_bitwise_or_emitter), + OV_CASE(Algorithm::EltwiseBitwiseXor, jit_bitwise_xor_emitter)); if (!ctx.emitter) OPENVINO_THROW("Unsupported operation type for Eltwise emitter"); @@ -714,17 +733,31 @@ struct jit_uni_eltwise_generic : public jit_uni_eltwise_kernel, public jit_gener bool do_rounding = do_dequantization || jep_.dst_prc == ov::element::f32 || i != ops_list_.size() - 1; int s_idx = vmm_dst.getIdx(); - size_t ptrs_table_off = quantization_post_op_idx * quantization_injectors[quantization_post_op_idx]->memoryStep(); - - quantization_injectors[quantization_post_op_idx]->init_crop_ptrs(reg_post_op_ptrs + ptrs_table_off, reg_oc_off); - quantization_injectors[quantization_post_op_idx]->compute_crop(s_idx, s_idx + 1, offset, is_scalar, jep_.oc_size == 1); - - quantization_injectors[quantization_post_op_idx]->init_input_scale_shift_ptrs(reg_post_op_ptrs + ptrs_table_off, reg_oc_off); - quantization_injectors[quantization_post_op_idx]->compute_input_scale_shift(s_idx, s_idx + 1, offset, do_rounding, - is_scalar, jep_.oc_size == 1); - - quantization_injectors[quantization_post_op_idx]->init_output_scale_shift_ptrs(reg_post_op_ptrs + ptrs_table_off, reg_oc_off); - quantization_injectors[quantization_post_op_idx]->compute_output_scale_shift(s_idx, s_idx + 1, offset, is_scalar, jep_.oc_size == 1); + size_t ptrs_table_off = + quantization_post_op_idx * quantization_injectors[quantization_post_op_idx]->memoryStep(); + + quantization_injectors[quantization_post_op_idx]->init_crop_ptrs(reg_post_op_ptrs + ptrs_table_off, + reg_oc_off); + quantization_injectors[quantization_post_op_idx]->compute_crop(s_idx, + s_idx + 1, + offset, + is_scalar, + jep_.oc_size == 1); + + quantization_injectors[quantization_post_op_idx]->init_input_scale_shift_ptrs( + reg_post_op_ptrs + ptrs_table_off, + reg_oc_off); + quantization_injectors[quantization_post_op_idx] + ->compute_input_scale_shift(s_idx, s_idx + 1, offset, do_rounding, is_scalar, jep_.oc_size == 1); + + quantization_injectors[quantization_post_op_idx]->init_output_scale_shift_ptrs( + reg_post_op_ptrs + ptrs_table_off, + reg_oc_off); + 
quantization_injectors[quantization_post_op_idx]->compute_output_scale_shift(s_idx, + s_idx + 1, + offset, + is_scalar, + jep_.oc_size == 1); quantization_post_op_idx++; } else { @@ -733,7 +766,11 @@ struct jit_uni_eltwise_generic : public jit_uni_eltwise_kernel, public jit_gener } } - inline void load_vector(Vmm vmm_src, const Xbyak::Address &op, ov::element::Type src_prc, ov::element::Type dst_prc, bool broadcast) { + inline void load_vector(Vmm vmm_src, + const Xbyak::Address& op, + ov::element::Type src_prc, + ov::element::Type dst_prc, + bool broadcast) { Xmm xmm_src = Xmm(vmm_src.getIdx()); if (src_prc == dst_prc) { @@ -751,120 +788,126 @@ struct jit_uni_eltwise_generic : public jit_uni_eltwise_kernel, public jit_gener uni_vbroadcastss(vmm_src, xmm_src); } else { switch (src_prc) { - case ov::element::f32: - case ov::element::i32: - uni_vmovups(vmm_src, op); - break; - case ov::element::bf16: - vpmovzxwd(vmm_src, op); - uni_vpslld(vmm_src, vmm_src, 16); - break; - case ov::element::f16: - vcvtph2ps(vmm_src, op); - break; - case ov::element::u16: - uni_vpmovzxwd(vmm_src, op); - break; - case ov::element::i16: - uni_vpmovsxwd(vmm_src, op); - break; - case ov::element::i8: - uni_vpmovsxbd(vmm_src, op); - break; - case ov::element::u8: - uni_vpmovzxbd(vmm_src, op); - break; - default: - OPENVINO_THROW("unknown src_prc"); - } - - switch (dst_prc) { - case ov::element::f32: - if (!src_prc.is_real()) - uni_vcvtdq2ps(vmm_src, vmm_src); - break; - case ov::element::i32: - if (src_prc.is_real()) - uni_vcvtps2dq(vmm_src, vmm_src); - break; - default: - OPENVINO_THROW("unknown dst_prc"); - } - } - } - - inline void load_scalar(Xmm xmm_src, const Xbyak::Address &op, ov::element::Type src_prc, ov::element::Type dst_prc) { - if (src_prc == dst_prc) { - switch (src_prc.size()) { - case 4: - uni_vmovss(xmm_src, op); - break; - case 1: - mov(reg_tmp_8, op); - movzx(reg_tmp_32, reg_tmp_8); - uni_vmovd(xmm_src, reg_tmp_32); - break; - default: - OPENVINO_THROW("unknown prc"); - } - return; - } - - switch (src_prc) { case ov::element::f32: case ov::element::i32: - uni_vmovss(xmm_src, op); + uni_vmovups(vmm_src, op); break; case ov::element::bf16: - if (isa == x64::avx2_vnni_2) { - vbcstnebf162ps(xmm_src, op); - } else { - uni_vpinsrw(xmm_src, xmm_src, op, 0); - uni_vpslld(xmm_src, xmm_src, 16); - } + vpmovzxwd(vmm_src, op); + uni_vpslld(vmm_src, vmm_src, 16); break; case ov::element::f16: - if (isa == x64::avx2_vnni_2) { - vbcstnesh2ps(xmm_src, op); - } else { - vcvtph2ps(xmm_src, op); - } - break; - case ov::element::i16: - uni_vpinsrw(xmm_src, xmm_src, op, 0); - uni_vpmovsxwd(xmm_src, op); + vcvtph2ps(vmm_src, op); break; case ov::element::u16: - uni_vpinsrw(xmm_src, xmm_src, op, 0); - uni_vpmovzxwd(xmm_src, op); + uni_vpmovzxwd(vmm_src, op); + break; + case ov::element::i16: + uni_vpmovsxwd(vmm_src, op); break; case ov::element::i8: - movsx(reg_tmp_32, op); - uni_vmovq(xmm_src, reg_tmp_64); + uni_vpmovsxbd(vmm_src, op); break; case ov::element::u8: - movzx(reg_tmp_32, op); - uni_vmovq(xmm_src, reg_tmp_64); + uni_vpmovzxbd(vmm_src, op); break; default: OPENVINO_THROW("unknown src_prc"); - } + } - switch (dst_prc) { + switch (dst_prc) { case ov::element::f32: if (!src_prc.is_real()) - uni_vcvtdq2ps(xmm_src, xmm_src); + uni_vcvtdq2ps(vmm_src, vmm_src); break; case ov::element::i32: if (src_prc.is_real()) - uni_vcvtps2dq(xmm_src, xmm_src); + uni_vcvtps2dq(vmm_src, vmm_src); break; default: OPENVINO_THROW("unknown dst_prc"); + } + } + } + + inline void load_scalar(Xmm xmm_src, + const Xbyak::Address& op, + 
ov::element::Type src_prc, + ov::element::Type dst_prc) { + if (src_prc == dst_prc) { + switch (src_prc.size()) { + case 4: + uni_vmovss(xmm_src, op); + break; + case 1: + mov(reg_tmp_8, op); + movzx(reg_tmp_32, reg_tmp_8); + uni_vmovd(xmm_src, reg_tmp_32); + break; + default: + OPENVINO_THROW("unknown prc"); + } + return; + } + + switch (src_prc) { + case ov::element::f32: + case ov::element::i32: + uni_vmovss(xmm_src, op); + break; + case ov::element::bf16: + if (isa == x64::avx2_vnni_2) { + vbcstnebf162ps(xmm_src, op); + } else { + uni_vpinsrw(xmm_src, xmm_src, op, 0); + uni_vpslld(xmm_src, xmm_src, 16); + } + break; + case ov::element::f16: + if (isa == x64::avx2_vnni_2) { + vbcstnesh2ps(xmm_src, op); + } else { + vcvtph2ps(xmm_src, op); + } + break; + case ov::element::i16: + uni_vpinsrw(xmm_src, xmm_src, op, 0); + uni_vpmovsxwd(xmm_src, op); + break; + case ov::element::u16: + uni_vpinsrw(xmm_src, xmm_src, op, 0); + uni_vpmovzxwd(xmm_src, op); + break; + case ov::element::i8: + movsx(reg_tmp_32, op); + uni_vmovq(xmm_src, reg_tmp_64); + break; + case ov::element::u8: + movzx(reg_tmp_32, op); + uni_vmovq(xmm_src, reg_tmp_64); + break; + default: + OPENVINO_THROW("unknown src_prc"); + } + + switch (dst_prc) { + case ov::element::f32: + if (!src_prc.is_real()) + uni_vcvtdq2ps(xmm_src, xmm_src); + break; + case ov::element::i32: + if (src_prc.is_real()) + uni_vcvtps2dq(xmm_src, xmm_src); + break; + default: + OPENVINO_THROW("unknown dst_prc"); } } - inline void store_vector(const Xbyak::Address &op, Vmm vmm_dst, ov::element::Type src_prc, ov::element::Type dst_prc) { + inline void store_vector(const Xbyak::Address& op, + Vmm vmm_dst, + ov::element::Type src_prc, + ov::element::Type dst_prc) { Xmm xmm_dst = Xmm(vmm_dst.getIdx()); Ymm ymm_dst = Ymm(vmm_dst.getIdx()); @@ -874,170 +917,173 @@ struct jit_uni_eltwise_generic : public jit_uni_eltwise_kernel, public jit_gener } switch (src_prc) { - case ov::element::f32: - if (!dst_prc.is_real()) - uni_vcvtps2dq(vmm_dst, vmm_dst); - break; - case ov::element::i32: - if (dst_prc.is_real()) - uni_vcvtdq2ps(vmm_dst, vmm_dst); - break; - default: - OPENVINO_THROW("unknown src_prc"); + case ov::element::f32: + if (!dst_prc.is_real()) + uni_vcvtps2dq(vmm_dst, vmm_dst); + break; + case ov::element::i32: + if (dst_prc.is_real()) + uni_vcvtdq2ps(vmm_dst, vmm_dst); + break; + default: + OPENVINO_THROW("unknown src_prc"); } switch (dst_prc) { - case ov::element::f32: - case ov::element::i32: - uni_vmovups(op, vmm_dst); - break; - case ov::element::bf16: - if (isa == x64::avx512_core) { - uni_vcvtneps2bf16->emit_code({static_cast(vmm_dst.getIdx())}, - {static_cast(ymm_dst.getIdx())}); - vmovdqu16(op, ymm_dst); - } else { - uni_vcvtneps2bf16->emit_code({static_cast(vmm_dst.getIdx())}, - {static_cast(xmm_dst.getIdx())}); + case ov::element::f32: + case ov::element::i32: + uni_vmovups(op, vmm_dst); + break; + case ov::element::bf16: + if (isa == x64::avx512_core) { + uni_vcvtneps2bf16->emit_code({static_cast(vmm_dst.getIdx())}, + {static_cast(ymm_dst.getIdx())}); + vmovdqu16(op, ymm_dst); + } else { + uni_vcvtneps2bf16->emit_code({static_cast(vmm_dst.getIdx())}, + {static_cast(xmm_dst.getIdx())}); + uni_vmovdqu(op, xmm_dst); + } + break; + case ov::element::f16: + vcvtps2ph(op, vmm_dst, 0x4); + break; + case ov::element::i16: + if (isa == x64::avx512_core) { + vpmovsdw(op, vmm_dst); + } else { + uni_vpackssdw(vmm_dst, vmm_dst, vmm_dst); + if (isa != x64::sse41) { + vpermq(ymm_dst, ymm_dst, 0x08); uni_vmovdqu(op, xmm_dst); - } - break; - case ov::element::f16: - 
vcvtps2ph(op, vmm_dst, 0x4); - break; - case ov::element::i16: - if (isa == x64::avx512_core) { - vpmovsdw(op, vmm_dst); } else { - uni_vpackssdw(vmm_dst, vmm_dst, vmm_dst); - if (isa != x64::sse41) { - vpermq(ymm_dst, ymm_dst, 0x08); - uni_vmovdqu(op, xmm_dst); - } else { - movq(op, xmm_dst); - } + movq(op, xmm_dst); } - break; - case ov::element::u16: - if (isa == x64::avx512_core) { - vpmaxsd(vmm_dst, vmm_zero, vmm_dst); - vpmovusdw(op, vmm_dst); - } else { - uni_vpackusdw(vmm_dst, vmm_dst, vmm_dst); - if (isa != x64::sse41) { - vpermq(ymm_dst, ymm_dst, 0x08); - uni_vmovdqu(op, xmm_dst); - } else { - movq(op, xmm_dst); - } - } - break; - case ov::element::i8: - if (isa == x64::avx512_core) { - vpmovsdb(op, vmm_dst); - } else { - uni_vpackssdw(vmm_dst, vmm_dst, vmm_dst); - if (isa != x64::sse41) - vpermq(ymm_dst, ymm_dst, 0x08); - uni_vpacksswb(vmm_dst, vmm_dst, vmm_dst); - if (isa != x64::sse41) - vmovq(op, xmm_dst); - else - movd(op, xmm_dst); - } - break; - case ov::element::u8: - if (isa == x64::avx512_core) { - vpmaxsd(vmm_dst, vmm_zero, vmm_dst); - vpmovusdb(op, vmm_dst); + } + break; + case ov::element::u16: + if (isa == x64::avx512_core) { + vpmaxsd(vmm_dst, vmm_zero, vmm_dst); + vpmovusdw(op, vmm_dst); + } else { + uni_vpackusdw(vmm_dst, vmm_dst, vmm_dst); + if (isa != x64::sse41) { + vpermq(ymm_dst, ymm_dst, 0x08); + uni_vmovdqu(op, xmm_dst); } else { - uni_vpackusdw(vmm_dst, vmm_dst, vmm_dst); - if (isa != x64::sse41) - vpermq(ymm_dst, ymm_dst, 0x08); - uni_vpackuswb(vmm_dst, vmm_dst, vmm_dst); - if (isa != x64::sse41) - vmovq(op, xmm_dst); - else - movd(op, xmm_dst); + movq(op, xmm_dst); } - break; - default: - OPENVINO_THROW("unknown dst_prc"); + } + break; + case ov::element::i8: + if (isa == x64::avx512_core) { + vpmovsdb(op, vmm_dst); + } else { + uni_vpackssdw(vmm_dst, vmm_dst, vmm_dst); + if (isa != x64::sse41) + vpermq(ymm_dst, ymm_dst, 0x08); + uni_vpacksswb(vmm_dst, vmm_dst, vmm_dst); + if (isa != x64::sse41) + vmovq(op, xmm_dst); + else + movd(op, xmm_dst); + } + break; + case ov::element::u8: + if (isa == x64::avx512_core) { + vpmaxsd(vmm_dst, vmm_zero, vmm_dst); + vpmovusdb(op, vmm_dst); + } else { + uni_vpackusdw(vmm_dst, vmm_dst, vmm_dst); + if (isa != x64::sse41) + vpermq(ymm_dst, ymm_dst, 0x08); + uni_vpackuswb(vmm_dst, vmm_dst, vmm_dst); + if (isa != x64::sse41) + vmovq(op, xmm_dst); + else + movd(op, xmm_dst); + } + break; + default: + OPENVINO_THROW("unknown dst_prc"); } } - inline void store_scalar(const Xbyak::Address &op, Xmm xmm_dst, ov::element::Type src_prc, ov::element::Type dst_prc) { + inline void store_scalar(const Xbyak::Address& op, + Xmm xmm_dst, + ov::element::Type src_prc, + ov::element::Type dst_prc) { if (src_prc == dst_prc) { switch (src_prc.size()) { - case 4: - uni_vmovss(op, xmm_dst); - break; - case 1: - movq(reg_tmp_64, xmm_dst); - mov(op, reg_tmp_8); - break; - default: - OPENVINO_THROW("unknown prc"); + case 4: + uni_vmovss(op, xmm_dst); + break; + case 1: + movq(reg_tmp_64, xmm_dst); + mov(op, reg_tmp_8); + break; + default: + OPENVINO_THROW("unknown prc"); } return; } switch (src_prc) { - case ov::element::f32: - if (!dst_prc.is_real()) - uni_vcvtps2dq(xmm_dst, xmm_dst); - break; - case ov::element::i32: - if (dst_prc.is_real()) - uni_vcvtdq2ps(xmm_dst, xmm_dst); - break; - default: - OPENVINO_THROW("unknown src_prc"); + case ov::element::f32: + if (!dst_prc.is_real()) + uni_vcvtps2dq(xmm_dst, xmm_dst); + break; + case ov::element::i32: + if (dst_prc.is_real()) + uni_vcvtdq2ps(xmm_dst, xmm_dst); + break; + default: + 
OPENVINO_THROW("unknown src_prc"); } switch (dst_prc) { - case ov::element::f32: - case ov::element::i32: - uni_vmovss(op, xmm_dst); - break; - case ov::element::bf16: - uni_vpsrld(xmm_dst, xmm_dst, 16); - uni_vpextrw(op, xmm_dst, 0x0); - break; - case ov::element::f16: - vcvtps2ph(xmm_dst, xmm_dst, 0x4); - movq(reg_tmp_64, xmm_dst); - mov(op, reg_tmp_16); - break; - case ov::element::i16: - uni_vpackssdw(xmm_dst, xmm_dst, xmm_dst); - movq(reg_tmp_64, xmm_dst); - mov(op, reg_tmp_16); - break; - case ov::element::u16: - uni_vpackusdw(xmm_dst, xmm_dst, xmm_dst); - movq(reg_tmp_64, xmm_dst); - mov(op, reg_tmp_16); - break; - case ov::element::i8: - uni_vpackssdw(xmm_dst, xmm_dst, xmm_dst); - uni_vpacksswb(xmm_dst, xmm_dst, xmm_dst); - movq(reg_tmp_64, xmm_dst); - mov(op, reg_tmp_8); - break; - case ov::element::u8: - uni_vpackusdw(xmm_dst, xmm_dst, xmm_dst); - uni_vpackuswb(xmm_dst, xmm_dst, xmm_dst); - movq(reg_tmp_64, xmm_dst); - mov(op, reg_tmp_8); - break; - default: - OPENVINO_THROW("unknown dst_prc"); + case ov::element::f32: + case ov::element::i32: + uni_vmovss(op, xmm_dst); + break; + case ov::element::bf16: + uni_vpsrld(xmm_dst, xmm_dst, 16); + uni_vpextrw(op, xmm_dst, 0x0); + break; + case ov::element::f16: + vcvtps2ph(xmm_dst, xmm_dst, 0x4); + movq(reg_tmp_64, xmm_dst); + mov(op, reg_tmp_16); + break; + case ov::element::i16: + uni_vpackssdw(xmm_dst, xmm_dst, xmm_dst); + movq(reg_tmp_64, xmm_dst); + mov(op, reg_tmp_16); + break; + case ov::element::u16: + uni_vpackusdw(xmm_dst, xmm_dst, xmm_dst); + movq(reg_tmp_64, xmm_dst); + mov(op, reg_tmp_16); + break; + case ov::element::i8: + uni_vpackssdw(xmm_dst, xmm_dst, xmm_dst); + uni_vpacksswb(xmm_dst, xmm_dst, xmm_dst); + movq(reg_tmp_64, xmm_dst); + mov(op, reg_tmp_8); + break; + case ov::element::u8: + uni_vpackusdw(xmm_dst, xmm_dst, xmm_dst); + uni_vpackuswb(xmm_dst, xmm_dst, xmm_dst); + movq(reg_tmp_64, xmm_dst); + mov(op, reg_tmp_8); + break; + default: + OPENVINO_THROW("unknown dst_prc"); } } }; -#endif // OPENVINO_ARCH_X86_64 +#endif // OPENVINO_ARCH_X86_64 Eltwise::BroadcastingPolicy Eltwise::determineBroadcastingPolicy(const std::shared_ptr& op) { const auto const1 = ov::as_type_ptr(op->get_input_node_shared_ptr(0)); @@ -1297,7 +1343,6 @@ const std::map& Eltwise::getIn return initializers; } - namespace { struct EltwiseKey { @@ -1353,12 +1398,8 @@ struct EltwiseKey { return false; } - bool result = eltwise_data == rhs.eltwise_data && - ops_list == rhs.ops_list && - inpPrc == rhs.inpPrc && - outPrc == rhs.outPrc && - *postOps.get() == *rhs.postOps.get() && - implType == rhs.implType; + bool result = eltwise_data == rhs.eltwise_data && ops_list == rhs.ops_list && inpPrc == rhs.inpPrc && + outPrc == rhs.outPrc && *postOps.get() == *rhs.postOps.get() && implType == rhs.implType; if (result) { if (implType == EltwiseImplType::optimizedShapeAgnostic) { @@ -1370,8 +1411,7 @@ struct EltwiseKey { return false; } } else { - result = result && outOrder == rhs.outOrder && - outBlkDims == rhs.outBlkDims; + result = result && outOrder == rhs.outOrder && outBlkDims == rhs.outBlkDims; for (size_t i = 0; i < inpDims.size() && result; ++i) { result = result && (inpDims[i] == rhs.inpDims[i]); } @@ -1426,7 +1466,8 @@ class EltwiseJitExecutor : public Eltwise::IEltwiseExecutor { auto collapseLastOffsets = [](std::vector& dims, int dimsToCollapse) { for (size_t i = dims.size() - 2; i > dims.size() - dimsToCollapse - 2; i--) { if (dims[dims.size() - 1] > 0 || dims[i] > 0) - dims[dims.size() - 1] = std::max(dims[dims.size() - 1], static_cast(1)) * 
std::max(dims[i], static_cast(1)); + dims[dims.size() - 1] = std::max(dims[dims.size() - 1], static_cast(1)) * + std::max(dims[i], static_cast(1)); else dims[dims.size() - 1] *= dims[i]; } @@ -1442,8 +1483,10 @@ class EltwiseJitExecutor : public Eltwise::IEltwiseExecutor { auto isFusedWith = [&](Type type_) { auto start_itr = ops_list.begin(); - std::advance(start_itr, 1); // apply offset since the first op in the list is the op itself - return any_of(start_itr, ops_list.end(), [=](Type type) { return type == type_; }); + std::advance(start_itr, 1); // apply offset since the first op in the list is the op itself + return any_of(start_itr, ops_list.end(), [=](Type type) { + return type == type_; + }); }; if (inpDims.empty()) { @@ -1493,7 +1536,8 @@ class EltwiseJitExecutor : public Eltwise::IEltwiseExecutor { int oc_dim_idx = i + (jep.input_size - outOrder.size()); jep.oc_offsets[oc_dim_idx] = offset_oc; offset_oc *= jep.dims[oc_dim_idx]; - if (oc_dim_idx + 1 != static_cast(jep.input_size)) { // since in nspc case we can safely collapse the last axis + if (oc_dim_idx + 1 != + static_cast(jep.input_size)) { // since in nspc case we can safely collapse the last axis lastUnchangedAxis = oc_dim_idx; } } @@ -1514,7 +1558,8 @@ class EltwiseJitExecutor : public Eltwise::IEltwiseExecutor { int collapsedDims = 0; bool hasDifferentDims = false; - while (!useRuntimePtrs && currentJitWorkAmount < minimalJitWorkAmount && currentJitWorkAmount < fullWorkAmount) { + while (!useRuntimePtrs && currentJitWorkAmount < minimalJitWorkAmount && + currentJitWorkAmount < fullWorkAmount) { if (collapsedDims >= maxCollapsedDims) break; @@ -1595,8 +1640,9 @@ class EltwiseJitExecutor : public Eltwise::IEltwiseExecutor { jep.work_amount = jep.dst_size = jep.dims.back(); jep.oc_size = oc_size; - std::transform(jep.oc_offsets.begin(), jep.oc_offsets.end(), jep.oc_offsets.begin(), - [](size_t& offset) { return offset * sizeof(float);}); + std::transform(jep.oc_offsets.begin(), jep.oc_offsets.end(), jep.oc_offsets.begin(), [](size_t& offset) { + return offset * sizeof(float); + }); #if defined(OPENVINO_ARCH_X86_64) if (mayiuse(x64::avx512_core)) { @@ -1608,7 +1654,7 @@ class EltwiseJitExecutor : public Eltwise::IEltwiseExecutor { } else { OPENVINO_THROW("Can't create jit eltwise kernel"); } -#endif // OPENVINO_ARCH_X86_64 +#endif // OPENVINO_ARCH_X86_64 #if defined(OPENVINO_ARCH_ARM64) if (mayiuse(aarch64::asimd)) { @@ -1616,28 +1662,28 @@ class EltwiseJitExecutor : public Eltwise::IEltwiseExecutor { } else { OPENVINO_THROW("Can't create jit eltwise kernel"); } -#endif // OPENVINO_ARCH_ARM64 +#endif // OPENVINO_ARCH_ARM64 if (_pKernel) _pKernel->create_ker(); } - void exec(const jit_eltwise_call_args_ptrs &args_ptrs, const VectorDims &dims_out) override { + void exec(const jit_eltwise_call_args_ptrs& args_ptrs, const VectorDims& dims_out) override { if (!_pKernel) OPENVINO_THROW("Can't execute, kernel for eltwise node is not compiled"); if (_pKernel->jep_.input_size == optimalTensorRank) { // execute Optimized 6D auto d6_loop = [&](size_t i0, size_t i1, size_t i2, size_t i3, size_t i4) { - auto args = jit_eltwise_call_args_indexes(); - args.indexes[0] = i0; - args.indexes[1] = i1; - args.indexes[2] = i2; - args.indexes[3] = i3; - args.indexes[4] = i4; + auto args = jit_eltwise_call_args_indexes(); + args.indexes[0] = i0; + args.indexes[1] = i1; + args.indexes[2] = i2; + args.indexes[3] = i3; + args.indexes[4] = i4; - (*_pKernel)(&args_ptrs, &args); - }; + (*_pKernel)(&args_ptrs, &args); + }; 
parallel_nt_static(m_threads_num, [&](const int ithr, const int nthr) { for_5d(ithr, nthr, dims_out[0], dims_out[1], dims_out[2], dims_out[3], dims_out[4], d6_loop); @@ -1693,13 +1739,14 @@ class EltwiseJitExecutor : public Eltwise::IEltwiseExecutor { /* enabled only for float at float16_t at the moment * can be extended in the future */ -template +template class EltwiseRefBaseExecutor : public Eltwise::IEltwiseExecutor { public: EltwiseRefBaseExecutor(const EltwiseData& opData, const VectorDims& outBlkDims, const std::vector& inpDims) - : _opData(std::move(opData)), _inpDims(inpDims) { + : _opData(std::move(opData)), + _inpDims(inpDims) { if (inpDims.empty()) { OPENVINO_THROW("Can not make Eltwise executor from empty input dims array"); } else if (inpDims.front().empty()) { @@ -1750,18 +1797,18 @@ class EltwiseRefBaseExecutor : public Eltwise::IEltwiseExecutor { protected: void init_ptr(const jit_eltwise_call_args_ptrs& args_ptrs, - const VectorDims& dims_out, - std::vector& counters, - const size_t iwork, - std::vector& src_f, - T*& dst_ptr_f) { + const VectorDims& dims_out, + std::vector& counters, + const size_t iwork, + std::vector& src_f, + T*& dst_ptr_f) { size_t tmp = iwork; for (ptrdiff_t j = dims_out.size() - 1; j >= 0; j--) { counters[j] = tmp % dims_out[j]; tmp /= dims_out[j]; } - size_t index_in[MAX_ELTWISE_INPUTS] = { 0 }; + size_t index_in[MAX_ELTWISE_INPUTS] = {0}; for (size_t i = 0; i < _inputNum; i++) { index_in[i] = 0; for (size_t j = 0; j < counters.size(); j++) { @@ -1776,7 +1823,7 @@ class EltwiseRefBaseExecutor : public Eltwise::IEltwiseExecutor { } index_out /= sizeof(T); - //std::vector src_f(_inputNum); + // std::vector src_f(_inputNum); for (size_t i = 0; i < _inputNum; i++) { src_f[i] = (reinterpret_cast(args_ptrs.src_ptr[i]) + index_in[i])[0]; } @@ -1795,19 +1842,15 @@ class EltwiseRefBaseExecutor : public Eltwise::IEltwiseExecutor { /* enabled only for float at float16_t at the moment * can be extended in the future */ -template::value || - std::is_same::value> - ::type * = nullptr> +template ::value || + std::is_same::value>::type* = nullptr> class EltwiseRefExecutor : public EltwiseRefBaseExecutor { public: - EltwiseRefExecutor(const EltwiseData& opData, - const VectorDims& outBlkDims, - std::vector inpDims) : EltwiseRefBaseExecutor(opData, outBlkDims, inpDims) { - } + EltwiseRefExecutor(const EltwiseData& opData, const VectorDims& outBlkDims, std::vector inpDims) + : EltwiseRefBaseExecutor(opData, outBlkDims, inpDims) {} - void exec(const jit_eltwise_call_args_ptrs &args_ptrs, const VectorDims &dims_out) override { + void exec(const jit_eltwise_call_args_ptrs& args_ptrs, const VectorDims& dims_out) override { if (this->_opData.algo == Algorithm::EltwiseLog) { const T* src_ptr_f = reinterpret_cast(args_ptrs.src_ptr[0]); T* dst_ptr_f = reinterpret_cast(args_ptrs.dst_ptr); @@ -1857,8 +1900,11 @@ class EltwiseRefExecutor : public EltwiseRefBaseExecutor { std::shared_ptr ref_eltwise_injector = nullptr; if (this->_opData.onednnAlgorithm != dnnl::algorithm::undef) { - ref_eltwise_injector = std::make_shared( - static_cast(this->_opData.onednnAlgorithm), this->_opData.alpha, this->_opData.beta, 1.f); + ref_eltwise_injector = + std::make_shared(static_cast(this->_opData.onednnAlgorithm), + this->_opData.alpha, + this->_opData.beta, + 1.f); } parallel_nt(0, [&](const int ithr, const int nthr) { @@ -1873,86 +1919,144 @@ class EltwiseRefExecutor : public EltwiseRefBaseExecutor { this->init_ptr(args_ptrs, dims_out, counters, iwork, src_f, dst_ptr_f); switch 
(this->_opData.algo) { - case Algorithm::EltwiseRelu: - case Algorithm::EltwiseGeluErf: - case Algorithm::EltwiseGeluTanh: - case Algorithm::EltwiseElu: - case Algorithm::EltwiseTanh: - case Algorithm::EltwiseSigmoid: - case Algorithm::EltwiseAbs: - case Algorithm::EltwiseSqrt: - case Algorithm::EltwiseSoftRelu: - case Algorithm::EltwiseClamp: - case Algorithm::EltwiseSwish: - case Algorithm::EltwiseHswish: - case Algorithm::EltwiseMish: - case Algorithm::EltwiseHsigmoid: - case Algorithm::EltwiseRoundHalfToEven: - case Algorithm::EltwiseRoundHalfAwayFromZero: - *dst_ptr_f = ref_eltwise_injector->compute_scalar(src_f[0]); - break; - case Algorithm::EltwiseAdd: *dst_ptr_f = src_f[0] + src_f[1]; break; - case Algorithm::EltwiseMulAdd: *dst_ptr_f = src_f[0] * src_f[1] + src_f[2]; break; - case Algorithm::EltwiseSubtract: *dst_ptr_f = src_f[0] - src_f[1]; break; - case Algorithm::EltwiseMultiply: *dst_ptr_f = src_f[0] * src_f[1]; break; - case Algorithm::EltwiseDivide: *dst_ptr_f = src_f[0] / src_f[1]; break; - case Algorithm::EltwiseCeiling: *dst_ptr_f = ceilf(src_f[0]); break; - case Algorithm::EltwiseFloor: *dst_ptr_f = floorf(src_f[0]); break; - case Algorithm::EltwiseFloorMod: *dst_ptr_f = src_f[0] - floorf(src_f[0] / src_f[1]) * src_f[1]; break; - case Algorithm::EltwiseMod: *dst_ptr_f = src_f[0] - truncf(src_f[0] / src_f[1]) * src_f[1]; break; - case Algorithm::EltwiseMaximum: *dst_ptr_f = std::max(src_f[0], src_f[1]); break; - case Algorithm::EltwiseMinimum: *dst_ptr_f = std::min(src_f[0], src_f[1]); break; - case Algorithm::EltwiseExp: *dst_ptr_f = expf(src_f[0]); break; - case Algorithm::EltwiseSquaredDifference: *dst_ptr_f = powf((src_f[0] - src_f[1]), 2.f); break; - case Algorithm::EltwisePowerDynamic: *dst_ptr_f = powf(src_f[0], src_f[1]); break; - case Algorithm::EltwiseEqual: *dst_ptr_f = src_f[0] == src_f[1]; break; - case Algorithm::EltwiseNotEqual: *dst_ptr_f = src_f[0] != src_f[1]; break; - case Algorithm::EltwiseGreater: *dst_ptr_f = src_f[0] > src_f[1]; break; - case Algorithm::EltwiseGreaterEqual: *dst_ptr_f = src_f[0] >= src_f[1]; break; - case Algorithm::EltwiseLess: *dst_ptr_f = src_f[0] < src_f[1]; break; - case Algorithm::EltwiseLessEqual: *dst_ptr_f = src_f[0] <= src_f[1]; break; - case Algorithm::EltwiseLogicalAnd: *dst_ptr_f = src_f[0] && src_f[1]; break; - case Algorithm::EltwiseLogicalOr: *dst_ptr_f = src_f[0] || src_f[1]; break; - case Algorithm::EltwiseLogicalXor: *dst_ptr_f = (src_f[0] || src_f[1]) - (src_f[0] && src_f[1]); break; - case Algorithm::EltwiseLogicalNot: *dst_ptr_f = !src_f[0]; break; - case Algorithm::EltwisePrelu: *dst_ptr_f = src_f[0] > 0 ? src_f[0] : static_cast(src_f[0] * src_f[1]); break; - case Algorithm::EltwiseErf: *dst_ptr_f = std::erf(src_f[0]); break; - case Algorithm::EltwiseSoftSign: *dst_ptr_f = src_f[0] / (1 + std::fabs(src_f[0])); break; - // @todo implement proper isinfinite for non-float precisions - case Algorithm::EltwiseIsFinite: *dst_ptr_f = std::isfinite(static_cast(src_f[0])); break; - case Algorithm::EltwiseIsInf: - *dst_ptr_f = (this->_opData.alpha && (src_f[0] == -std::numeric_limits::infinity())) || - (this->_opData.beta && (src_f[0] == std::numeric_limits::infinity())); - break; - case Algorithm::EltwiseIsNaN: *dst_ptr_f = std::isnan(src_f[0]); break; - case Algorithm::EltwiseSelect: *dst_ptr_f = src_f[0] ? 
src_f[1] : src_f[2]; break; - default: OPENVINO_THROW("Unsupported operation type for Eltwise executor"); + case Algorithm::EltwiseRelu: + case Algorithm::EltwiseGeluErf: + case Algorithm::EltwiseGeluTanh: + case Algorithm::EltwiseElu: + case Algorithm::EltwiseTanh: + case Algorithm::EltwiseSigmoid: + case Algorithm::EltwiseAbs: + case Algorithm::EltwiseSqrt: + case Algorithm::EltwiseSoftRelu: + case Algorithm::EltwiseClamp: + case Algorithm::EltwiseSwish: + case Algorithm::EltwiseHswish: + case Algorithm::EltwiseMish: + case Algorithm::EltwiseHsigmoid: + case Algorithm::EltwiseRoundHalfToEven: + case Algorithm::EltwiseRoundHalfAwayFromZero: + *dst_ptr_f = ref_eltwise_injector->compute_scalar(src_f[0]); + break; + case Algorithm::EltwiseAdd: + *dst_ptr_f = src_f[0] + src_f[1]; + break; + case Algorithm::EltwiseMulAdd: + *dst_ptr_f = src_f[0] * src_f[1] + src_f[2]; + break; + case Algorithm::EltwiseSubtract: + *dst_ptr_f = src_f[0] - src_f[1]; + break; + case Algorithm::EltwiseMultiply: + *dst_ptr_f = src_f[0] * src_f[1]; + break; + case Algorithm::EltwiseDivide: + *dst_ptr_f = src_f[0] / src_f[1]; + break; + case Algorithm::EltwiseCeiling: + *dst_ptr_f = ceilf(src_f[0]); + break; + case Algorithm::EltwiseFloor: + *dst_ptr_f = floorf(src_f[0]); + break; + case Algorithm::EltwiseFloorMod: + *dst_ptr_f = src_f[0] - floorf(src_f[0] / src_f[1]) * src_f[1]; + break; + case Algorithm::EltwiseMod: + *dst_ptr_f = src_f[0] - truncf(src_f[0] / src_f[1]) * src_f[1]; + break; + case Algorithm::EltwiseMaximum: + *dst_ptr_f = std::max(src_f[0], src_f[1]); + break; + case Algorithm::EltwiseMinimum: + *dst_ptr_f = std::min(src_f[0], src_f[1]); + break; + case Algorithm::EltwiseExp: + *dst_ptr_f = expf(src_f[0]); + break; + case Algorithm::EltwiseSquaredDifference: + *dst_ptr_f = powf((src_f[0] - src_f[1]), 2.f); + break; + case Algorithm::EltwisePowerDynamic: + *dst_ptr_f = powf(src_f[0], src_f[1]); + break; + case Algorithm::EltwiseEqual: + *dst_ptr_f = src_f[0] == src_f[1]; + break; + case Algorithm::EltwiseNotEqual: + *dst_ptr_f = src_f[0] != src_f[1]; + break; + case Algorithm::EltwiseGreater: + *dst_ptr_f = src_f[0] > src_f[1]; + break; + case Algorithm::EltwiseGreaterEqual: + *dst_ptr_f = src_f[0] >= src_f[1]; + break; + case Algorithm::EltwiseLess: + *dst_ptr_f = src_f[0] < src_f[1]; + break; + case Algorithm::EltwiseLessEqual: + *dst_ptr_f = src_f[0] <= src_f[1]; + break; + case Algorithm::EltwiseLogicalAnd: + *dst_ptr_f = src_f[0] && src_f[1]; + break; + case Algorithm::EltwiseLogicalOr: + *dst_ptr_f = src_f[0] || src_f[1]; + break; + case Algorithm::EltwiseLogicalXor: + *dst_ptr_f = (src_f[0] || src_f[1]) - (src_f[0] && src_f[1]); + break; + case Algorithm::EltwiseLogicalNot: + *dst_ptr_f = !src_f[0]; + break; + case Algorithm::EltwisePrelu: + *dst_ptr_f = src_f[0] > 0 ? src_f[0] : static_cast(src_f[0] * src_f[1]); + break; + case Algorithm::EltwiseErf: + *dst_ptr_f = std::erf(src_f[0]); + break; + case Algorithm::EltwiseSoftSign: + *dst_ptr_f = src_f[0] / (1 + std::fabs(src_f[0])); + break; + // @todo implement proper isinfinite for non-float precisions + case Algorithm::EltwiseIsFinite: + *dst_ptr_f = std::isfinite(static_cast(src_f[0])); + break; + case Algorithm::EltwiseIsInf: + *dst_ptr_f = (this->_opData.alpha && (src_f[0] == -std::numeric_limits::infinity())) || + (this->_opData.beta && (src_f[0] == std::numeric_limits::infinity())); + break; + case Algorithm::EltwiseIsNaN: + *dst_ptr_f = std::isnan(src_f[0]); + break; + case Algorithm::EltwiseSelect: + *dst_ptr_f = src_f[0] ? 
src_f[1] : src_f[2]; + break; + default: + OPENVINO_THROW("Unsupported operation type for Eltwise executor"); } } }); } }; -template::value || - std::is_same::value || - std::is_same::value || - std::is_same::value || - std::is_same::value> - ::type * = nullptr> +template ::value || std::is_same::value || + std::is_same::value || std::is_same::value || + std::is_same::value>::type* = nullptr> class BitwiseRefExecutor : public EltwiseRefBaseExecutor { public: - BitwiseRefExecutor(const EltwiseData& opData, - const VectorDims& outBlkDims, - const std::vector& inpDims) : EltwiseRefBaseExecutor(opData, outBlkDims, inpDims) { - } + BitwiseRefExecutor(const EltwiseData& opData, const VectorDims& outBlkDims, const std::vector& inpDims) + : EltwiseRefBaseExecutor(opData, outBlkDims, inpDims) {} - void exec(const jit_eltwise_call_args_ptrs &args_ptrs, const VectorDims &dims_out) override { + void exec(const jit_eltwise_call_args_ptrs& args_ptrs, const VectorDims& dims_out) override { std::shared_ptr ref_eltwise_injector = nullptr; if (this->_opData.onednnAlgorithm != dnnl::algorithm::undef) { - ref_eltwise_injector = std::make_shared( - static_cast(this->_opData.onednnAlgorithm), this->_opData.alpha, this->_opData.beta, 1.f); + ref_eltwise_injector = + std::make_shared(static_cast(this->_opData.onednnAlgorithm), + this->_opData.alpha, + this->_opData.beta, + 1.f); } parallel_nt(0, [&](const int ithr, const int nthr) { @@ -1967,81 +2071,79 @@ class BitwiseRefExecutor : public EltwiseRefBaseExecutor { this->init_ptr(args_ptrs, dims_out, counters, iwork, src_f, dst_ptr_f); switch (this->_opData.algo) { - case Algorithm::EltwiseBitwiseAnd: { - *dst_ptr_f = src_f[0] & src_f[1]; - break; - } - case Algorithm::EltwiseBitwiseNot: { - *dst_ptr_f = ~src_f[0]; - break; - } - case Algorithm::EltwiseBitwiseOr: { - *dst_ptr_f = src_f[0] | src_f[1]; - break; - } - case Algorithm::EltwiseBitwiseXor: { - *dst_ptr_f = src_f[0] ^ src_f[1]; - break; - } - case Algorithm::EltwiseBitwiseLeftShift: { - *dst_ptr_f = src_f[0] << src_f[1]; - break; - } - case Algorithm::EltwiseBitwiseRightShift: { - *dst_ptr_f = src_f[0] >> src_f[1]; - break; - } - default: - OPENVINO_THROW("Unsupported operation type for Eltwise executor"); + case Algorithm::EltwiseBitwiseAnd: { + *dst_ptr_f = src_f[0] & src_f[1]; + break; + } + case Algorithm::EltwiseBitwiseNot: { + *dst_ptr_f = ~src_f[0]; + break; + } + case Algorithm::EltwiseBitwiseOr: { + *dst_ptr_f = src_f[0] | src_f[1]; + break; + } + case Algorithm::EltwiseBitwiseXor: { + *dst_ptr_f = src_f[0] ^ src_f[1]; + break; + } + case Algorithm::EltwiseBitwiseLeftShift: { + *dst_ptr_f = src_f[0] << src_f[1]; + break; + } + case Algorithm::EltwiseBitwiseRightShift: { + *dst_ptr_f = src_f[0] >> src_f[1]; + break; + } + default: + OPENVINO_THROW("Unsupported operation type for Eltwise executor"); } } }); } }; -} // namespace +} // namespace static Eltwise::executorPtr buildRefExecutor(const EltwiseKey& key) { switch (key.outPrc) { - case ov::element::f16: - return std::make_shared>(key.eltwise_data.front(), - key.outBlkDims, - key.inpDims); - case ov::element::i8: - return std::make_shared::value_type>>( - key.eltwise_data.front(), - key.outBlkDims, - key.inpDims); - - case ov::element::u8: - return std::make_shared::value_type>>( - key.eltwise_data.front(), - key.outBlkDims, - key.inpDims); - - case ov::element::i16: - return std::make_shared::value_type>>( - key.eltwise_data.front(), - key.outBlkDims, - key.inpDims); - - case ov::element::u16: - return std::make_shared::value_type>>( - 
key.eltwise_data.front(), - key.outBlkDims, - key.inpDims); + case ov::element::f16: + return std::make_shared>(key.eltwise_data.front(), + key.outBlkDims, + key.inpDims); + case ov::element::i8: + return std::make_shared::value_type>>( + key.eltwise_data.front(), + key.outBlkDims, + key.inpDims); + + case ov::element::u8: + return std::make_shared::value_type>>( + key.eltwise_data.front(), + key.outBlkDims, + key.inpDims); + + case ov::element::i16: + return std::make_shared::value_type>>( + key.eltwise_data.front(), + key.outBlkDims, + key.inpDims); + + case ov::element::u16: + return std::make_shared::value_type>>( + key.eltwise_data.front(), + key.outBlkDims, + key.inpDims); # - case ov::element::i32: - return std::make_shared::value_type>>( - key.eltwise_data.front(), - key.outBlkDims, - key.inpDims); + case ov::element::i32: + return std::make_shared::value_type>>( + key.eltwise_data.front(), + key.outBlkDims, + key.inpDims); - default: - // use float reference executor for any other precision for now - return std::make_shared>(key.eltwise_data.front(), - key.outBlkDims, - key.inpDims); + default: + // use float reference executor for any other precision for now + return std::make_shared>(key.eltwise_data.front(), key.outBlkDims, key.inpDims); } } @@ -2064,7 +2166,7 @@ static Eltwise::executorPtr buildExecutor(const EltwiseKey& key) { bool Eltwise::isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept { try { if (getInitializers().find(op->get_type_info()) == getInitializers().end()) { - errorMessage = "Doesn't support Eltwise algorithm: " + std::string(op->get_type_name()); + errorMessage = "Doesn't support Eltwise algorithm: " + std::string(op->get_type_name()); return false; } if (const auto binOp = ov::as_type_ptr(op)) { @@ -2087,8 +2189,9 @@ bool Eltwise::isSupportedOperation(const std::shared_ptr& op, st return true; } -Eltwise::Eltwise(const std::shared_ptr& op, const GraphContext::CPtr context) : - Node(op, context, EltwiseShapeInferFactory()), broadcastingPolicy(Undefined) { +Eltwise::Eltwise(const std::shared_ptr& op, const GraphContext::CPtr context) + : Node(op, context, EltwiseShapeInferFactory()), + broadcastingPolicy(Undefined) { std::string errorMessage; if (!isSupportedOperation(op, errorMessage)) { OPENVINO_THROW_NOT_IMPLEMENTED(errorMessage); @@ -2098,67 +2201,68 @@ Eltwise::Eltwise(const std::shared_ptr& op, const GraphContext::CPtr c size_t Eltwise::getOpInputsNum() const { switch (getAlgorithm()) { - case Algorithm::EltwiseIsFinite: - case Algorithm::EltwiseIsInf: - case Algorithm::EltwiseIsNaN: - case Algorithm::EltwiseRelu: - case Algorithm::EltwiseGeluErf: - case Algorithm::EltwiseGeluTanh: - case Algorithm::EltwiseCeiling: - case Algorithm::EltwiseFloor: - case Algorithm::EltwiseElu: - case Algorithm::EltwiseTanh: - case Algorithm::EltwiseSigmoid: - case Algorithm::EltwiseAbs: - case Algorithm::EltwiseSqrt: - case Algorithm::EltwiseSoftRelu: - case Algorithm::EltwiseExp: - case Algorithm::EltwiseClamp: - case Algorithm::EltwiseErf: - case Algorithm::EltwiseLogicalNot: - case Algorithm::EltwisePowerStatic: - case Algorithm::EltwiseSwish: - case Algorithm::EltwiseHswish: - case Algorithm::EltwiseMish: - case Algorithm::EltwiseHsigmoid: - case Algorithm::EltwiseRoundHalfToEven: - case Algorithm::EltwiseRoundHalfAwayFromZero: - case Algorithm::EltwiseSoftSign: - case Algorithm::EltwiseLog: - return 1; - case Algorithm::EltwiseAdd: - case Algorithm::EltwiseSubtract: - case Algorithm::EltwiseMultiply: - case Algorithm::EltwiseDivide: 
- case Algorithm::EltwiseFloorMod: - case Algorithm::EltwiseMod: - case Algorithm::EltwiseMaximum: - case Algorithm::EltwiseMinimum: - case Algorithm::EltwiseSquaredDifference: - case Algorithm::EltwisePowerDynamic: - case Algorithm::EltwiseEqual: - case Algorithm::EltwiseNotEqual: - case Algorithm::EltwiseGreater: - case Algorithm::EltwiseGreaterEqual: - case Algorithm::EltwiseLess: - case Algorithm::EltwiseLessEqual: - case Algorithm::EltwiseLogicalAnd: - case Algorithm::EltwiseLogicalOr: - case Algorithm::EltwiseLogicalXor: - case Algorithm::EltwiseBitwiseAnd: - case Algorithm::EltwiseBitwiseOr: - case Algorithm::EltwiseBitwiseXor: - case Algorithm::EltwiseBitwiseLeftShift: - case Algorithm::EltwiseBitwiseRightShift: - return 2; - case Algorithm::EltwiseBitwiseNot: - return 1; - case Algorithm::EltwisePrelu: - return 2; - case Algorithm::EltwiseMulAdd: - case Algorithm::EltwiseSelect: - return 3; - default: OPENVINO_THROW("Unsupported operation for Eltwise node with name `", getName(), "`."); + case Algorithm::EltwiseIsFinite: + case Algorithm::EltwiseIsInf: + case Algorithm::EltwiseIsNaN: + case Algorithm::EltwiseRelu: + case Algorithm::EltwiseGeluErf: + case Algorithm::EltwiseGeluTanh: + case Algorithm::EltwiseCeiling: + case Algorithm::EltwiseFloor: + case Algorithm::EltwiseElu: + case Algorithm::EltwiseTanh: + case Algorithm::EltwiseSigmoid: + case Algorithm::EltwiseAbs: + case Algorithm::EltwiseSqrt: + case Algorithm::EltwiseSoftRelu: + case Algorithm::EltwiseExp: + case Algorithm::EltwiseClamp: + case Algorithm::EltwiseErf: + case Algorithm::EltwiseLogicalNot: + case Algorithm::EltwisePowerStatic: + case Algorithm::EltwiseSwish: + case Algorithm::EltwiseHswish: + case Algorithm::EltwiseMish: + case Algorithm::EltwiseHsigmoid: + case Algorithm::EltwiseRoundHalfToEven: + case Algorithm::EltwiseRoundHalfAwayFromZero: + case Algorithm::EltwiseSoftSign: + case Algorithm::EltwiseLog: + return 1; + case Algorithm::EltwiseAdd: + case Algorithm::EltwiseSubtract: + case Algorithm::EltwiseMultiply: + case Algorithm::EltwiseDivide: + case Algorithm::EltwiseFloorMod: + case Algorithm::EltwiseMod: + case Algorithm::EltwiseMaximum: + case Algorithm::EltwiseMinimum: + case Algorithm::EltwiseSquaredDifference: + case Algorithm::EltwisePowerDynamic: + case Algorithm::EltwiseEqual: + case Algorithm::EltwiseNotEqual: + case Algorithm::EltwiseGreater: + case Algorithm::EltwiseGreaterEqual: + case Algorithm::EltwiseLess: + case Algorithm::EltwiseLessEqual: + case Algorithm::EltwiseLogicalAnd: + case Algorithm::EltwiseLogicalOr: + case Algorithm::EltwiseLogicalXor: + case Algorithm::EltwiseBitwiseAnd: + case Algorithm::EltwiseBitwiseOr: + case Algorithm::EltwiseBitwiseXor: + case Algorithm::EltwiseBitwiseLeftShift: + case Algorithm::EltwiseBitwiseRightShift: + return 2; + case Algorithm::EltwiseBitwiseNot: + return 1; + case Algorithm::EltwisePrelu: + return 2; + case Algorithm::EltwiseMulAdd: + case Algorithm::EltwiseSelect: + return 3; + default: + OPENVINO_THROW("Unsupported operation for Eltwise node with name `", getName(), "`."); } } @@ -2183,40 +2287,37 @@ void Eltwise::getSupportedDescriptors() { void Eltwise::initSupportedPrimitiveDescriptors() { const auto isBitwise = [](const Algorithm& algorithm) { - return one_of( - algorithm, - Algorithm::EltwiseBitwiseAnd, - Algorithm::EltwiseBitwiseNot, - Algorithm::EltwiseBitwiseOr, - Algorithm::EltwiseBitwiseXor, - Algorithm::EltwiseBitwiseLeftShift, - Algorithm::EltwiseBitwiseRightShift); + return one_of(algorithm, + Algorithm::EltwiseBitwiseAnd, + 
Algorithm::EltwiseBitwiseNot, + Algorithm::EltwiseBitwiseOr, + Algorithm::EltwiseBitwiseXor, + Algorithm::EltwiseBitwiseLeftShift, + Algorithm::EltwiseBitwiseRightShift); }; - std::vector supportedPrecisions = isBitwise(algorithm) ? - std::vector { - ov::element::u8, - ov::element::i8, - ov::element::u16, - ov::element::i16, - ov::element::i32 - } : std::vector { - ov::element::f32, - ov::element::u8, - ov::element::i8, - ov::element::u16, - ov::element::i16, - ov::element::bf16, - ov::element::f16, - ov::element::i32 - }; + std::vector supportedPrecisions = isBitwise(algorithm) + ? std::vector{ov::element::u8, + ov::element::i8, + ov::element::u16, + ov::element::i16, + ov::element::i32} + : std::vector{ov::element::f32, + ov::element::u8, + ov::element::i8, + ov::element::u16, + ov::element::i16, + ov::element::bf16, + ov::element::f16, + ov::element::i32}; if (!supportedPrimitiveDescriptors.empty()) return; - // if dim rank is greater than the maximum possible, we should use the reference execution -#if defined (OPENVINO_ARCH_ARM64) - bool canUseOptimizedImpl = mayiuse(dnnl::impl::cpu::aarch64::asimd) && (getInputShapeAtPort(0).getRank() <= MAX_ELTWISE_DIM_RANK); + // if dim rank is greater than the maximum possible, we should use the reference execution +#if defined(OPENVINO_ARCH_ARM64) + bool canUseOptimizedImpl = + mayiuse(dnnl::impl::cpu::aarch64::asimd) && (getInputShapeAtPort(0).getRank() <= MAX_ELTWISE_DIM_RANK); bool canUseOptimizedShapeAgnosticImpl = isDynamicNode() && canUseOptimizedImpl; #else bool canUseOptimizedImpl = mayiuse(x64::sse41) && getInputShapeAtPort(0).getRank() <= MAX_ELTWISE_DIM_RANK; @@ -2261,7 +2362,7 @@ void Eltwise::initSupportedPrimitiveDescriptors() { ")"); std::vector inputPrecisions; - for (const auto &prec : getOriginalInputPrecisions()) { + for (const auto& prec : getOriginalInputPrecisions()) { inputPrecisions.push_back(prec); } @@ -2288,31 +2389,32 @@ void Eltwise::initSupportedPrimitiveDescriptors() { } #ifndef OPENVINO_ARCH_ARM64 - implType = canUseOptimizedShapeAgnosticImpl ? EltwiseImplType::optimizedShapeAgnostic : - canUseOptimizedImpl ? EltwiseImplType::optimized : EltwiseImplType::reference; + implType = canUseOptimizedShapeAgnosticImpl ? EltwiseImplType::optimizedShapeAgnostic + : canUseOptimizedImpl ? EltwiseImplType::optimized + : EltwiseImplType::reference; if (!hasHardwareSupport(ov::element::bf16)) { bool hasBF16 = false; - for (auto &inPrc : inputPrecisions) + for (auto& inPrc : inputPrecisions) if (inPrc == ov::element::bf16) hasBF16 = true; if (outputPrecision == ov::element::bf16 || hasBF16) OPENVINO_THROW("Eltwise node with name `", getName(), "` doesn't support BF16 precision on this target."); } -#if defined(OV_CPU_WITH_ACL) +# if defined(OV_CPU_WITH_ACL) const bool useJit = false; -#endif +# endif #elif defined(OPENVINO_ARCH_ARM64) - const bool useJit = canUseOptimizedImpl && - jitIsSupported(this, getAlpha(), getBeta(), getGamma()); + const bool useJit = canUseOptimizedImpl && jitIsSupported(this, getAlpha(), getBeta(), getGamma()); if (!useJit) { canUseOptimizedImpl = false; } - implType = (useJit && canUseOptimizedImpl) ? - (canUseOptimizedShapeAgnosticImpl ? EltwiseImplType::optimizedShapeAgnostic : EltwiseImplType::optimized) : - EltwiseImplType::reference; + implType = + (useJit && canUseOptimizedImpl) + ? (canUseOptimizedShapeAgnosticImpl ? 
EltwiseImplType::optimizedShapeAgnostic : EltwiseImplType::optimized) + : EltwiseImplType::reference; #else OPENVINO_THROW("Unknow CPU architecture"); #endif @@ -2330,66 +2432,74 @@ void Eltwise::initSupportedPrimitiveDescriptors() { const bool useAcl = !useJit; if (useAcl) { - // Use original output precision as a reference point since some eltwise algorithms have non-float inputs (i.e. EltwiseSelect) - ov::element::Type forcedPrec = getOriginalOutputPrecisionAtPort(0) == ov::element::f16 ? ov::element::f16 : ov::element::f32; - // ACL implementation supports only identical precisions on inputs/outputs so they are aligned it to highest one - if (AclEltwiseExecutor::isEltwiseAlgorithmSupported(getAlgorithm())) { - for (size_t i = 0; i < getParentEdges().size(); i++) { - if (!getParentEdgeAt(i)->getParent()->isConstant()) { - if (getOriginalInputPrecisionAtPort(i).size() > forcedPrec.size()) { - forcedPrec = getOriginalInputPrecisionAtPort(i); + // Use original output precision as a reference point since some eltwise algorithms have non-float inputs (i.e. + // EltwiseSelect) + ov::element::Type forcedPrec = + getOriginalOutputPrecisionAtPort(0) == ov::element::f16 ? ov::element::f16 : ov::element::f32; + // ACL implementation supports only identical precisions on inputs/outputs so they are aligned it to highest one + if (AclEltwiseExecutor::isEltwiseAlgorithmSupported(getAlgorithm())) { + for (size_t i = 0; i < getParentEdges().size(); i++) { + if (!getParentEdgeAt(i)->getParent()->isConstant()) { + if (getOriginalInputPrecisionAtPort(i).size() > forcedPrec.size()) { + forcedPrec = getOriginalInputPrecisionAtPort(i); + } } } + if (!forcedPrec.is_real()) { + forcedPrec = ov::element::f32; + } } - if (!forcedPrec.is_real()) { - forcedPrec = ov::element::f32; - } - } - for (size_t i = 0; i < inputPrecisions.size(); i++) { - inputPrecisions[i] = filterPrecision(inputPrecisions[i], forcedPrec); - } - outputPrecision = filterPrecision(outputPrecision, forcedPrec); - } else { -#endif -#if defined(OV_CPU_WITH_SHL) - if (ShlEltwiseExecutor::isEltwiseAlgorithmSupported(getAlgorithm())) { - // SHL implementation supports only identical precisions on inputs/outputs and only FP32 for now - const ov::element::Type forcedPrec = ov::element::f32; for (size_t i = 0; i < inputPrecisions.size(); i++) { - inputPrecisions[i] = forcedPrec; + inputPrecisions[i] = filterPrecision(inputPrecisions[i], forcedPrec); } - outputPrecision = forcedPrec; + outputPrecision = filterPrecision(outputPrecision, forcedPrec); } else { #endif - auto filterPrecision = [&](const ov::element::Type& prc) { - if (implType == EltwiseImplType::reference) { - if (isBitwise(algorithm)) { - if (std::find(supportedPrecisions.begin(), supportedPrecisions.end(), prc) == supportedPrecisions.end()) { - OPENVINO_THROW("Eltwise node with name `", getName(), "` doesn't support ", prc, " precision."); - } - return prc; - } - return ov::element::f32; - } else if (std::find(supportedPrecisions.begin(), supportedPrecisions.end(), prc) == supportedPrecisions.end()) { - if (prc == ov::element::u32 || prc == ov::element::i64 || prc == ov::element::u64) { - return ov::element::i32; - } else if (prc == ov::element::f64) { - return ov::element::f32; - } else { - OPENVINO_THROW("Eltwise node with name `", getName(), "` doesn't support ", prc, " precision."); +#if defined(OV_CPU_WITH_SHL) + if (ShlEltwiseExecutor::isEltwiseAlgorithmSupported(getAlgorithm())) { + // SHL implementation supports only identical precisions on inputs/outputs and only FP32 for now + 
const ov::element::Type forcedPrec = ov::element::f32; + for (size_t i = 0; i < inputPrecisions.size(); i++) { + inputPrecisions[i] = forcedPrec; } + outputPrecision = forcedPrec; } else { - return prc; - } - }; +#endif + auto filterPrecision = [&](const ov::element::Type& prc) { + if (implType == EltwiseImplType::reference) { + if (isBitwise(algorithm)) { + if (std::find(supportedPrecisions.begin(), supportedPrecisions.end(), prc) == + supportedPrecisions.end()) { + OPENVINO_THROW("Eltwise node with name `", + getName(), + "` doesn't support ", + prc, + " precision."); + } + return prc; + } + return ov::element::f32; + } else if (std::find(supportedPrecisions.begin(), supportedPrecisions.end(), prc) == + supportedPrecisions.end()) { + if (prc == ov::element::u32 || prc == ov::element::i64 || prc == ov::element::u64) { + return ov::element::i32; + } else if (prc == ov::element::f64) { + return ov::element::f32; + } else { + OPENVINO_THROW("Eltwise node with name `", getName(), "` doesn't support ", prc, " precision."); + } + } else { + return prc; + } + }; - for (size_t i = 0; i < inputPrecisions.size(); i++) { - inputPrecisions[i] = filterPrecision(inputPrecisions[i]); - } - outputPrecision = filterPrecision(outputPrecision); + for (size_t i = 0; i < inputPrecisions.size(); i++) { + inputPrecisions[i] = filterPrecision(inputPrecisions[i]); + } + outputPrecision = filterPrecision(outputPrecision); #if defined(OV_CPU_WITH_SHL) - } + } #endif #if defined(OV_CPU_WITH_ACL) } @@ -2398,22 +2508,19 @@ void Eltwise::initSupportedPrimitiveDescriptors() { // TODO: delete after new LPT (ngraph based) is merged // WA is needed to handle bug in LPT that produces wrong precision after average pooling (I8/U8 instead of FP32) if ((getAlgorithm() == Algorithm::EltwiseMulAdd || getAlgorithm() == Algorithm::EltwisePowerStatic) && - (inputPrecisions[0] == ov::element::u8 || inputPrecisions[0] == ov::element::i8)) { + (inputPrecisions[0] == ov::element::u8 || inputPrecisions[0] == ov::element::i8)) { auto parentNode = getParentEdgeAt(0)->getParent(); if (getParentEdgeAt(0)->getParent()->getAlgorithm() == Algorithm::PoolingAvg) { inputPrecisions[0] = ov::element::f32; } } - enum LayoutType { - Planar, - ChannelsFirst, - Blocked - }; + enum LayoutType { Planar, ChannelsFirst, Blocked }; - auto initDesc = [&] (LayoutType lt, const bool useEltwiseExecutor = false, const bool useJit = false) -> NodeDesc { - auto createMemoryDesc = [lt](const Shape &shape, ov::element::Type prc, size_t offset) -> std::shared_ptr { - const auto &dims = shape.getDims(); + auto initDesc = [&](LayoutType lt, const bool useEltwiseExecutor = false, const bool useJit = false) -> NodeDesc { + auto createMemoryDesc = + [lt](const Shape& shape, ov::element::Type prc, size_t offset) -> std::shared_ptr { + const auto& dims = shape.getDims(); if (lt == ChannelsFirst && shape.getRank() != 1) { auto ndims = shape.getRank(); VectorDims order(ndims); @@ -2429,10 +2536,11 @@ void Eltwise::initSupportedPrimitiveDescriptors() { } return std::make_shared(prc, shape, blocks, order, offset); - // TODO: need investigate - // bad accuracy for shape {1, 1, 4, 11}, {2, 5, 1, 1} - // same for disabled collapse dims - } else if (lt == Blocked && shape.getRank() != 1 && (shape.getMinDims()[1] != Shape::UNDEFINED_DIM && shape.getMinDims()[1] > 1)) { + // TODO: need investigate + // bad accuracy for shape {1, 1, 4, 11}, {2, 5, 1, 1} + // same for disabled collapse dims + } else if (lt == Blocked && shape.getRank() != 1 && + (shape.getMinDims()[1] != 
Shape::UNDEFINED_DIM && shape.getMinDims()[1] > 1)) { size_t blockSize = dnnl::impl::cpu::x64::mayiuse(x64::avx512_core) ? 16 : 8; VectorDims blocks = dims; VectorDims order(blocks.size()); @@ -2463,9 +2571,9 @@ void Eltwise::initSupportedPrimitiveDescriptors() { portConfig.inPlace((!i && canBeInPlace() && inputPrecisions[i] == outputPrecision) ? 0 : -1); portConfig.constant(false); - const auto &srcShape = getInputShapeAtPort(i); + const auto& srcShape = getInputShapeAtPort(i); if (!isDynamicNode() && srcShape.getDims()[0] == 1) { - inputMask.reset(0); // accepts any stride on the batch axis + inputMask.reset(0); // accepts any stride on the batch axis } portConfig.setMemDesc(createMemoryDesc(srcShape, inputPrecisions[i], offset), inputMask); @@ -2476,10 +2584,10 @@ void Eltwise::initSupportedPrimitiveDescriptors() { portConfig.inPlace(-1); portConfig.constant(false); - const auto &dstShape = getOutputShapeAtPort(0); + const auto& dstShape = getOutputShapeAtPort(0); BlockedMemoryDesc::CmpMask outputMask = BlockedMemoryDesc::SKIP_OFFSET_MASK; if (!isDynamicNode() && dstShape.getDims()[0] == 1) { - outputMask.reset(0); // accepts any stride on the batch axis + outputMask.reset(0); // accepts any stride on the batch axis } portConfig.setMemDesc(createMemoryDesc(dstShape, outputPrecision, offset), outputMask); @@ -2487,13 +2595,13 @@ void Eltwise::initSupportedPrimitiveDescriptors() { if (useEltwiseExecutor || useJit) { impl_desc_type impl_type; - #if defined (OPENVINO_ARCH_ARM64) +#if defined(OPENVINO_ARCH_ARM64) if (useJit) { impl_type = impl_desc_type::jit_asimd; } - #else +#else impl_type = impl_desc_type::undef; - #endif +#endif std::vector srcMemoryDescs; for (size_t i = 0; i < config.inConfs.size(); i++) { @@ -2504,20 +2612,23 @@ void Eltwise::initSupportedPrimitiveDescriptors() { dstMemoryDescs.push_back(config.outConfs[i].getMemDesc()); } - auto factory = std::make_shared(eltwiseAttrs, srcMemoryDescs, dstMemoryDescs, - std::make_shared(context, getImplPriority())); + auto factory = + std::make_shared(eltwiseAttrs, + srcMemoryDescs, + dstMemoryDescs, + std::make_shared(context, getImplPriority())); return {config, impl_type, !factory->isEmpty() ? 
factory : nullptr}; } else { impl_desc_type impl_type = impl_desc_type::ref; if (canUseOptimizedImpl) { - #if defined (OPENVINO_ARCH_ARM64) +#if defined(OPENVINO_ARCH_ARM64) if (mayiuse(dnnl::impl::cpu::aarch64::asimd)) { impl_type = impl_desc_type::jit_asimd; } else { OPENVINO_THROW("not supported architecture"); } - #else +#else if (mayiuse(x64::avx512_core)) { impl_type = impl_desc_type::jit_avx512; } else if (mayiuse(x64::avx2)) { @@ -2525,7 +2636,7 @@ void Eltwise::initSupportedPrimitiveDescriptors() { } else if (mayiuse(x64::sse41)) { impl_type = impl_desc_type::jit_sse42; } - #endif +#endif } return {config, impl_type}; @@ -2534,10 +2645,11 @@ void Eltwise::initSupportedPrimitiveDescriptors() { bool isChannelsFirstApplicable = one_of(getOutputShapeAtPort(0).getRank(), 1u, 2u, 3u, 4u, 5u); for (size_t i = 0; i < getParentEdges().size(); i++) { - isChannelsFirstApplicable = isChannelsFirstApplicable && one_of(getInputShapeAtPort(i).getRank(), 1u, 2u, 3u, 4u, 5u); - isChannelsFirstApplicable = isChannelsFirstApplicable && implication(getInputShapeAtPort(i).getRank() != 1, - getOutputShapeAtPort(0).getRank() == - getInputShapeAtPort(i).getRank()); + isChannelsFirstApplicable = + isChannelsFirstApplicable && one_of(getInputShapeAtPort(i).getRank(), 1u, 2u, 3u, 4u, 5u); + isChannelsFirstApplicable = isChannelsFirstApplicable && + implication(getInputShapeAtPort(i).getRank() != 1, + getOutputShapeAtPort(0).getRank() == getInputShapeAtPort(i).getRank()); } #if defined(OPENVINO_ARCH_ARM64) @@ -2547,13 +2659,14 @@ void Eltwise::initSupportedPrimitiveDescriptors() { #endif for (size_t i = 0; i < getParentEdges().size(); i++) { - const auto &inShape = getInputShapeAtPort(i); + const auto& inShape = getInputShapeAtPort(i); isBlockedApplicable = isBlockedApplicable && one_of(inShape.getRank(), 1u, 3u, 4u, 5u); - isBlockedApplicable = isBlockedApplicable && implication(inShape.getRank() != 1, - getOutputShapeAtPort(0).getRank() == - inShape.getRank()); + isBlockedApplicable = + isBlockedApplicable && + implication(inShape.getRank() != 1, getOutputShapeAtPort(0).getRank() == inShape.getRank()); if (isDynamicNode() && inShape.getRank() != 1) - isBlockedApplicable = isBlockedApplicable && inShape.getMinDims()[1] != Shape::UNDEFINED_DIM && inShape.getMinDims()[1] > 1; + isBlockedApplicable = + isBlockedApplicable && inShape.getMinDims()[1] != Shape::UNDEFINED_DIM && inShape.getMinDims()[1] > 1; } inputNum = getParentEdges().size(); @@ -2561,28 +2674,29 @@ void Eltwise::initSupportedPrimitiveDescriptors() { #if defined(OV_CPU_WITH_ACL) if (useAcl || useJit) { - eltwiseAttrs = {algorithm, alpha, beta, gamma}; + eltwiseAttrs = {algorithm, alpha, beta, gamma}; - auto addDesc = [&initDesc, &useJit](std::vector& supportedPrimitiveDescriptors, const LayoutType layoutType) { - auto nodeDesc = initDesc(layoutType, !useJit, useJit); - if (nodeDesc.getExecutorFactory()) - supportedPrimitiveDescriptors.emplace_back(nodeDesc); - }; + auto addDesc = [&initDesc, &useJit](std::vector& supportedPrimitiveDescriptors, + const LayoutType layoutType) { + auto nodeDesc = initDesc(layoutType, !useJit, useJit); + if (nodeDesc.getExecutorFactory()) + supportedPrimitiveDescriptors.emplace_back(nodeDesc); + }; - // @todo should be handled in scope of selectPreferPrimitiveDescriptor - if (context->getConfig().modelType == Config::ModelType::CNN) { - if (isChannelsFirstApplicable) - addDesc(supportedPrimitiveDescriptors, ChannelsFirst); - addDesc(supportedPrimitiveDescriptors, Planar); - } else { - 
addDesc(supportedPrimitiveDescriptors, Planar); - if (isChannelsFirstApplicable) - addDesc(supportedPrimitiveDescriptors, ChannelsFirst); - } + // @todo should be handled in scope of selectPreferPrimitiveDescriptor + if (context->getConfig().modelType == Config::ModelType::CNN) { + if (isChannelsFirstApplicable) + addDesc(supportedPrimitiveDescriptors, ChannelsFirst); + addDesc(supportedPrimitiveDescriptors, Planar); + } else { + addDesc(supportedPrimitiveDescriptors, Planar); + if (isChannelsFirstApplicable) + addDesc(supportedPrimitiveDescriptors, ChannelsFirst); + } - canUseEltwiseExecPtr = !supportedPrimitiveDescriptors.empty() && !useJit; - if (!supportedPrimitiveDescriptors.empty()) - return; + canUseEltwiseExecPtr = !supportedPrimitiveDescriptors.empty() && !useJit; + if (!supportedPrimitiveDescriptors.empty()) + return; } #endif @@ -2652,15 +2766,18 @@ void Eltwise::prepareParams() { dstMemoryDescs.push_back(getDstMemoryAtPort(0)->getDescPtr()); auto selectedPD = getSelectedPrimitiveDescriptor(); - eltwiseExecPtr = selectedPD->getExecutorFactoryAs()->makeExecutor(eltwiseAttrs, srcMemoryDescs, dstMemoryDescs, {}); + eltwiseExecPtr = selectedPD->getExecutorFactoryAs()->makeExecutor(eltwiseAttrs, + srcMemoryDescs, + dstMemoryDescs, + {}); selectedPD->setImplementationType(eltwiseExecPtr->getImplType()); return; } auto outBlockingDesc = getChildEdgeAt(0)->getMemory().getDescWithType(); - const auto &outOrder = outBlockingDesc->getOrder(); - const auto ¤tOutBlkDims = outBlockingDesc->getBlockDims(); + const auto& outOrder = outBlockingDesc->getOrder(); + const auto& currentOutBlkDims = outBlockingDesc->getBlockDims(); size_t input_size = std::max(static_cast(EltwiseJitExecutor::optimalTensorRank), currentOutBlkDims.size()); @@ -2679,13 +2796,16 @@ void Eltwise::prepareParams() { size_t inRank = currentInBlkDims[i].size(); // WA to normalize blocked and planar layouts - const auto &inOrder = inBlockingDesc->getOrder(); + const auto& inOrder = inBlockingDesc->getOrder(); size_t startOff = outOrder.size() != outBlockingDesc->getShape().getRank() && - outOrder[outOrder.size() - 1] != inOrder[inOrder.size() - 1] ? 1 : 0; + outOrder[outOrder.size() - 1] != inOrder[inOrder.size() - 1] + ? 
1 + : 0; // WA to handle nspc layout with 1D tensors if (1 == inRank) { - if (outRank > 2 && 1 == outOrder.back()) startOff = 1; + if (outRank > 2 && 1 == outOrder.back()) + startOff = 1; } for (size_t j = 0; j < inRank; j++) { @@ -2718,14 +2838,18 @@ void Eltwise::prepareParams() { if (!canSkipSearchInCache) { EltwiseData thisOp{getAlgorithm(), getOneDnnAlgorithm(), getAlpha(), getBeta(), getGamma()}; - EltwiseKey key = {{thisOp}, {getType()}, currentOutBlkDims, outOrder, dims_in, inpPrc, outPrc, dnnl::post_ops(), implType}; + EltwiseKey key = + {{thisOp}, {getType()}, currentOutBlkDims, outOrder, dims_in, inpPrc, outPrc, dnnl::post_ops(), implType}; fqDataPtrs.clear(); - for (const auto &node : fusedWith) { + for (const auto& node : fusedWith) { key.ops_list.push_back(node->getType()); if (node->getType() == Type::Eltwise) { if (auto eltwise = std::dynamic_pointer_cast(node)) { - key.eltwise_data.push_back({eltwise->getAlgorithm(), eltwise->getOneDnnAlgorithm(), eltwise->getAlpha(), - eltwise->getBeta(), eltwise->getGamma()}); + key.eltwise_data.push_back({eltwise->getAlgorithm(), + eltwise->getOneDnnAlgorithm(), + eltwise->getAlpha(), + eltwise->getBeta(), + eltwise->getGamma()}); } } else if (node->getType() == Type::FakeQuantize) { node->appendPostOps(key.postOps, {}, fqDataPtrs); @@ -2745,9 +2869,9 @@ void Eltwise::prepareParams() { // update execParams for shape agnostic kernel if (implType == EltwiseImplType::optimizedShapeAgnostic) { - auto &outDims = execParams.outDims; - auto &inOffsets = execParams.inOffsets; - auto &outOffsets = execParams.outOffsets; + auto& outDims = execParams.outDims; + auto& inOffsets = execParams.inOffsets; + auto& outOffsets = execParams.outOffsets; // outDims recalculation outDims.resize(dims_in[0].size(), 1); @@ -2805,7 +2929,8 @@ void Eltwise::selectOptimalPrimitiveDescriptor() { void Eltwise::execute(dnnl::stream strm) { if (execPtr) { jit_eltwise_call_args_ptrs args_ptrs = {}; - VectorDims dims_out = implType == EltwiseImplType::optimizedShapeAgnostic ? execParams.outDims : execPtr->getOutDims(); + VectorDims dims_out = + implType == EltwiseImplType::optimizedShapeAgnostic ? 
execParams.outDims : execPtr->getOutDims(); for (size_t i = 0; i < memPtrs.size() - 1; i++) args_ptrs.src_ptr[i] = memPtrs[i]->getDataAs() + start_offset_in[i]; args_ptrs.dst_ptr = memPtrs.back()->getDataAs() + start_offset_out; @@ -2873,15 +2998,14 @@ void Eltwise::fuseInto(NodePtr& parentNode) { getAlgorithm() == Algorithm::EltwiseAdd && dimsEqualWeak(getInputShapeAtPort(0).getDims(), getInputShapeAtPort(1).getDims()) && !getParentEdgeAt(0)->getParent()->isConstant() && !getParentEdgeAt(1)->getParent()->isConstant(); - if ((scales.empty() && shifts.empty()) && - !specialConvolutionAddFusing && + if ((scales.empty() && shifts.empty()) && !specialConvolutionAddFusing && canBePerformedAsScaleShift(parentNode.get())) { std::tie(scales, shifts) = getScalesAndShifts(parentNode.get()); } Node::fuseInto(parentNode); } -void Eltwise::appendMemory(const std::vector &data, MemoryPtr &memPtr, std::vector& postOpsMem) { +void Eltwise::appendMemory(const std::vector& data, MemoryPtr& memPtr, std::vector& postOpsMem) { if (!memPtr) { DnnlBlockedMemoryDesc memoryDesc(ov::element::f32, {data.size()}); memPtr = std::make_shared(getEngine(), memoryDesc, data.data()); @@ -2889,12 +3013,15 @@ void Eltwise::appendMemory(const std::vector &data, MemoryPtr &memPtr, st } } -void Eltwise::appendMemory(const std::vector &data, MemoryPtr &memPtr, std::vector& postOpsMem) { +void Eltwise::appendMemory(const std::vector& data, MemoryPtr& memPtr, std::vector& postOpsMem) { postOpsMem.push_back(data.data()); } template -void Eltwise::appendPostOpsImpl(dnnl::post_ops& ops, const VectorDims &postOpDims, std::vector& postOpsMem, const int channelAxis) { +void Eltwise::appendPostOpsImpl(dnnl::post_ops& ops, + const VectorDims& postOpDims, + std::vector& postOpsMem, + const int channelAxis) { const std::string errorPrefix = "Appending Eltwise node with name '" + getName() + "' "; if (getOneDnnAlgorithm() != dnnl::algorithm::undef) { @@ -2920,7 +3047,8 @@ void Eltwise::appendPostOpsImpl(dnnl::post_ops& ops, const VectorDims &postOpDim case dnnl::algorithm::eltwise_round_half_away_from_zero: ops.append_eltwise(getOneDnnAlgorithm(), getAlpha(), getBeta()); break; - default: OPENVINO_THROW(errorPrefix, "as post operation is not supported"); + default: + OPENVINO_THROW(errorPrefix, "as post operation is not supported"); } } else { // per-tensor EltwisePowerStatic can be implemented with more well-supported eltwise postOps @@ -2938,7 +3066,8 @@ void Eltwise::appendPostOpsImpl(dnnl::post_ops& ops, const VectorDims &postOpDim const auto chIdx = postOpDims.size() > 1 ? 
channelAxis : 0; channelSize = postOpDims[chIdx]; } - // since legacy depthwise post ops mechanism requires broadcasted data we need to reinitilize it in case of changed shape + // since legacy depthwise post ops mechanism requires broadcasted data we need to reinitilize it in case of + // changed shape if (depthwiseData.empty() || depthwiseDataSize != 2 * channelSize) { depthwiseData.clear(); depthwiseMemory.reset(); @@ -2995,7 +3124,10 @@ void Eltwise::appendPostOpsImpl(dnnl::post_ops& ops, const VectorDims &postOpDim } } -void Eltwise::appendPostOps(dnnl::post_ops& ops, const VectorDims &postOpDims, std::unordered_map& postOpsMem, const int channelAxis) { +void Eltwise::appendPostOps(dnnl::post_ops& ops, + const VectorDims& postOpDims, + std::unordered_map& postOpsMem, + const int channelAxis) { std::vector postOpsMemPtrs; appendPostOpsImpl(ops, postOpDims, postOpsMemPtrs, channelAxis); @@ -3006,11 +3138,17 @@ void Eltwise::appendPostOps(dnnl::post_ops& ops, const VectorDims &postOpDims, s } } -void Eltwise::appendPostOps(dnnl::post_ops& ops, const VectorDims &postOpDims, std::vector& postOpsMem, const int channelAxis) { +void Eltwise::appendPostOps(dnnl::post_ops& ops, + const VectorDims& postOpDims, + std::vector& postOpsMem, + const int channelAxis) { appendPostOpsImpl(ops, postOpDims, postOpsMem, channelAxis); } -bool Eltwise::appendAttrPostOps(DnnlPostOpsComposerLegacy& dnnlpoc, bool isLastPostOp, dnnl::memory::data_type outDataType, bool allowBinary) { +bool Eltwise::appendAttrPostOps(DnnlPostOpsComposerLegacy& dnnlpoc, + bool isLastPostOp, + dnnl::memory::data_type outDataType, + bool allowBinary) { const std::string errorPrefix = "Appending Eltwise node with name '" + getName() + "' as binary post op "; if (getOneDnnAlgorithm() != dnnl::algorithm::undef) { @@ -3039,7 +3177,8 @@ bool Eltwise::appendAttrPostOps(DnnlPostOpsComposerLegacy& dnnlpoc, bool isLastP // call dnnlpoc's specialized API to generate optimized postOps sequence dnnlpoc.appendLinear({getAlpha()}, {getBeta()}, isLastPostOp); break; - default: OPENVINO_THROW(errorPrefix, "as post operation is not supported"); + default: + OPENVINO_THROW(errorPrefix, "as post operation is not supported"); } } else { switch (getAlgorithm()) { @@ -3054,9 +3193,9 @@ bool Eltwise::appendAttrPostOps(DnnlPostOpsComposerLegacy& dnnlpoc, bool isLastP case Algorithm::EltwisePowerStatic: if (beta != 1.0f && gamma != 0.0f) { return dnnlpoc.appendLinear(scales, shifts, isLastPostOp, allowBinary); - } else if (beta != 1.0f) {// Multiply if has scales + } else if (beta != 1.0f) { // Multiply if has scales return dnnlpoc.appendScale(scales, isLastPostOp, allowBinary); - } else if (gamma != 0.0f) {// Add only if has shifts + } else if (gamma != 0.0f) { // Add only if has shifts return dnnlpoc.appendShift(shifts, allowBinary); } break; @@ -3103,16 +3242,17 @@ bool Eltwise::canFuseParent(const NodePtr& parentNode) const { bool Eltwise::canFuse(const NodePtr& node) const { auto isIntegerComputeSupported = [](const Node* node) { - if (!one_of(node->getAlgorithm(), Algorithm::EltwiseAdd, - Algorithm::EltwiseMultiply, - Algorithm::EltwiseMulAdd, - Algorithm::EltwiseSubtract, - Algorithm::EltwiseDivide, - Algorithm::EltwiseSquaredDifference)) { + if (!one_of(node->getAlgorithm(), + Algorithm::EltwiseAdd, + Algorithm::EltwiseMultiply, + Algorithm::EltwiseMulAdd, + Algorithm::EltwiseSubtract, + Algorithm::EltwiseDivide, + Algorithm::EltwiseSquaredDifference)) { return false; } - for (const auto &originalInputPrecision : node->getOriginalInputPrecisions()) 
{ + for (const auto& originalInputPrecision : node->getOriginalInputPrecisions()) { if (originalInputPrecision != ov::element::i32) { return false; } @@ -3121,7 +3261,7 @@ bool Eltwise::canFuse(const NodePtr& node) const { return true; }; -#if defined (OPENVINO_ARCH_ARM64) +#if defined(OPENVINO_ARCH_ARM64) if (!mayiuse(dnnl::impl::cpu::aarch64::asimd) || (getInputShapeAtPort(0).getRank() > MAX_ELTWISE_DIM_RANK)) return false; @@ -3129,10 +3269,8 @@ bool Eltwise::canFuse(const NodePtr& node) const { return false; } const auto eltwise = dynamic_cast(node.get()); - if ((eltwise == nullptr) || (!jitIsSupported(eltwise, - eltwise->getAlpha(), - eltwise->getBeta(), - eltwise->getGamma()))) { + if ((eltwise == nullptr) || + (!jitIsSupported(eltwise, eltwise->getAlpha(), eltwise->getBeta(), eltwise->getGamma()))) { return false; } #else @@ -3170,29 +3308,30 @@ bool Eltwise::canFuse(const NodePtr& node) const { return false; if (node->getType() == Type::Eltwise) { - // [WA] Since execution precision change from I32 to FP32 for arithmetic operations may lead to incorrect results - // we disable fusing cases which may lead to invalid precision conversions inside the kernel - // [TODO] We need to rewrite support for different precisions at all to avoid implicit conversions to FP32 - // (all should be handled via explicit convert operations) + // [WA] Since execution precision change from I32 to FP32 for arithmetic operations may lead to incorrect + // results we disable fusing cases which may lead to invalid precision conversions inside the kernel [TODO] We + // need to rewrite support for different precisions at all to avoid implicit conversions to FP32 (all should be + // handled via explicit convert operations) bool isIntegerFusingNode = isIntegerComputeSupported(node.get()); - if ((isIntegerNode && !isIntegerFusingNode) || - (!isIntegerNode && isIntegerFusingNode)) { + if ((isIntegerNode && !isIntegerFusingNode) || (!isIntegerNode && isIntegerFusingNode)) { return false; } if (node->getParentEdgeAt(0)->getParent().get() != this) { - // Eltwise jitter doesn't respect commutative property, so fusing is disabled in case it applied not for 0-th port. - if (one_of(node->getAlgorithm(), Algorithm::EltwiseSubtract, - Algorithm::EltwiseDivide, - Algorithm::EltwiseFloorMod, - Algorithm::EltwiseMod, - Algorithm::EltwisePowerDynamic, - Algorithm::EltwiseGreater, - Algorithm::EltwiseGreaterEqual, - Algorithm::EltwiseLess, - Algorithm::EltwiseLessEqual, - Algorithm::EltwiseMulAdd, - Algorithm::EltwiseSelect)) { + // Eltwise jitter doesn't respect commutative property, so fusing is disabled in case it applied not for + // 0-th port. 
+ if (one_of(node->getAlgorithm(), + Algorithm::EltwiseSubtract, + Algorithm::EltwiseDivide, + Algorithm::EltwiseFloorMod, + Algorithm::EltwiseMod, + Algorithm::EltwisePowerDynamic, + Algorithm::EltwiseGreater, + Algorithm::EltwiseGreaterEqual, + Algorithm::EltwiseLess, + Algorithm::EltwiseLessEqual, + Algorithm::EltwiseMulAdd, + Algorithm::EltwiseSelect)) { return false; } @@ -3205,7 +3344,8 @@ bool Eltwise::canFuse(const NodePtr& node) const { } } - // We can use optimized execution with fusions only in cases when dim rank is less or equal to the maximum possible + // We can use optimized execution with fusions only in cases when dim rank is less or equal to the maximum + // possible if (node->getInputShapeAtPort(0).getRank() > MAX_ELTWISE_DIM_RANK) return false; @@ -3224,13 +3364,15 @@ ov::element::Type Eltwise::getRuntimePrecision() const { // Don't take bias precision into account for (size_t i = 0; i < getParentEdges().size(); i++) { auto parentEdge = getParentEdgeAt(i); - if (parentEdge && parentEdge->getStatus() == Edge::Status::Validated && !parentEdge->getParent()->isConstant()) { - inputPrecisions.emplace_back(DnnlExtensionUtils::DataTypeToElementType((parentEdge->getMemoryPtr()->getDataType()))); + if (parentEdge && parentEdge->getStatus() == Edge::Status::Validated && + !parentEdge->getParent()->isConstant()) { + inputPrecisions.emplace_back( + DnnlExtensionUtils::DataTypeToElementType((parentEdge->getMemoryPtr()->getDataType()))); } } return getMaxPrecision(inputPrecisions); } -} // namespace node -} // namespace intel_cpu -} // namespace ov +} // namespace node +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/eltwise.h b/src/plugins/intel_cpu/src/nodes/eltwise.h index 6013ce732ee5fc..d0ca94e08824c8 100644 --- a/src/plugins/intel_cpu/src/nodes/eltwise.h +++ b/src/plugins/intel_cpu/src/nodes/eltwise.h @@ -5,17 +5,18 @@ #pragma once #include + +#include #include #include -#include #include "dnnl_postops_composer_legacy.h" -#include "nodes/executors/eltwise.hpp" #include "executors/eltwise_list.hpp" +#include "nodes/executors/eltwise.hpp" #include "nodes/kernels/jit_eltwise_call_args_ptrs.hpp" #if defined(OPENVINO_ARCH_ARM64) -#include "kernels/aarch64/jit_uni_eltwise_generic.hpp" +# include "kernels/aarch64/jit_uni_eltwise_generic.hpp" #endif namespace ov { @@ -68,18 +69,14 @@ struct jit_uni_eltwise_kernel { #endif -enum class EltwiseImplType { - reference = 0, - optimized = 1, - optimizedShapeAgnostic = 2 -}; +enum class EltwiseImplType { reference = 0, optimized = 1, optimizedShapeAgnostic = 2 }; class Eltwise : public Node { public: class IEltwiseExecutor { public: IEltwiseExecutor() = default; - virtual void exec(const jit_eltwise_call_args_ptrs &args_ptrs, const VectorDims &dims_out) = 0; + virtual void exec(const jit_eltwise_call_args_ptrs& args_ptrs, const VectorDims& dims_out) = 0; virtual size_t getBatchDimIdx() const = 0; virtual const VectorDims& getOutDims() const = 0; virtual ~IEltwiseExecutor() = default; @@ -98,22 +95,45 @@ class Eltwise : public Node { bool canBeInPlace() const override; bool canFuseParent(const NodePtr& parentNode) const; bool canFuse(const NodePtr& node) const override; - void appendPostOps(dnnl::post_ops& ops, const VectorDims &postOpDims, std::unordered_map& postOpsMem, const int channelAxis = 1) override; - void appendPostOps(dnnl::post_ops& ops, const VectorDims &postOpDims, std::vector& postOpsMem, const int channelAxis = 1) override; - bool appendAttrPostOps(DnnlPostOpsComposerLegacy& dnnlpoc, 
bool isLastPostOp, dnnl::memory::data_type outDataType, bool allowBinary = true); + void appendPostOps(dnnl::post_ops& ops, + const VectorDims& postOpDims, + std::unordered_map& postOpsMem, + const int channelAxis = 1) override; + void appendPostOps(dnnl::post_ops& ops, + const VectorDims& postOpDims, + std::vector& postOpsMem, + const int channelAxis = 1) override; + bool appendAttrPostOps(DnnlPostOpsComposerLegacy& dnnlpoc, + bool isLastPostOp, + dnnl::memory::data_type outDataType, + bool allowBinary = true); void fuseInto(NodePtr& parentNode) override; ov::element::Type getRuntimePrecision() const override; - float getAlpha() const { return alpha; } - float getBeta() const { return beta; } - float getGamma() const { return gamma; } - const std::vector& getScales() const { return scales; } - const std::vector& getShifts() const { return shifts; } + float getAlpha() const { + return alpha; + } + float getBeta() const { + return beta; + } + float getGamma() const { + return gamma; + } + const std::vector& getScales() const { + return scales; + } + const std::vector& getShifts() const { + return shifts; + } - dnnl::algorithm getOneDnnAlgorithm() const { return onednnAlgorithm; } + dnnl::algorithm getOneDnnAlgorithm() const { + return onednnAlgorithm; + } bool isWithBroadcast(); - bool isSpecialConvolutionAddFusing() const { return specialConvolutionAddFusing; } + bool isSpecialConvolutionAddFusing() const { + return specialConvolutionAddFusing; + } bool needPrepareParams() const override; void prepareParams() override; @@ -127,7 +147,9 @@ class Eltwise : public Node { Undefined, }; - BroadcastingPolicy getBroadcastingPolicy() const { return broadcastingPolicy; } + BroadcastingPolicy getBroadcastingPolicy() const { + return broadcastingPolicy; + } static bool isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept; @@ -181,10 +203,13 @@ class Eltwise : public Node { size_t getOpInputsNum() const; template - void appendPostOpsImpl(dnnl::post_ops& ops, const VectorDims &postOpDims, std::vector& postOpsMem, const int channelAxis = 1); + void appendPostOpsImpl(dnnl::post_ops& ops, + const VectorDims& postOpDims, + std::vector& postOpsMem, + const int channelAxis = 1); - void appendMemory(const std::vector &data, MemoryPtr &memPtr, std::vector& postOpsMem); - void appendMemory(const std::vector &data, MemoryPtr &memPtr, std::vector& postOpsMem); + void appendMemory(const std::vector& data, MemoryPtr& memPtr, std::vector& postOpsMem); + void appendMemory(const std::vector& data, MemoryPtr& memPtr, std::vector& postOpsMem); bool canUseEltwiseExecPtr = false; EltwiseAttrs eltwiseAttrs; @@ -201,6 +226,6 @@ class eltwise_precision_helper { static std::set> get_supported_precisions(const Algorithm& algo); }; -} // namespace node -} // namespace intel_cpu -} // namespace ov +} // namespace node +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/embedding_bag.cpp b/src/plugins/intel_cpu/src/nodes/embedding_bag.cpp index 8b144e90c865bc..2dcb93f9fc6c1b 100644 --- a/src/plugins/intel_cpu/src/nodes/embedding_bag.cpp +++ b/src/plugins/intel_cpu/src/nodes/embedding_bag.cpp @@ -18,10 +18,10 @@ namespace intel_cpu { namespace node { EmbeddingBag::EmbeddingBag(const std::shared_ptr& op, - size_t requiredInputNum, - size_t indicesIdx, - size_t perSampleWeightsIdx, - size_t defaultIndexIdx) + size_t requiredInputNum, + size_t indicesIdx, + size_t perSampleWeightsIdx, + size_t defaultIndexIdx) : INDICES_IDX(indicesIdx), 
PER_SAMPLE_WEIGHTS_IDX(perSampleWeightsIdx), DEFAULT_INDEX_IDX(defaultIndexIdx) { @@ -47,9 +47,9 @@ void EmbeddingBag::prepareParams(const VectorDims& indexStaticShape) { template void EmbeddingBag::processData(const T* srcData, - const T* weightsData, - const VectorDims& inDataDims, - const MemoryPtr& outMemory) { + const T* weightsData, + const VectorDims& inDataDims, + const MemoryPtr& outMemory) { std::string msgPrefix = std::string("Node EmbeddingBag with name '") + _layerName + "' "; initFromInputs(); @@ -127,10 +127,10 @@ void EmbeddingBag::processData(const T* srcData, } void EmbeddingBag::execute(const uint8_t* srcData, - const uint8_t* weightsData, - const ov::element::Type& srcPrc, - const VectorDims& inDims, - const MemoryPtr& outMemory) { + const uint8_t* weightsData, + const ov::element::Type& srcPrc, + const VectorDims& inDims, + const MemoryPtr& outMemory) { switch (srcPrc) { case ov::element::f32: { return processData::value_type>( @@ -157,8 +157,7 @@ void EmbeddingBag::execute(const uint8_t* srcData, outMemory); } default: { - OPENVINO_THROW("EmbeddingBag layer does not support precision '" + std::string(srcPrc.get_type_name()) + - "'"); + OPENVINO_THROW("EmbeddingBag layer does not support precision '" + std::string(srcPrc.get_type_name()) + "'"); } } } diff --git a/src/plugins/intel_cpu/src/nodes/embedding_bag.h b/src/plugins/intel_cpu/src/nodes/embedding_bag.h index 28c8666233fa1a..d804ea06c2b317 100644 --- a/src/plugins/intel_cpu/src/nodes/embedding_bag.h +++ b/src/plugins/intel_cpu/src/nodes/embedding_bag.h @@ -13,32 +13,32 @@ namespace node { class EmbeddingBag { public: enum class Reduction { SUM, MEAN }; - EmbeddingBag( - const std::shared_ptr&, - size_t requiredInputsNum, - size_t indicesIdx, - size_t perSampleWeightsIdx, - size_t defaultIndexIdx); - - void execute(const uint8_t* srcData, const uint8_t* weightsData, const ov::element::Type &srcPrc, - const VectorDims& inDims, const MemoryPtr& outMemory); + EmbeddingBag(const std::shared_ptr&, + size_t requiredInputsNum, + size_t indicesIdx, + size_t perSampleWeightsIdx, + size_t defaultIndexIdx); + + void execute(const uint8_t* srcData, + const uint8_t* weightsData, + const ov::element::Type& srcPrc, + const VectorDims& inDims, + const MemoryPtr& outMemory); ~EmbeddingBag() = default; protected: virtual void initFromInputs() = 0; - virtual void getIndices( - size_t embIndex, - const int*& indicesRef, - size_t& size, - int& weightsIdx, - bool& withWeights) = 0; + virtual void getIndices(size_t embIndex, + const int*& indicesRef, + size_t& size, + int& weightsIdx, + bool& withWeights) = 0; void prepareParams(const VectorDims& indexStaticShape); - template - void processData(const T* srcData, const T* weightsData, - const VectorDims& inDataDims, const MemoryPtr& outMemory); + template + void processData(const T* srcData, const T* weightsData, const VectorDims& inDataDims, const MemoryPtr& outMemory); const size_t EMB_TABLE_IDX = 0lu; const size_t INDICES_IDX; @@ -51,6 +51,6 @@ class EmbeddingBag { std::string _layerName; }; -} // namespace node -} // namespace intel_cpu -} // namespace ov +} // namespace node +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/embedding_bag_offsets.cpp b/src/plugins/intel_cpu/src/nodes/embedding_bag_offsets.cpp index b5fbaee982808d..8da557a823a948 100644 --- a/src/plugins/intel_cpu/src/nodes/embedding_bag_offsets.cpp +++ b/src/plugins/intel_cpu/src/nodes/embedding_bag_offsets.cpp @@ -2,24 +2,27 @@ // SPDX-License-Identifier: Apache-2.0 // 
+#include "embedding_bag_offsets.h" + #include -#include #include -#include "embedding_bag_offsets.h" -#include "openvino/op/embeddingbag_offsets_sum.hpp" -#include "openvino/op/embeddingbag_offsets.hpp" +#include +#include "openvino/op/embeddingbag_offsets.hpp" +#include "openvino/op/embeddingbag_offsets_sum.hpp" namespace ov { namespace intel_cpu { namespace node { -bool EmbeddingBagOffset::isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept { +bool EmbeddingBagOffset::isSupportedOperation(const std::shared_ptr& op, + std::string& errorMessage) noexcept { try { const auto embBagOffsetSumOp = ov::as_type_ptr(op); const auto embBagOffsetOp = ov::as_type_ptr(op); if (!embBagOffsetSumOp && !embBagOffsetOp) { - errorMessage = "Node is not an instance of the v3::EmbeddingBagOffsetsSum or v15::EmbeddingBagOffsets operation."; + errorMessage = + "Node is not an instance of the v3::EmbeddingBagOffsetsSum or v15::EmbeddingBagOffsets operation."; return false; } } catch (...) { @@ -46,7 +49,8 @@ EmbeddingBagOffset::EmbeddingBagOffset(const std::shared_ptr& op, cons _reduction = Reduction::MEAN; break; default: - THROW_CPU_NODE_ERR("EmbeddingBagOffsets does not support reduction mode: ", ov::as_string(offsets_op->get_reduction())); + THROW_CPU_NODE_ERR("EmbeddingBagOffsets does not support reduction mode: ", + ov::as_string(offsets_op->get_reduction())); } } if (getInputShapeAtPort(INDICES_IDX).getRank() != 1ul) @@ -61,8 +65,10 @@ void EmbeddingBagOffset::initSupportedPrimitiveDescriptors() { return; std::string logPrefix = std::string("Layer EmbeddingBag with name '") + _layerName + "' "; - static const std::set supportedPrecisions = - {ov::element::f32, ov::element::i8, ov::element::u8, ov::element::i32}; + static const std::set supportedPrecisions = {ov::element::f32, + ov::element::i8, + ov::element::u8, + ov::element::i32}; auto inDataPrecision = getOriginalInputPrecisionAtPort(EMB_TABLE_IDX); if (one_of(inDataPrecision, ov::element::bf16, ov::element::f16)) @@ -71,8 +77,10 @@ void EmbeddingBagOffset::initSupportedPrimitiveDescriptors() { if (supportedPrecisions.find(inDataPrecision) == supportedPrecisions.end()) OPENVINO_THROW(logPrefix, "has unsupported precision: ", inDataPrecision.get_type_name()); } else { - static const std::set defaultSupportedPrecisions = - {ov::element::f32, ov::element::i8, ov::element::u8, ov::element::i32}; + static const std::set defaultSupportedPrecisions = {ov::element::f32, + ov::element::i8, + ov::element::u8, + ov::element::i32}; if (defaultSupportedPrecisions.find(inDataPrecision) == defaultSupportedPrecisions.end()) OPENVINO_THROW(logPrefix, "has unsupported precision: ", inDataPrecision.get_type_name()); } @@ -103,7 +111,11 @@ void EmbeddingBagOffset::initFromInputs() { } } -void EmbeddingBagOffset::getIndices(size_t embIndex, const int*& indices, size_t& size, int& weightsIdx, bool& withWeight) { +void EmbeddingBagOffset::getIndices(size_t embIndex, + const int*& indices, + size_t& size, + int& weightsIdx, + bool& withWeight) { if (static_cast(embIndex) >= _offsetsLen) { OPENVINO_THROW("Invalid embedding bag index."); } @@ -145,20 +157,23 @@ bool EmbeddingBagOffset::isExecutable() const { } void EmbeddingBagOffset::execute(dnnl::stream strm) { - const auto *srcData = getSrcDataAtPortAs(0); + const auto* srcData = getSrcDataAtPortAs(0); const uint8_t* weightsData = nullptr; if (_withWeights) weightsData = getSrcDataAtPortAs(PER_SAMPLE_WEIGHTS_IDX); - const auto &inputMem = getParentEdgeAt(0)->getMemory(); - 
EmbeddingBag::execute(srcData, weightsData, inputMem.getDesc().getPrecision(), - inputMem.getStaticDims(), getDstMemoryAtPort(0)); + const auto& inputMem = getParentEdgeAt(0)->getMemory(); + EmbeddingBag::execute(srcData, + weightsData, + inputMem.getDesc().getPrecision(), + inputMem.getStaticDims(), + getDstMemoryAtPort(0)); } bool EmbeddingBagOffset::created() const { return getType() == Type::EmbeddingBagOffsets; } -} // namespace node -} // namespace intel_cpu -} // namespace ov +} // namespace node +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/embedding_bag_offsets.h b/src/plugins/intel_cpu/src/nodes/embedding_bag_offsets.h index a31b518e7891a9..f8a28152a26642 100644 --- a/src/plugins/intel_cpu/src/nodes/embedding_bag_offsets.h +++ b/src/plugins/intel_cpu/src/nodes/embedding_bag_offsets.h @@ -15,7 +15,7 @@ class EmbeddingBagOffset : public Node, public EmbeddingBag { public: EmbeddingBagOffset(const std::shared_ptr& op, const GraphContext::CPtr context); - void getSupportedDescriptors() override {}; + void getSupportedDescriptors() override{}; void initSupportedPrimitiveDescriptors() override; void execute(dnnl::stream strm) override; bool created() const override; @@ -41,6 +41,6 @@ class EmbeddingBagOffset : public Node, public EmbeddingBag { size_t _offsetsLen = 0; }; -} // namespace node -} // namespace intel_cpu -} // namespace ov +} // namespace node +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/embedding_bag_packed.cpp b/src/plugins/intel_cpu/src/nodes/embedding_bag_packed.cpp index fd2e0b6141f1fc..c1a06835a67af3 100644 --- a/src/plugins/intel_cpu/src/nodes/embedding_bag_packed.cpp +++ b/src/plugins/intel_cpu/src/nodes/embedding_bag_packed.cpp @@ -2,23 +2,27 @@ // SPDX-License-Identifier: Apache-2.0 // +#include "embedding_bag_packed.h" + #include -#include #include -#include "embedding_bag_packed.h" -#include "openvino/op/embeddingbag_packedsum.hpp" +#include + #include "openvino/op/embeddingbag_packed.hpp" +#include "openvino/op/embeddingbag_packedsum.hpp" namespace ov { namespace intel_cpu { namespace node { -bool EmbeddingBagPacked::isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept { +bool EmbeddingBagPacked::isSupportedOperation(const std::shared_ptr& op, + std::string& errorMessage) noexcept { try { const auto embBagPackedSumOp = ov::as_type_ptr(op); const auto embBagPackedOp = ov::as_type_ptr(op); if (!embBagPackedSumOp && !embBagPackedOp) { - errorMessage = "Node is not an instance of the v3::EmbeddingBagPackedSum or v15::EmbeddingBagPacked operations."; + errorMessage = + "Node is not an instance of the v3::EmbeddingBagPackedSum or v15::EmbeddingBagPacked operations."; return false; } } catch (...) 
{ @@ -45,7 +49,8 @@ EmbeddingBagPacked::EmbeddingBagPacked(const std::shared_ptr& op, cons _reduction = Reduction::MEAN; break; default: - THROW_CPU_NODE_ERR("EmbeddingBagPacked does not support reduction mode: ", ov::as_string(packed_op->get_reduction())); + THROW_CPU_NODE_ERR("EmbeddingBagPacked does not support reduction mode: ", + ov::as_string(packed_op->get_reduction())); } } if (getInputShapeAtPort(INDICES_IDX).getRank() != 2ul) @@ -57,8 +62,10 @@ void EmbeddingBagPacked::initSupportedPrimitiveDescriptors() { return; std::string logPrefix = std::string("Layer EmbeddingBag with name '") + _layerName + "' "; - static const std::set supportedPrecisions = - {ov::element::f32, ov::element::i8, ov::element::u8, ov::element::i32}; + static const std::set supportedPrecisions = {ov::element::f32, + ov::element::i8, + ov::element::u8, + ov::element::i32}; auto inDataPrecision = getOriginalInputPrecisionAtPort(EMB_TABLE_IDX); if (one_of(inDataPrecision, ov::element::bf16, ov::element::f16)) @@ -67,14 +74,16 @@ void EmbeddingBagPacked::initSupportedPrimitiveDescriptors() { if (supportedPrecisions.find(inDataPrecision) == supportedPrecisions.end()) OPENVINO_THROW(logPrefix, "has unsupported precision: ", inDataPrecision.get_type_name()); } else { - static const std::set defaultSupportedPrecisions = - {ov::element::f32, ov::element::i8, ov::element::u8, ov::element::i32}; + static const std::set defaultSupportedPrecisions = {ov::element::f32, + ov::element::i8, + ov::element::u8, + ov::element::i32}; if (defaultSupportedPrecisions.find(inDataPrecision) == defaultSupportedPrecisions.end()) OPENVINO_THROW(logPrefix, "has unsupported precision: ", inDataPrecision.get_type_name()); } - std::vector inDataConfigurators({{LayoutType::ncsp, inDataPrecision}, - {LayoutType::ncsp, ov::element::i32}}); + std::vector inDataConfigurators( + {{LayoutType::ncsp, inDataPrecision}, {LayoutType::ncsp, ov::element::i32}}); if (inputShapes.size() > PER_SAMPLE_WEIGHTS_IDX) inDataConfigurators.push_back({LayoutType::ncsp, inDataPrecision}); @@ -91,7 +100,11 @@ void EmbeddingBagPacked::initFromInputs() { _indices = getSrcDataAtPortAs(INDICES_IDX); } -void EmbeddingBagPacked::getIndices(size_t embIndex, const int*& indices, size_t& size, int& weightsIdx, bool& withWeight) { +void EmbeddingBagPacked::getIndices(size_t embIndex, + const int*& indices, + size_t& size, + int& weightsIdx, + bool& withWeight) { if (static_cast(embIndex) >= _batch * _indicesPerBag) OPENVINO_THROW("Invalid embedding bag index."); @@ -112,20 +125,23 @@ bool EmbeddingBagPacked::isExecutable() const { } void EmbeddingBagPacked::execute(dnnl::stream strm) { - const auto *srcData = getSrcDataAtPortAs(0); + const auto* srcData = getSrcDataAtPortAs(0); const uint8_t* weightsData = nullptr; if (_withWeights) weightsData = getSrcDataAtPortAs(PER_SAMPLE_WEIGHTS_IDX); - const auto &inputMem = getParentEdgeAt(0)->getMemory(); - EmbeddingBag::execute(srcData, weightsData, inputMem.getDesc().getPrecision(), - inputMem.getStaticDims(), getDstMemoryAtPort(0)); + const auto& inputMem = getParentEdgeAt(0)->getMemory(); + EmbeddingBag::execute(srcData, + weightsData, + inputMem.getDesc().getPrecision(), + inputMem.getStaticDims(), + getDstMemoryAtPort(0)); } bool EmbeddingBagPacked::created() const { return getType() == Type::EmbeddingBagPacked; } -} // namespace node -} // namespace intel_cpu -} // namespace ov +} // namespace node +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/embedding_bag_packed.h 
b/src/plugins/intel_cpu/src/nodes/embedding_bag_packed.h index 6a9d33fe3afccb..a018d1b48929e1 100644 --- a/src/plugins/intel_cpu/src/nodes/embedding_bag_packed.h +++ b/src/plugins/intel_cpu/src/nodes/embedding_bag_packed.h @@ -15,7 +15,7 @@ class EmbeddingBagPacked : public Node, public EmbeddingBag { public: EmbeddingBagPacked(const std::shared_ptr& op, const GraphContext::CPtr context); - void getSupportedDescriptors() override {}; + void getSupportedDescriptors() override{}; void initSupportedPrimitiveDescriptors() override; void execute(dnnl::stream strm) override; bool created() const override; @@ -36,6 +36,6 @@ class EmbeddingBagPacked : public Node, public EmbeddingBag { size_t _indicesPerBag = 0; }; -} // namespace node -} // namespace intel_cpu -} // namespace ov +} // namespace node +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/embedding_segments_sum.cpp b/src/plugins/intel_cpu/src/nodes/embedding_segments_sum.cpp index 2a012c6b941831..8bd91799834bad 100644 --- a/src/plugins/intel_cpu/src/nodes/embedding_segments_sum.cpp +++ b/src/plugins/intel_cpu/src/nodes/embedding_segments_sum.cpp @@ -2,17 +2,20 @@ // SPDX-License-Identifier: Apache-2.0 // +#include "embedding_segments_sum.h" + #include -#include #include -#include "embedding_segments_sum.h" +#include + #include "openvino/opsets/opset3.hpp" namespace ov { namespace intel_cpu { namespace node { -bool EmbeddingSegmentsSum::isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept { +bool EmbeddingSegmentsSum::isSupportedOperation(const std::shared_ptr& op, + std::string& errorMessage) noexcept { try { const auto embBagSegSumOp = ov::as_type_ptr(op); if (!embBagSegSumOp) { @@ -46,8 +49,10 @@ void EmbeddingSegmentsSum::initSupportedPrimitiveDescriptors() { return; std::string logPrefix = std::string("Layer EmbeddingBag with name '") + _layerName + "' "; - static const std::set supportedPrecisions = - {ov::element::f32, ov::element::i8, ov::element::u8, ov::element::i32}; + static const std::set supportedPrecisions = {ov::element::f32, + ov::element::i8, + ov::element::u8, + ov::element::i32}; auto inDataPrecision = getOriginalInputPrecisionAtPort(EMB_TABLE_IDX); if (one_of(inDataPrecision, ov::element::bf16, ov::element::f16)) @@ -56,8 +61,10 @@ void EmbeddingSegmentsSum::initSupportedPrimitiveDescriptors() { if (supportedPrecisions.find(inDataPrecision) == supportedPrecisions.end()) OPENVINO_THROW(logPrefix, "has unsupported precision: ", inDataPrecision.get_type_name()); } else { - static const std::set defaultSupportedPrecisions = - {ov::element::f32, ov::element::i8, ov::element::u8, ov::element::i32}; + static const std::set defaultSupportedPrecisions = {ov::element::f32, + ov::element::i8, + ov::element::u8, + ov::element::i32}; if (defaultSupportedPrecisions.find(inDataPrecision) == defaultSupportedPrecisions.end()) OPENVINO_THROW(logPrefix, "has unsupported precision: ", inDataPrecision.get_type_name()); } @@ -90,7 +97,11 @@ void EmbeddingSegmentsSum::initFromInputs() { } } -void EmbeddingSegmentsSum::getIndices(size_t embIndex, const int*& indices, size_t& size, int& weightsIdx, bool& withWeight) { +void EmbeddingSegmentsSum::getIndices(size_t embIndex, + const int*& indices, + size_t& size, + int& weightsIdx, + bool& withWeight) { if (embIndex >= static_cast(lastNumSegments_)) OPENVINO_THROW("Invalid embedding bag index."); @@ -143,20 +154,23 @@ bool EmbeddingSegmentsSum::isExecutable() const { } void EmbeddingSegmentsSum::execute(dnnl::stream strm) { 
- const auto *srcData = getSrcDataAtPortAs(0); + const auto* srcData = getSrcDataAtPortAs(0); const uint8_t* weightsData = nullptr; if (_withWeights) weightsData = getSrcDataAtPortAs(PER_SAMPLE_WEIGHTS_IDX); - const auto &inputMem = getParentEdgeAt(0)->getMemory(); - EmbeddingBag::execute(srcData, weightsData, inputMem.getDesc().getPrecision(), - inputMem.getStaticDims(), getDstMemoryAtPort(0)); + const auto& inputMem = getParentEdgeAt(0)->getMemory(); + EmbeddingBag::execute(srcData, + weightsData, + inputMem.getDesc().getPrecision(), + inputMem.getStaticDims(), + getDstMemoryAtPort(0)); } bool EmbeddingSegmentsSum::created() const { return getType() == Type::EmbeddingSegmentsSum; } -} // namespace node -} // namespace intel_cpu -} // namespace ov +} // namespace node +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/embedding_segments_sum.h b/src/plugins/intel_cpu/src/nodes/embedding_segments_sum.h index bb312b4dd47246..984b9de68690b2 100644 --- a/src/plugins/intel_cpu/src/nodes/embedding_segments_sum.h +++ b/src/plugins/intel_cpu/src/nodes/embedding_segments_sum.h @@ -15,7 +15,7 @@ class EmbeddingSegmentsSum : public Node, public EmbeddingBag { public: EmbeddingSegmentsSum(const std::shared_ptr& op, const GraphContext::CPtr context); - void getSupportedDescriptors() override {}; + void getSupportedDescriptors() override{}; void initSupportedPrimitiveDescriptors() override; void execute(dnnl::stream strm) override; bool created() const override; @@ -45,6 +45,6 @@ class EmbeddingSegmentsSum : public Node, public EmbeddingBag { size_t indicesSize_ = 0; }; -} // namespace node -} // namespace intel_cpu -} // namespace ov +} // namespace node +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/executors/aarch64/jit_eltwise.cpp b/src/plugins/intel_cpu/src/nodes/executors/aarch64/jit_eltwise.cpp index 660db85cd61529..86d090a858fd7b 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/aarch64/jit_eltwise.cpp +++ b/src/plugins/intel_cpu/src/nodes/executors/aarch64/jit_eltwise.cpp @@ -3,6 +3,7 @@ // #include "jit_eltwise.hpp" + #include namespace ov { @@ -10,13 +11,12 @@ namespace intel_cpu { namespace executors { namespace aarch64 { -bool JitEltwiseExecutor::isSupported( - const Algorithm& algorithm, - const std::vector& input_precisions, - const std::vector& output_precisions, - const float alpha, - const float beta, - const float gamma) { +bool JitEltwiseExecutor::isSupported(const Algorithm& algorithm, + const std::vector& input_precisions, + const std::vector& output_precisions, + const float alpha, + const float beta, + const float gamma) { const auto is_supported = one_of(algorithm, Algorithm::EltwiseAbs, Algorithm::EltwiseAdd, @@ -26,6 +26,7 @@ bool JitEltwiseExecutor::isSupported( Algorithm::EltwiseEqual, Algorithm::EltwiseExp, Algorithm::EltwiseFloor, + Algorithm::EltwiseFloorMod, Algorithm::EltwiseCeiling, Algorithm::EltwiseGeluErf, Algorithm::EltwiseGeluTanh, @@ -37,6 +38,7 @@ bool JitEltwiseExecutor::isSupported( Algorithm::EltwiseIsNaN, Algorithm::EltwiseLessEqual, Algorithm::EltwiseLogicalAnd, + Algorithm::EltwiseLogicalOr, Algorithm::EltwiseLogicalNot, Algorithm::EltwiseLogicalXor, Algorithm::EltwiseMaximum, @@ -48,6 +50,8 @@ bool JitEltwiseExecutor::isSupported( Algorithm::EltwisePowerStatic, Algorithm::EltwisePrelu, Algorithm::EltwiseRelu, + Algorithm::EltwiseRoundHalfAwayFromZero, + Algorithm::EltwiseRoundHalfToEven, Algorithm::EltwiseSelect, Algorithm::EltwiseSigmoid, Algorithm::EltwiseSoftSign, @@ 
-63,10 +67,9 @@ bool JitEltwiseExecutor::isSupported( return false; } - const auto check_precisions = []( - const std::vector& input_precisions, - const std::vector& output_precisions, - const std::set& supported_precisions) { + const auto check_precisions = [](const std::vector& input_precisions, + const std::vector& output_precisions, + const std::set& supported_precisions) { if (std::any_of(input_precisions.begin(), input_precisions.end(), [&supported_precisions](const ov::element::Type& precision) { @@ -88,15 +91,13 @@ bool JitEltwiseExecutor::isSupported( const std::set supported_precisions = // Divide and Floor (issue #138629) operations are supported for fp32 and fp16 only. - ((algorithm == Algorithm::EltwiseDivide) || (algorithm == Algorithm::EltwiseFloor)) ? - std::set { ov::element::f16, ov::element::f32 } : - std::set { - ov::element::f16, - ov::element::f32, - ov::element::i32, - ov::element::i8, - ov::element::u8 - }; + ((algorithm == Algorithm::EltwiseDivide) || (algorithm == Algorithm::EltwiseFloor)) + ? std::set{ov::element::f16, ov::element::f32} + : std::set{ov::element::f16, + ov::element::f32, + ov::element::i32, + ov::element::i8, + ov::element::u8}; if (!check_precisions(input_precisions, output_precisions, supported_precisions)) { return false; @@ -107,20 +108,20 @@ bool JitEltwiseExecutor::isSupported( JitEltwiseExecutor::JitEltwiseExecutor(const ExecutorContext::CPtr context) : EltwiseExecutor(context) {} -bool JitEltwiseExecutor::init(const EltwiseAttrs &eltwiseAttrs, - const std::vector &srcDescs, - const std::vector &dstDescs, - const std::vector &postOps) { +bool JitEltwiseExecutor::init(const EltwiseAttrs& eltwiseAttrs, + const std::vector& srcDescs, + const std::vector& dstDescs, + const std::vector& postOps) { return true; } -void JitEltwiseExecutor::exec(const std::vector &src, - const std::vector &dst, - const void *post_ops_data_) { +void JitEltwiseExecutor::exec(const std::vector& src, + const std::vector& dst, + const void* post_ops_data_) { exec_func(); } -} // namespace aarch64 -} // namespace executors -} // namespace intel_cpu -} // namespace ov +} // namespace aarch64 +} // namespace executors +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/executors/aarch64/jit_eltwise.hpp b/src/plugins/intel_cpu/src/nodes/executors/aarch64/jit_eltwise.hpp index 5244a80b542fe5..adaaea6a738c7a 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/aarch64/jit_eltwise.hpp +++ b/src/plugins/intel_cpu/src/nodes/executors/aarch64/jit_eltwise.hpp @@ -5,8 +5,8 @@ #pragma once #include "cpu_types.h" -#include "nodes/executors/eltwise.hpp" #include "node.h" +#include "nodes/executors/eltwise.hpp" namespace ov { namespace intel_cpu { @@ -17,13 +17,12 @@ class JitEltwiseExecutor : public EltwiseExecutor { public: explicit JitEltwiseExecutor(const ExecutorContext::CPtr context); - static bool isSupported( - const Algorithm& algorithm, - const std::vector& input_precisions, - const std::vector& output_precisions, - const float alpha, - const float beta, - const float gamma); + static bool isSupported(const Algorithm& algorithm, + const std::vector& input_precisions, + const std::vector& output_precisions, + const float alpha, + const float beta, + const float gamma); bool init(const EltwiseAttrs& eltwiseAttrs, const std::vector& srcDescs, @@ -32,7 +31,7 @@ class JitEltwiseExecutor : public EltwiseExecutor { void exec(const std::vector& src, const std::vector& dst, - const void *post_ops_data_) override; + const void* post_ops_data_) 
override; impl_desc_type getImplType() const override { return impl_desc_type::asimd; @@ -42,7 +41,7 @@ class JitEltwiseExecutor : public EltwiseExecutor { std::function exec_func; }; -} // namespace aarch64 -} // namespace executors -} // namespace intel_cpu -} // namespace ov +} // namespace aarch64 +} // namespace executors +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_common_executor.cpp b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_common_executor.cpp index 646cf47c1bcf6c..23933d7e7563b3 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_common_executor.cpp +++ b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_common_executor.cpp @@ -3,6 +3,7 @@ // #include "acl_common_executor.hpp" + #include "acl_utils.hpp" #include "nodes/executors/memory_arguments.hpp" #include "utils/debug_capabilities.h" @@ -10,16 +11,15 @@ namespace ov { namespace intel_cpu { -static const std::unordered_map argConvert = { - {ARG_SRC_0, ACL_SRC_0}, - {ARG_SRC_1, ACL_SRC_1}, - {ARG_SRC_2, ACL_SRC_2}, - {ARG_BIAS, ACL_BIAS}, - {ARG_WEI, ACL_WEI}, - {ARG_DST, ACL_DST}, -}; +static const std::unordered_map argConvert = {{ARG_SRC_0, ACL_SRC_0}, + {ARG_SRC_1, ACL_SRC_1}, + {ARG_SRC_2, ACL_SRC_2}, + {ARG_BIAS, ACL_BIAS}, + {ARG_WEI, ACL_WEI}, + {ARG_DST, ACL_DST}, + {ARG_DST_DEQ_SCALE, ACL_DST_DEQ_SCALE}}; -using ACLTypes = std::array; +using ACLTypes = std::array; using ACLLayouts = std::array; static void initACLTensorParams(const MemoryPtr& memoryPtr, @@ -38,15 +38,12 @@ static void initACLTensorParams(const MemoryPtr& memoryPtr, } } -static std::shared_ptr initTensorInfo(const arm_compute::TensorShape& tensorShape, - const arm_compute::DataType& dataType, - const arm_compute::DataLayout& dataLayout) { +std::shared_ptr ACLCommonExecutor::initTensorInfo(const arm_compute::TensorShape& tensorShape, + const arm_compute::DataType& dataType, + const arm_compute::DataLayout& dataLayout) { std::shared_ptr aclMemoryInfo = nullptr; if (dataType != arm_compute::DataType::UNKNOWN) { - aclMemoryInfo = std::make_shared( - tensorShape, 1, - dataType, - dataLayout); + aclMemoryInfo = std::make_shared(tensorShape, 1, dataType, dataLayout); } return aclMemoryInfo; } @@ -66,14 +63,18 @@ ACLCommonExecutor::ACLCommonExecutor() { } } -bool ACLCommonExecutor::update(const MemoryArgs &memory) { +bool ACLCommonExecutor::update(const MemoryArgs& memory) { // Initialize ACL tensors params - ACLShapes aclMemoryShapes; - ACLTypes aclDataType{}; + ACLShapes aclMemoryShapes; + ACLTypes aclDataType{}; ACLLayouts aclDataLayout{}; for (auto& cpu_mem_ptr : memory) { + if (cpu_mem_ptr.second->getSize() == 0) { + continue; + } const ACLArgs index = argConvert.at(cpu_mem_ptr.first); - initACLTensorParams(cpu_mem_ptr.second, aclTensorAttrs, + initACLTensorParams(cpu_mem_ptr.second, + aclTensorAttrs, aclMemoryShapes[index], aclDataType[index], aclDataLayout[index]); @@ -110,7 +111,7 @@ bool ACLCommonExecutor::update(const MemoryArgs &memory) { return true; } -void ACLCommonExecutor::execute(const MemoryArgs &memory) { +void ACLCommonExecutor::execute(const MemoryArgs& memory) { // TODO: Move import_memory() to update() function - CVS-145871 for (auto& cpu_mem_ptr : memory) { const ACLArgs index = argConvert.at(cpu_mem_ptr.first); @@ -129,5 +130,5 @@ ACLCommonExecutor::~ACLCommonExecutor() { } } -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_common_executor.hpp 
b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_common_executor.hpp index 1a5a00c7a85f7a..650fc5b8c2c7e8 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_common_executor.hpp +++ b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_common_executor.hpp @@ -4,27 +4,19 @@ #pragma once +#include "arm_compute/runtime/NEON/NEFunctions.h" #include "cpu_memory.h" #include "nodes/executors/executor.hpp" -#include "arm_compute/runtime/NEON/NEFunctions.h" namespace ov { namespace intel_cpu { -enum ACLArgs { - ACL_SRC_0, - ACL_SRC_1, - ACL_SRC_2, - ACL_BIAS, - ACL_WEI, - ACL_DST, - COUNT_OF_ARGS -}; +enum ACLArgs { ACL_SRC_0, ACL_SRC_1, ACL_SRC_2, ACL_BIAS, ACL_WEI, ACL_DST, ACL_DST_DEQ_SCALE, COUNT_OF_ARGS }; using ACLFunction = std::unique_ptr; -using ACLShapes = std::array; -using ACLInfos = std::array, ACLArgs::COUNT_OF_ARGS>; -using ACLTensors = std::array, ACLArgs::COUNT_OF_ARGS>; +using ACLShapes = std::array; +using ACLInfos = std::array, ACLArgs::COUNT_OF_ARGS>; +using ACLTensors = std::array, ACLArgs::COUNT_OF_ARGS>; struct ACLTensorAttrs { bool hasLayoutTypeNHWC = false; @@ -50,6 +42,10 @@ class ACLCommonExecutor : public Executor { protected: ACLTensorAttrs aclTensorAttrs; + virtual std::shared_ptr initTensorInfo(const arm_compute::TensorShape& tensorShape, + const arm_compute::DataType& dataType, + const arm_compute::DataLayout& dataLayout); + private: ACLTensors aclMemoryTensors; ACLInfos aclMemoryInfos; diff --git a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_convert.cpp b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_convert.cpp index 1bc0585930387f..ed12b0b76a2c1e 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_convert.cpp +++ b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_convert.cpp @@ -3,6 +3,7 @@ // #include "acl_convert.hpp" + #include "acl_utils.hpp" namespace ov { @@ -10,7 +11,6 @@ namespace intel_cpu { using namespace arm_compute; - bool ACLConvertExecutor::init(const ConvertParams& convertParams, const MemoryDescPtr& srcDesc, const MemoryDescPtr& dstDesc, @@ -51,10 +51,14 @@ bool ACLConvertExecutor::init(const ConvertParams& convertParams, if (isCopyOp) { acl_copy = std::make_unique(); - configureThreadSafe([&] { acl_copy->configure(&srcTensor, &dstTensor); }); + configureThreadSafe([&] { + acl_copy->configure(&srcTensor, &dstTensor); + }); } else { acl_cast = std::make_unique(); - configureThreadSafe([&] { acl_cast->configure(&srcTensor, &dstTensor, ConvertPolicy::SATURATE); }); + configureThreadSafe([&] { + acl_cast->configure(&srcTensor, &dstTensor, ConvertPolicy::SATURATE); + }); } return true; } @@ -91,45 +95,34 @@ bool ACLConvertExecutorBuilder::isSupported(const ConvertParams& convertParams, DEBUG_LOG("NECopy does not support source precision: ", convertParams.srcPrc.to_string()); return false; } - if ((convertParams.srcPrc == ov::element::i8 && !one_of(convertParams.dstPrc, - ov::element::i16, - ov::element::i32, - ov::element::f16, - ov::element::f32)) || + if ((convertParams.srcPrc == ov::element::i8 && + !one_of(convertParams.dstPrc, ov::element::i16, ov::element::i32, ov::element::f16, ov::element::f32)) || (convertParams.srcPrc == ov::element::u8 && !one_of(convertParams.dstPrc, - ov::element::u16, - ov::element::i16, - ov::element::i32, - ov::element::f16, - ov::element::f32)) || - (convertParams.srcPrc == ov::element::u16 && !one_of(convertParams.dstPrc, - ov::element::u8, - ov::element::u32)) || - (convertParams.srcPrc == ov::element::i16 && !one_of(convertParams.dstPrc, - ov::element::i8, - ov::element::u8, - 
ov::element::i32)) || - (convertParams.srcPrc == ov::element::f16 && !one_of(convertParams.dstPrc, - ov::element::i8, - ov::element::f32, + ov::element::u16, + ov::element::i16, ov::element::i32, - ov::element::u8)) || - (convertParams.srcPrc == ov::element::i32 && !one_of(convertParams.dstPrc, - ov::element::i8, - ov::element::f16, - ov::element::f32, - ov::element::u8)) || - (convertParams.srcPrc == ov::element::f32 && !one_of(convertParams.dstPrc, - ov::element::bf16, ov::element::f16, - ov::element::i32))) { + ov::element::f32)) || + (convertParams.srcPrc == ov::element::u16 && + !one_of(convertParams.dstPrc, ov::element::u8, ov::element::u32)) || + (convertParams.srcPrc == ov::element::i16 && + !one_of(convertParams.dstPrc, ov::element::i8, ov::element::u8, ov::element::i32)) || + (convertParams.srcPrc == ov::element::f16 && + !one_of(convertParams.dstPrc, ov::element::i8, ov::element::f32, ov::element::i32, ov::element::u8)) || + (convertParams.srcPrc == ov::element::i32 && + !one_of(convertParams.dstPrc, ov::element::i8, ov::element::f16, ov::element::f32, ov::element::u8)) || + (convertParams.srcPrc == ov::element::f32 && + !one_of(convertParams.dstPrc, ov::element::bf16, ov::element::f16, ov::element::i32))) { DEBUG_LOG("NECopy does not support passed combination of source and destination precisions. ", - "source precision: ", convertParams.srcPrc.to_string(), " destination precsion: ", convertParams.dstPrc.to_string()); + "source precision: ", + convertParams.srcPrc.to_string(), + " destination precsion: ", + convertParams.dstPrc.to_string()); return false; } } return true; } -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_convert.hpp b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_convert.hpp index b81e34004f9f31..431f8ce6887cbe 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_convert.hpp +++ b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_convert.hpp @@ -4,9 +4,9 @@ #pragma once +#include "arm_compute/runtime/NEON/NEFunctions.h" #include "nodes/executors/convert.hpp" #include "utils/debug_capabilities.h" -#include "arm_compute/runtime/NEON/NEFunctions.h" namespace ov { namespace intel_cpu { @@ -17,9 +17,12 @@ class ACLConvertExecutor : public ConvertExecutor { bool init(const ConvertParams& convertParams, const MemoryDescPtr& srcDesc, const MemoryDescPtr& dstDesc, - const dnnl::primitive_attr &attr) override; + const dnnl::primitive_attr& attr) override; void exec(const std::vector& src, const std::vector& dst) override; - impl_desc_type implType() const override { return impl_desc_type::acl; }; + impl_desc_type implType() const override { + return impl_desc_type::acl; + }; + protected: ConvertParams aclConvertParams; bool isCopyOp; @@ -38,5 +41,5 @@ class ACLConvertExecutorBuilder : public ConvertExecutorBuilder { } }; -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_deconv.cpp b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_deconv.cpp index fa40b3a27322c2..cd5e935b41a2d5 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_deconv.cpp +++ b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_deconv.cpp @@ -3,6 +3,7 @@ // #include "acl_deconv.hpp" + #include "openvino/core/parallel.hpp" namespace ov { @@ -13,7 +14,9 @@ using namespace arm_compute; ACLDeconvTensorInfo getACLDeconvTensorInfo(const DeconvAttrs& deconvAttrs, const std::vector& 
srcDescs, const std::vector& dstDescs) { - auto func_mod = [](long a) -> unsigned int { return a < 0 ? 0 : a; }; + auto func_mod = [](long a) -> unsigned int { + return a < 0 ? 0 : a; + }; auto pad_l = deconvAttrs.paddingL.size() > 1 ? deconvAttrs.paddingL.at(1) : deconvAttrs.paddingL.at(0); auto pad_r = deconvAttrs.paddingR.size() > 1 ? deconvAttrs.paddingR.at(1) : deconvAttrs.paddingR.at(0); auto pad_t = deconvAttrs.paddingL.at(0); @@ -21,18 +24,30 @@ ACLDeconvTensorInfo getACLDeconvTensorInfo(const DeconvAttrs& deconvAttrs, unsigned int stride_x = (deconvAttrs.stride.size() > 1) ? deconvAttrs.stride.at(1) : deconvAttrs.stride.at(0); unsigned int stride_y = deconvAttrs.stride.at(0); - auto deconv_info = PadStrideInfo(stride_x, stride_y, func_mod(pad_l), func_mod(pad_r), func_mod(pad_t), func_mod(pad_b), DimensionRoundingType::FLOOR); - - auto srcDims = srcDescs[0]->getShape().getDims(); - auto weiDims = srcDescs[1]->getShape().getDims(); - auto dstDims = dstDescs[0]->getShape().getDims(); + auto deconv_info = PadStrideInfo(stride_x, + stride_y, + func_mod(pad_l), + func_mod(pad_r), + func_mod(pad_t), + func_mod(pad_b), + DimensionRoundingType::FLOOR); + + auto srcDims = srcDescs[0]->getShape().getDims(); + auto weiDims = srcDescs[1]->getShape().getDims(); + auto dstDims = dstDescs[0]->getShape().getDims(); // ACL can't work with custom output shape, this we make WA for that problem if (pad_l < 0 || pad_r < 0 || pad_t < 0 || pad_b < 0) { auto out_dims = deconvolution_output_dimensions(srcDims[3], srcDims[2], weiDims[3], weiDims[2], deconv_info); stride_x += (out_dims.first - dstDims[3] - 2 * (pad_l + pad_r)) / (srcDims[3] - 1); stride_y += (out_dims.second - dstDims[2] - 2 * (pad_t + pad_b)) / (srcDims[2] - 1); - deconv_info = PadStrideInfo(stride_x, stride_y, func_mod(pad_l), func_mod(pad_r), func_mod(pad_t), func_mod(pad_b), DimensionRoundingType::FLOOR); + deconv_info = PadStrideInfo(stride_x, + stride_y, + func_mod(pad_l), + func_mod(pad_r), + func_mod(pad_t), + func_mod(pad_b), + DimensionRoundingType::FLOOR); } std::swap(weiDims[0], weiDims[1]); @@ -59,16 +74,18 @@ ACLDeconvTensorInfo getACLDeconvTensorInfo(const DeconvAttrs& deconvAttrs, weiLayout = arm_compute::DataLayout::NHWC; } - TensorInfo srcTensorInfo = TensorInfo(srcVecDims, 1, - precisionToAclDataType(srcDescs[0]->getPrecision()), srcLayout); - TensorInfo weiTensorInfo = TensorInfo(weiVecDims, 1, - precisionToAclDataType(srcDescs[1]->getPrecision()), weiLayout); - TensorInfo dstTensorInfo = TensorInfo(dstVecDims, 1, - precisionToAclDataType(dstDescs[0]->getPrecision()), dstLayout); + TensorInfo srcTensorInfo = + TensorInfo(srcVecDims, 1, precisionToAclDataType(srcDescs[0]->getPrecision()), srcLayout); + TensorInfo weiTensorInfo = + TensorInfo(weiVecDims, 1, precisionToAclDataType(srcDescs[1]->getPrecision()), weiLayout); + TensorInfo dstTensorInfo = + TensorInfo(dstVecDims, 1, precisionToAclDataType(dstDescs[0]->getPrecision()), dstLayout); TensorInfo biasTensorInfo; if (deconvAttrs.withBiasesParam) { - biasTensorInfo = TensorInfo(biasVecDims, 1, - precisionToAclDataType(srcDescs[2]->getPrecision()), getAclDataLayoutByMemoryDesc(srcDescs[2])); + biasTensorInfo = TensorInfo(biasVecDims, + 1, + precisionToAclDataType(srcDescs[2]->getPrecision()), + getAclDataLayoutByMemoryDesc(srcDescs[2])); } return ACLDeconvTensorInfo{srcTensorInfo, weiTensorInfo, biasTensorInfo, dstTensorInfo, deconv_info}; @@ -77,9 +94,9 @@ ACLDeconvTensorInfo getACLDeconvTensorInfo(const DeconvAttrs& deconvAttrs, 
AclDeconvExecutor::AclDeconvExecutor(const ExecutorContext::CPtr context) : DeconvExecutor(context) {} bool AclDeconvExecutor::init(const DeconvAttrs& deconvAttrs, - const std::vector& srcDescs, - const std::vector& dstDescs, - const dnnl::primitive_attr &attr) { + const std::vector& srcDescs, + const std::vector& dstDescs, + const dnnl::primitive_attr& attr) { this->deconvAttrs = deconvAttrs; ACLDeconvTensorInfo aclDeconvTensorInfo = getACLDeconvTensorInfo(deconvAttrs, srcDescs, dstDescs); TensorInfo srcTensorInfo = aclDeconvTensorInfo.srcTensorInfo; @@ -99,12 +116,17 @@ bool AclDeconvExecutor::init(const DeconvAttrs& deconvAttrs, deconv = std::make_unique(); configureThreadSafe([&] { - deconv->configure(&srcTensor, &weiTensor, deconvAttrs.withBiasesParam ? &biasTensor : nullptr, &dstTensor, deconv_info, deconvAttrs.aclFastMath); + deconv->configure(&srcTensor, + &weiTensor, + deconvAttrs.withBiasesParam ? &biasTensor : nullptr, + &dstTensor, + deconv_info, + deconvAttrs.aclFastMath); }); return true; } -template +template static void transpose_weights(const MemoryCPtr& srcMemPtr, MemoryPtr& newSrcMemPtr, bool isNCHW) { const auto src_data = srcMemPtr->getDataAs(); const auto new_src_data = newSrcMemPtr->getDataAs(); @@ -118,29 +140,17 @@ static void transpose_weights(const MemoryCPtr& srcMemPtr, MemoryPtr& newSrcMemP if (isNCHW) { parallel_for3d(DIM0, DIM1, DIM2, [&](const int dim0, const int dim1, const int dim2) { for (int dim3 = 0; dim3 < DIM3; ++dim3) { - const int src_off = dim0 * DIM1 * DIM2 * DIM3 + - dim1 * DIM2 * DIM3 + - dim2 * DIM3 + - dim3; - const int dst_off = dim1 * DIM0 * DIM2 * DIM3 + - dim0 * DIM2 * DIM3 + - dim2 * DIM3 + - dim3; + const int src_off = dim0 * DIM1 * DIM2 * DIM3 + dim1 * DIM2 * DIM3 + dim2 * DIM3 + dim3; + const int dst_off = dim1 * DIM0 * DIM2 * DIM3 + dim0 * DIM2 * DIM3 + dim2 * DIM3 + dim3; new_src_data[dst_off] = src_data[src_off]; } }); - // 0231 -> 1230 + // 0231 -> 1230 } else { parallel_for3d(DIM0, DIM1, DIM2, [&](const int dim0, const int dim1, const int dim2) { for (int dim3 = 0; dim3 < DIM3; ++dim3) { - const int src_off = dim0 * DIM1 * DIM2 * DIM3 + - dim1 * DIM2 * DIM3 + - dim2 * DIM3 + - dim3; - const int dst_off = dim1 * DIM2 * DIM3 * DIM0 + - dim2 * DIM3 * DIM0 + - dim3 * DIM0 + - dim0; + const int src_off = dim0 * DIM1 * DIM2 * DIM3 + dim1 * DIM2 * DIM3 + dim2 * DIM3 + dim3; + const int dst_off = dim1 * DIM2 * DIM3 * DIM0 + dim2 * DIM3 * DIM0 + dim3 * DIM0 + dim0; new_src_data[dst_off] = src_data[src_off]; } }); @@ -176,7 +186,9 @@ static MemoryPtr prepareWeightMemory(const std::vector& src, const E return create(); } -void AclDeconvExecutor::exec(const std::vector& src, const std::vector& dst, const void *post_ops_data_) { +void AclDeconvExecutor::exec(const std::vector& src, + const std::vector& dst, + const void* post_ops_data_) { // TODO: Remove transpose from exec auto newWei = prepareWeightMemory(src, context); @@ -194,16 +206,19 @@ void AclDeconvExecutor::exec(const std::vector& src, const std::vect biasTensor.allocator()->free(); } -bool AclDeconvExecutorBuilder::customIsSupported(const DeconvAttrs &deconvAttrs, - const std::vector &srcDescs, - const std::vector &dstDescs) { +bool AclDeconvExecutorBuilder::customIsSupported(const DeconvAttrs& deconvAttrs, + const std::vector& srcDescs, + const std::vector& dstDescs) { if ((srcDescs[0]->getShape().getDims().size() != 3 && srcDescs[0]->getShape().getDims().size() != 4) || dstDescs[0]->getShape().getDims().size() != srcDescs[0]->getShape().getDims().size() || 
srcDescs[1]->getShape().getDims().size() != 4) { DEBUG_LOG("AclDeconvExecutor does not support dimension:", - " src[0]=", srcDescs[0]->getShape().getDims().size(), - " src[1]=", srcDescs[1]->getShape().getDims().size(), - " dst[0]=", dstDescs[0]->getShape().getDims().size()); + " src[0]=", + srcDescs[0]->getShape().getDims().size(), + " src[1]=", + srcDescs[1]->getShape().getDims().size(), + " dst[0]=", + dstDescs[0]->getShape().getDims().size()); return false; } @@ -211,62 +226,71 @@ bool AclDeconvExecutorBuilder::customIsSupported(const DeconvAttrs &deconvAttrs, srcDescs[0]->getPrecision() == srcDescs[1]->getPrecision() && srcDescs[1]->getPrecision() == dstDescs[0]->getPrecision())) { DEBUG_LOG("AclDeconvExecutor does not support precisions:", - " src[0]=", srcDescs[0]->getPrecision(), - " src[1]=", srcDescs[1]->getPrecision(), - " dst[0]=", dstDescs[0]->getPrecision()); + " src[0]=", + srcDescs[0]->getPrecision(), + " src[1]=", + srcDescs[1]->getPrecision(), + " dst[0]=", + dstDescs[0]->getPrecision()); return false; } if (deconvAttrs.withBiasesParam && srcDescs[2]->getPrecision() != srcDescs[0]->getPrecision()) { - DEBUG_LOG("AclDeconvExecutor does not support precisions:", - " src[2]=", srcDescs[2]->getPrecision()); + DEBUG_LOG("AclDeconvExecutor does not support precisions:", " src[2]=", srcDescs[2]->getPrecision()); return false; } - if (!(srcDescs[0]->hasLayoutType(LayoutType::ncsp) && - srcDescs[1]->hasLayoutType(LayoutType::ncsp) && + if (!(srcDescs[0]->hasLayoutType(LayoutType::ncsp) && srcDescs[1]->hasLayoutType(LayoutType::ncsp) && dstDescs[0]->hasLayoutType(LayoutType::ncsp)) && !(srcDescs[0]->hasLayoutType(LayoutType::nspc) && // Check weights as ncsp because we remove reorder and will transform ncsp -> nspc in exec() function - srcDescs[1]->hasLayoutType(LayoutType::ncsp) && - dstDescs[0]->hasLayoutType(LayoutType::nspc))) { + srcDescs[1]->hasLayoutType(LayoutType::ncsp) && dstDescs[0]->hasLayoutType(LayoutType::nspc))) { DEBUG_LOG("AclDeconvExecutor does not support layouts:", - " src[0]=", srcDescs[0]->serializeFormat(), - " src[1]=", srcDescs[1]->serializeFormat(), - " dst=", dstDescs[0]->serializeFormat()); + " src[0]=", + srcDescs[0]->serializeFormat(), + " src[1]=", + srcDescs[1]->serializeFormat(), + " dst=", + dstDescs[0]->serializeFormat()); return false; } - if (deconvAttrs.withBiasesParam && - !(srcDescs[2]->hasLayoutType(LayoutType::ncsp)) && + if (deconvAttrs.withBiasesParam && !(srcDescs[2]->hasLayoutType(LayoutType::ncsp)) && !(srcDescs[2]->hasLayoutType(LayoutType::nspc))) { DEBUG_LOG("AclDeconvExecutor does not support layouts:", - " src[0]=", srcDescs[0]->serializeFormat(), - " src[1]=", srcDescs[1]->serializeFormat(), - " src[2]=", srcDescs[2]->serializeFormat(), - " dst=", dstDescs[0]->serializeFormat()); + " src[0]=", + srcDescs[0]->serializeFormat(), + " src[1]=", + srcDescs[1]->serializeFormat(), + " src[2]=", + srcDescs[2]->serializeFormat(), + " dst=", + dstDescs[0]->serializeFormat()); return false; } ACLDeconvTensorInfo aclDeconvTensorInfo = getACLDeconvTensorInfo(deconvAttrs, srcDescs, dstDescs); - auto srcTensorInfo = aclDeconvTensorInfo.srcTensorInfo; - auto weiTensorInfo = aclDeconvTensorInfo.weiTensorInfo; + auto srcTensorInfo = aclDeconvTensorInfo.srcTensorInfo; + auto weiTensorInfo = aclDeconvTensorInfo.weiTensorInfo; auto biasTensorInfo = aclDeconvTensorInfo.biasTensorInfo; - auto dstTensorInfo = aclDeconvTensorInfo.dstTensorInfo; - auto deconv_info = aclDeconvTensorInfo.deconv_info; + auto dstTensorInfo = 
aclDeconvTensorInfo.dstTensorInfo; + auto deconv_info = aclDeconvTensorInfo.deconv_info; - unsigned int dilation_x = (deconvAttrs.dilation.size() > 1) ? deconvAttrs.dilation.at(1) : deconvAttrs.dilation.at(0); + unsigned int dilation_x = + (deconvAttrs.dilation.size() > 1) ? deconvAttrs.dilation.at(1) : deconvAttrs.dilation.at(0); unsigned int dilation_y = deconvAttrs.dilation.at(0); - if (!one_of(dilation_x, static_cast(0), static_cast(1)) || - !one_of(dilation_y, static_cast(0), static_cast(1))) return false; + if (!one_of(dilation_x, static_cast(0), static_cast(1)) || + !one_of(dilation_y, static_cast(0), static_cast(1))) + return false; try { - arm_compute::Status status = arm_compute::NEDeconvolutionLayer::validate(&srcTensorInfo, - &weiTensorInfo, - deconvAttrs.withBiasesParam ? &biasTensorInfo : nullptr, - &dstTensorInfo, - deconv_info, - deconvAttrs.aclFastMath); + arm_compute::Status status = + arm_compute::NEDeconvolutionLayer::validate(&srcTensorInfo, + &weiTensorInfo, + deconvAttrs.withBiasesParam ? &biasTensorInfo : nullptr, + &dstTensorInfo, + deconv_info, + deconvAttrs.aclFastMath); if (!status) { DEBUG_LOG("NEDeconvolutionLayer validation failed: ", status.error_description()); return false; @@ -280,5 +304,5 @@ bool AclDeconvExecutorBuilder::customIsSupported(const DeconvAttrs &deconvAttrs, return true; } -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_deconv.hpp b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_deconv.hpp index ad743690f45c52..e27551ad48d44d 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_deconv.hpp +++ b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_deconv.hpp @@ -4,11 +4,11 @@ #pragma once -#include "nodes/executors/deconv.hpp" -#include "arm_compute/runtime/NEON/NEFunctions.h" -#include "utils/debug_capabilities.h" #include "acl_utils.hpp" +#include "arm_compute/runtime/NEON/NEFunctions.h" +#include "nodes/executors/deconv.hpp" #include "src/cpu/CpuTypes.h" +#include "utils/debug_capabilities.h" namespace ov { namespace intel_cpu { @@ -22,8 +22,8 @@ struct ACLDeconvTensorInfo { }; ACLDeconvTensorInfo getACLDeconvTensorInfo(const DeconvAttrs& deconvAttrs, - const std::vector& srcDescs, - const std::vector& dstDescs); + const std::vector& srcDescs, + const std::vector& dstDescs); class AclDeconvExecutor : public DeconvExecutor { public: @@ -31,10 +31,10 @@ class AclDeconvExecutor : public DeconvExecutor { bool init(const DeconvAttrs& deconvAttrs, const std::vector& srcDescs, const std::vector& dstDescs, - const dnnl::primitive_attr &attr) override; + const dnnl::primitive_attr& attr) override; void exec(const std::vector& src, const std::vector& dst, - const void *post_ops_data_) override; + const void* post_ops_data_) override; impl_desc_type getImplType() const override { return implType; @@ -68,5 +68,5 @@ class AclDeconvExecutorBuilder : public DeconvExecutorBuilder { } }; -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_eltwise.cpp b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_eltwise.cpp index 942bacd91349ff..26d387c7659dc5 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_eltwise.cpp +++ b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_eltwise.cpp @@ -3,6 +3,7 @@ // #include "acl_eltwise.hpp" + #include "acl_utils.hpp" #include "utils/debug_capabilities.h" @@ -34,40 +35,45 @@ inline void 
log_unsupported_prec(const std::vector& srcDescs, for (size_t i = 0; i < srcDescs.size(); i++) { srcPrec += srcDescs[i]->getPrecision().to_string() + " "; } - DEBUG_LOG(algToString(eltwiseAlgorithm), ": provided combination of src precisions: [", srcPrec, - "] and dst precision: ", dstDescs[0]->getPrecision().to_string(), " is not supported"); + DEBUG_LOG(algToString(eltwiseAlgorithm), + ": provided combination of src precisions: [", + srcPrec, + "] and dst precision: ", + dstDescs[0]->getPrecision().to_string(), + " is not supported"); } bool AclEltwiseExecutor::isEltwiseAlgorithmSupported(Algorithm algorithm) { - if (one_of(algorithm, Algorithm::EltwiseSqrt, - Algorithm::EltwiseDivide, - Algorithm::EltwiseRelu, + if (one_of(algorithm, + Algorithm::EltwiseSqrt, + Algorithm::EltwiseDivide, + Algorithm::EltwiseRelu, #ifdef OPENVINO_ARCH_ARM64 - Algorithm::EltwiseGeluErf, + Algorithm::EltwiseGeluErf, #endif - Algorithm::EltwiseElu, - Algorithm::EltwiseTanh, - Algorithm::EltwiseSigmoid, - Algorithm::EltwiseSoftRelu, - Algorithm::EltwiseClamp, - Algorithm::EltwiseSwish, - Algorithm::EltwisePrelu, - Algorithm::EltwiseHswish, - Algorithm::EltwiseAbs, - Algorithm::EltwiseExp, - Algorithm::EltwiseLog, - Algorithm::EltwiseMaximum, - Algorithm::EltwiseMinimum, - Algorithm::EltwiseSquaredDifference, - Algorithm::EltwiseAdd, - Algorithm::EltwiseSubtract, - Algorithm::EltwiseMultiply, - Algorithm::EltwiseEqual, - Algorithm::EltwiseNotEqual, - Algorithm::EltwiseGreater, - Algorithm::EltwiseGreaterEqual, - Algorithm::EltwiseLess, - Algorithm::EltwiseLessEqual)) { + Algorithm::EltwiseElu, + Algorithm::EltwiseTanh, + Algorithm::EltwiseSigmoid, + Algorithm::EltwiseSoftRelu, + Algorithm::EltwiseClamp, + Algorithm::EltwiseSwish, + Algorithm::EltwisePrelu, + Algorithm::EltwiseHswish, + Algorithm::EltwiseAbs, + Algorithm::EltwiseExp, + Algorithm::EltwiseLog, + Algorithm::EltwiseMaximum, + Algorithm::EltwiseMinimum, + Algorithm::EltwiseSquaredDifference, + Algorithm::EltwiseAdd, + Algorithm::EltwiseSubtract, + Algorithm::EltwiseMultiply, + Algorithm::EltwiseEqual, + Algorithm::EltwiseNotEqual, + Algorithm::EltwiseGreater, + Algorithm::EltwiseGreaterEqual, + Algorithm::EltwiseLess, + Algorithm::EltwiseLessEqual)) { return true; } return false; @@ -76,107 +82,111 @@ bool AclEltwiseExecutor::isEltwiseAlgorithmSupported(Algorithm algorithm) { bool AclEltwiseExecutorBuilder::isSupported(const EltwiseAttrs& eltwiseAttrs, const std::vector& srcDescs, const std::vector& dstDescs) const { - auto checkPrecision = [&srcDescs, &dstDescs](std::vector srcVecPrc, ov::element::Type dstPrc) -> bool { + auto checkPrecision = [&srcDescs, &dstDescs](std::vector srcVecPrc, + ov::element::Type dstPrc) -> bool { for (size_t i = 0; i < srcDescs.size(); i++) { - if (srcDescs[i]->getPrecision() != srcVecPrc[i]) return false; + if (srcDescs[i]->getPrecision() != srcVecPrc[i]) + return false; + } + if (dstDescs[0]->getPrecision() != dstPrc) { + return false; } - if (dstDescs[0]->getPrecision() != dstPrc) { return false; } return true; }; switch (eltwiseAttrs.algorithm) { - case Algorithm::EltwiseSqrt: - case Algorithm::EltwiseDivide: - case Algorithm::EltwiseRelu: + case Algorithm::EltwiseSqrt: + case Algorithm::EltwiseDivide: + case Algorithm::EltwiseRelu: #ifdef OPENVINO_ARCH_ARM64 - case Algorithm::EltwiseGeluErf: + case Algorithm::EltwiseGeluErf: #endif - case Algorithm::EltwiseElu: - case Algorithm::EltwiseTanh: - case Algorithm::EltwiseSigmoid: - case Algorithm::EltwiseSoftRelu: - case Algorithm::EltwiseClamp: - case 
Algorithm::EltwiseSwish: - case Algorithm::EltwisePrelu: - case Algorithm::EltwiseHswish: - if (!(checkPrecision({ov::element::f16, ov::element::f16}, ov::element::f16) || - checkPrecision({ov::element::f32, ov::element::f32}, ov::element::f32))) { - log_unsupported_prec(srcDescs, dstDescs, eltwiseAttrs.algorithm); - return false; - } - break; - case Algorithm::EltwiseAbs: - case Algorithm::EltwiseExp: - case Algorithm::EltwiseLog: - if (!(checkPrecision({ov::element::i32, ov::element::i32}, ov::element::i32) || - checkPrecision({ov::element::f16, ov::element::f16}, ov::element::f16) || - checkPrecision({ov::element::f32, ov::element::f32}, ov::element::f32))) { - log_unsupported_prec(srcDescs, dstDescs, eltwiseAttrs.algorithm); - return false; - } - break; - case Algorithm::EltwiseMaximum: - case Algorithm::EltwiseMinimum: - case Algorithm::EltwiseSquaredDifference: - if (!(checkPrecision({ov::element::i16, ov::element::i16}, ov::element::i16) || - checkPrecision({ov::element::i32, ov::element::i32}, ov::element::i32) || - checkPrecision({ov::element::f16, ov::element::f16}, ov::element::f16) || - checkPrecision({ov::element::f32, ov::element::f32}, ov::element::f32))) { - log_unsupported_prec(srcDescs, dstDescs, eltwiseAttrs.algorithm); - return false; - } - break; - case Algorithm::EltwiseAdd: - case Algorithm::EltwiseSubtract: - if (!(checkPrecision({ov::element::u8, ov::element::u8}, ov::element::u8) || - checkPrecision({ov::element::i16, ov::element::i16}, ov::element::i16) || - checkPrecision({ov::element::i32, ov::element::i32}, ov::element::i32) || - checkPrecision({ov::element::f16, ov::element::f16}, ov::element::f16) || - checkPrecision({ov::element::f32, ov::element::f32}, ov::element::f32))) { - log_unsupported_prec(srcDescs, dstDescs, eltwiseAttrs.algorithm); - return false; - } - break; - case Algorithm::EltwiseMultiply: - if (!(checkPrecision({ov::element::u8, ov::element::u8}, ov::element::u8) || - checkPrecision({ov::element::u8, ov::element::u8}, ov::element::i16) || - checkPrecision({ov::element::u8, ov::element::i16}, ov::element::i16) || - checkPrecision({ov::element::i16, ov::element::u8}, ov::element::i16) || - checkPrecision({ov::element::i16, ov::element::i16}, ov::element::i16) || - checkPrecision({ov::element::f16, ov::element::f16}, ov::element::f16) || - checkPrecision({ov::element::f32, ov::element::f32}, ov::element::f32))) { - log_unsupported_prec(srcDescs, dstDescs, eltwiseAttrs.algorithm); - return false; - } - break; - // ACL supports only U8 precision on output for comparison operations - case Algorithm::EltwiseEqual: - case Algorithm::EltwiseNotEqual: - case Algorithm::EltwiseGreater: - case Algorithm::EltwiseGreaterEqual: - case Algorithm::EltwiseLess: - case Algorithm::EltwiseLessEqual: - if (!(checkPrecision({ov::element::u8, ov::element::u8}, ov::element::u8) || - checkPrecision({ov::element::i16, ov::element::i16}, ov::element::u8) || - checkPrecision({ov::element::i32, ov::element::i32}, ov::element::u8) || - checkPrecision({ov::element::f16, ov::element::f16}, ov::element::u8) || - checkPrecision({ov::element::f32, ov::element::f32}, ov::element::u8))) { - log_unsupported_prec(srcDescs, dstDescs, eltwiseAttrs.algorithm); - return false; - } - break; - default: - DEBUG_LOG("Eltwise algorithm ", algToString(eltwiseAttrs.algorithm), " is not supported"); + case Algorithm::EltwiseElu: + case Algorithm::EltwiseTanh: + case Algorithm::EltwiseSigmoid: + case Algorithm::EltwiseSoftRelu: + case Algorithm::EltwiseClamp: + case Algorithm::EltwiseSwish: + 
case Algorithm::EltwisePrelu: + case Algorithm::EltwiseHswish: + if (!(checkPrecision({ov::element::f16, ov::element::f16}, ov::element::f16) || + checkPrecision({ov::element::f32, ov::element::f32}, ov::element::f32))) { + log_unsupported_prec(srcDescs, dstDescs, eltwiseAttrs.algorithm); + return false; + } + break; + case Algorithm::EltwiseAbs: + case Algorithm::EltwiseExp: + case Algorithm::EltwiseLog: + if (!(checkPrecision({ov::element::i32, ov::element::i32}, ov::element::i32) || + checkPrecision({ov::element::f16, ov::element::f16}, ov::element::f16) || + checkPrecision({ov::element::f32, ov::element::f32}, ov::element::f32))) { + log_unsupported_prec(srcDescs, dstDescs, eltwiseAttrs.algorithm); + return false; + } + break; + case Algorithm::EltwiseMaximum: + case Algorithm::EltwiseMinimum: + case Algorithm::EltwiseSquaredDifference: + if (!(checkPrecision({ov::element::i16, ov::element::i16}, ov::element::i16) || + checkPrecision({ov::element::i32, ov::element::i32}, ov::element::i32) || + checkPrecision({ov::element::f16, ov::element::f16}, ov::element::f16) || + checkPrecision({ov::element::f32, ov::element::f32}, ov::element::f32))) { + log_unsupported_prec(srcDescs, dstDescs, eltwiseAttrs.algorithm); + return false; + } + break; + case Algorithm::EltwiseAdd: + case Algorithm::EltwiseSubtract: + if (!(checkPrecision({ov::element::u8, ov::element::u8}, ov::element::u8) || + checkPrecision({ov::element::i16, ov::element::i16}, ov::element::i16) || + checkPrecision({ov::element::i32, ov::element::i32}, ov::element::i32) || + checkPrecision({ov::element::f16, ov::element::f16}, ov::element::f16) || + checkPrecision({ov::element::f32, ov::element::f32}, ov::element::f32))) { + log_unsupported_prec(srcDescs, dstDescs, eltwiseAttrs.algorithm); return false; + } + break; + case Algorithm::EltwiseMultiply: + if (!(checkPrecision({ov::element::u8, ov::element::u8}, ov::element::u8) || + checkPrecision({ov::element::u8, ov::element::u8}, ov::element::i16) || + checkPrecision({ov::element::u8, ov::element::i16}, ov::element::i16) || + checkPrecision({ov::element::i16, ov::element::u8}, ov::element::i16) || + checkPrecision({ov::element::i16, ov::element::i16}, ov::element::i16) || + checkPrecision({ov::element::f16, ov::element::f16}, ov::element::f16) || + checkPrecision({ov::element::f32, ov::element::f32}, ov::element::f32))) { + log_unsupported_prec(srcDescs, dstDescs, eltwiseAttrs.algorithm); + return false; + } + break; + // ACL supports only U8 precision on output for comparison operations + case Algorithm::EltwiseEqual: + case Algorithm::EltwiseNotEqual: + case Algorithm::EltwiseGreater: + case Algorithm::EltwiseGreaterEqual: + case Algorithm::EltwiseLess: + case Algorithm::EltwiseLessEqual: + if (!(checkPrecision({ov::element::u8, ov::element::u8}, ov::element::u8) || + checkPrecision({ov::element::i16, ov::element::i16}, ov::element::u8) || + checkPrecision({ov::element::i32, ov::element::i32}, ov::element::u8) || + checkPrecision({ov::element::f16, ov::element::f16}, ov::element::u8) || + checkPrecision({ov::element::f32, ov::element::f32}, ov::element::u8))) { + log_unsupported_prec(srcDescs, dstDescs, eltwiseAttrs.algorithm); + return false; + } + break; + default: + DEBUG_LOG("Eltwise algorithm ", algToString(eltwiseAttrs.algorithm), " is not supported"); + return false; } - for (const auto & srcDesc : srcDescs) { + for (const auto& srcDesc : srcDescs) { if (getAclDataLayoutByMemoryDesc(srcDesc) == arm_compute::DataLayout::UNKNOWN) { DEBUG_LOG("src descriptor layout is 
unsupported by ACL: ", srcDesc->serializeFormat()); return false; } } - for (const auto & dstDesc : dstDescs) { + for (const auto& dstDesc : dstDescs) { if (getAclDataLayoutByMemoryDesc(dstDesc) == arm_compute::DataLayout::UNKNOWN) { DEBUG_LOG("dst descriptor layout is unsupported by ACL: ", dstDesc->serializeFormat()); return false; @@ -188,10 +198,13 @@ bool AclEltwiseExecutorBuilder::isSupported(const EltwiseAttrs& eltwiseAttrs, AclEltwiseExecutor::AclEltwiseExecutor(const ExecutorContext::CPtr context) : EltwiseExecutor(context) {} -bool AclEltwiseExecutor::init(const EltwiseAttrs &eltwiseAttrs, const std::vector &srcDescs, - const std::vector &dstDescs, - const std::vector &postOps) { - if (!postOps.empty()) { return false; } +bool AclEltwiseExecutor::init(const EltwiseAttrs& eltwiseAttrs, + const std::vector& srcDescs, + const std::vector& dstDescs, + const std::vector& postOps) { + if (!postOps.empty()) { + return false; + } aclEltwiseAttrs = eltwiseAttrs; std::vector srcVecDims(srcDescs.size()), dstVecDims(dstDescs.size()); @@ -209,15 +222,19 @@ bool AclEltwiseExecutor::init(const EltwiseAttrs &eltwiseAttrs, const std::vecto for (size_t i = 0; i < srcDescs.size(); i++) { srcDataLayout[i] = getAclDataLayoutByMemoryDesc(srcDescs[i]); - if (srcDataLayout[i] == arm_compute::DataLayout::UNKNOWN) { return false; } + if (srcDataLayout[i] == arm_compute::DataLayout::UNKNOWN) { + return false; + } } for (size_t i = 0; i < dstDescs.size(); i++) { dstDataLayout[i] = getAclDataLayoutByMemoryDesc(dstDescs[i]); - if (dstDataLayout[i] == arm_compute::DataLayout::UNKNOWN) { return false; } + if (dstDataLayout[i] == arm_compute::DataLayout::UNKNOWN) { + return false; + } } - if (srcDescs.size() == 2 && - srcDescs[0]->hasLayoutType(LayoutType::nspc) && srcDescs[1]->hasLayoutType(LayoutType::nspc) && + if (srcDescs.size() == 2 && srcDescs[0]->hasLayoutType(LayoutType::nspc) && + srcDescs[1]->hasLayoutType(LayoutType::nspc) && srcDescs[0]->getShape().getDims() != srcDescs[1]->getShape().getDims()) { if (srcVecDims[0].num_dimensions() < 5) { srcDataLayout[0] = srcDataLayout[1] = dstDataLayout[0] = DataLayout::NCHW; @@ -228,210 +245,248 @@ bool AclEltwiseExecutor::init(const EltwiseAttrs &eltwiseAttrs, const std::vecto } for (size_t i = 0; i < srcVecDims.size(); i++) { - srcTensorsInfo[i] = TensorInfo(srcVecDims[i], 1, - precisionToAclDataType(srcDescs[i]->getPrecision()), - srcDataLayout[i]); + srcTensorsInfo[i] = + TensorInfo(srcVecDims[i], 1, precisionToAclDataType(srcDescs[i]->getPrecision()), srcDataLayout[i]); srcTensors[i].allocator()->init(srcTensorsInfo[i]); } for (size_t i = 0; i < dstVecDims.size(); i++) { - dstTensorsInfo[i] = TensorInfo(dstVecDims[i], 1, - precisionToAclDataType(dstDescs[i]->getPrecision()), - dstDataLayout[i]); + dstTensorsInfo[i] = + TensorInfo(dstVecDims[i], 1, precisionToAclDataType(dstDescs[i]->getPrecision()), dstDataLayout[i]); dstTensors[i].allocator()->init(dstTensorsInfo[i]); } std::function(void)> exec_func; switch (aclEltwiseAttrs.algorithm) { - case Algorithm::EltwiseAdd: - if (!NEArithmeticAddition::validate(&srcTensorsInfo[0], &srcTensorsInfo[1], &dstTensorsInfo[0], ConvertPolicy::SATURATE)) - return false; - exec_func = [this]() -> std::unique_ptr { - auto acl_op = std::make_unique(); - acl_op->configure(&srcTensors[0], &srcTensors[1], &dstTensors[0], ConvertPolicy::SATURATE); - return acl_op; - }; - break; - case Algorithm::EltwiseMultiply: - if (!NEPixelWiseMultiplication::validate(&srcTensorsInfo[0], &srcTensorsInfo[1], &dstTensorsInfo[0], - 1.0f, 
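init() builds the ACL function lazily: for every algorithm it first calls the corresponding static validate() and, only if that succeeds, stores a factory lambda that performs the configure step; the factory is later invoked once under configureThreadSafe(). A schematic of that flow with placeholder types (IFunction/AddOp stand in for the ACL classes; not the executor's real members):

#include <functional>
#include <memory>

struct IFunction {
    virtual ~IFunction() = default;
    virtual void run() = 0;
};

struct AddOp : IFunction {
    static bool validate() { return true; }  // stands in for NE*::validate(...)
    void configure() {}                      // stands in for NE*::configure(...)
    void run() override {}
};

bool init(std::unique_ptr<IFunction>& ifunc) {
    if (!AddOp::validate())
        return false;  // unsupported configuration -> let another executor handle it
    std::function<std::unique_ptr<IFunction>()> exec_func = []() -> std::unique_ptr<IFunction> {
        auto op = std::make_unique<AddOp>();
        op->configure();                     // the expensive step, done exactly once
        return op;
    };
    ifunc = exec_func();                     // wrapped in configureThreadSafe() in the real code
    return true;
}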
ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)) - return false; - exec_func = [this]() -> std::unique_ptr { - auto acl_op = std::make_unique(); - acl_op->configure(&srcTensors[0], &srcTensors[1], &dstTensors[0], 1.0f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); - return acl_op; - }; - break; - case Algorithm::EltwiseSubtract: - if (!NEArithmeticSubtraction::validate(&srcTensorsInfo[0], &srcTensorsInfo[1], &dstTensorsInfo[0], ConvertPolicy::SATURATE)) - return false; - exec_func = [this]() -> std::unique_ptr { - auto acl_op = std::make_unique(); - acl_op->configure(&srcTensors[0], &srcTensors[1], &dstTensors[0], ConvertPolicy::SATURATE); - return acl_op; - }; - break; - case Algorithm::EltwiseDivide: - if (!NEElementwiseDivision::validate(&srcTensorsInfo[0], &srcTensorsInfo[1], &dstTensorsInfo[0])) - return false; - exec_func = [this]() -> std::unique_ptr { - auto acl_op = std::make_unique(); - acl_op->configure(&srcTensors[0], &srcTensors[1], &dstTensors[0]); - return acl_op; - }; - break; - case Algorithm::EltwiseMaximum: - if (!NEElementwiseMax::validate(&srcTensorsInfo[0], &srcTensorsInfo[1], &dstTensorsInfo[0])) - return false; - exec_func = [this]() -> std::unique_ptr { - auto acl_op = std::make_unique(); - acl_op->configure(&srcTensors[0], &srcTensors[1], &dstTensors[0]); - return acl_op; - }; - break; - case Algorithm::EltwiseMinimum: - if (!NEElementwiseMin::validate(&srcTensorsInfo[0], &srcTensorsInfo[1], &dstTensorsInfo[0])) - return false; - exec_func = [this]() -> std::unique_ptr { - auto acl_op = std::make_unique(); - acl_op->configure(&srcTensors[0], &srcTensors[1], &dstTensors[0]); - return acl_op; - }; - break; - case Algorithm::EltwiseSquaredDifference: - if (!NEElementwiseSquaredDiff::validate(&srcTensorsInfo[0], &srcTensorsInfo[1], &dstTensorsInfo[0])) - return false; - exec_func = [this]() -> std::unique_ptr { - auto acl_op = std::make_unique(); - acl_op->configure(&srcTensors[0], &srcTensors[1], &dstTensors[0]); - return acl_op; - }; - break; - case Algorithm::EltwiseEqual: - if (!NEElementwiseComparison::validate(&srcTensorsInfo[0], &srcTensorsInfo[1], &dstTensorsInfo[0], ComparisonOperation::Equal)) - return false; - exec_func = [this]() -> std::unique_ptr { - auto acl_op = std::make_unique(); - acl_op->configure(&srcTensors[0], &srcTensors[1], &dstTensors[0], ComparisonOperation::Equal); - return acl_op; - }; - break; - case Algorithm::EltwiseNotEqual: - if (!NEElementwiseComparison::validate(&srcTensorsInfo[0], &srcTensorsInfo[1], &dstTensorsInfo[0], ComparisonOperation::NotEqual)) - return false; - exec_func = [this]() -> std::unique_ptr { - auto acl_op = std::make_unique(); - acl_op->configure(&srcTensors[0], &srcTensors[1], &dstTensors[0], ComparisonOperation::NotEqual); - return acl_op; - }; - break; - case Algorithm::EltwiseGreater: - if (!NEElementwiseComparison::validate(&srcTensorsInfo[0], &srcTensorsInfo[1], &dstTensorsInfo[0], ComparisonOperation::Greater)) - return false; - exec_func = [this]() -> std::unique_ptr { - auto acl_op = std::make_unique(); - acl_op->configure(&srcTensors[0], &srcTensors[1], &dstTensors[0], ComparisonOperation::Greater); - return acl_op; - }; - break; - case Algorithm::EltwiseGreaterEqual: - if (!NEElementwiseComparison::validate(&srcTensorsInfo[0], &srcTensorsInfo[1], &dstTensorsInfo[0], ComparisonOperation::GreaterEqual)) - return false; - exec_func = [this]() -> std::unique_ptr { - auto acl_op = std::make_unique(); - acl_op->configure(&srcTensors[0], &srcTensors[1], &dstTensors[0], 
ComparisonOperation::GreaterEqual); - return acl_op; - }; - break; - case Algorithm::EltwiseLess: - if (!NEElementwiseComparison::validate(&srcTensorsInfo[0], &srcTensorsInfo[1], &dstTensorsInfo[0], ComparisonOperation::Less)) - return false; - exec_func = [this]() -> std::unique_ptr { - auto acl_op = std::make_unique(); - acl_op->configure(&srcTensors[0], &srcTensors[1], &dstTensors[0], ComparisonOperation::Less); - return acl_op; - }; - break; - case Algorithm::EltwiseLessEqual: - if (!NEElementwiseComparison::validate(&srcTensorsInfo[0], &srcTensorsInfo[1], &dstTensorsInfo[0], ComparisonOperation::LessEqual)) - return false; - exec_func = [this]() -> std::unique_ptr { - auto acl_op = std::make_unique(); - acl_op->configure(&srcTensors[0], &srcTensors[1], &dstTensors[0], ComparisonOperation::LessEqual); - return acl_op; - }; - break; - case Algorithm::EltwiseAbs: - if (!NEAbsLayer::validate(&srcTensorsInfo[0], &dstTensorsInfo[0])) - return false; - exec_func = [this]() -> std::unique_ptr { - auto acl_op = std::make_unique(); - acl_op->configure(&srcTensors[0], &dstTensors[0]); - return acl_op; - }; - break; - case Algorithm::EltwiseExp: - if (!NEExpLayer::validate(&srcTensorsInfo[0], &dstTensorsInfo[0])) - return false; - exec_func = [this]() -> std::unique_ptr { - auto acl_op = std::make_unique(); - acl_op->configure(&srcTensors[0], &dstTensors[0]); - return acl_op; - }; - break; - case Algorithm::EltwisePrelu: - if (!NEPReluLayer::validate(&srcTensorsInfo[0], &srcTensorsInfo[1], &dstTensorsInfo[0])) - return false; - exec_func = [this]() -> std::unique_ptr { - auto acl_op = std::make_unique(); - acl_op->configure(&srcTensors[0], &srcTensors[1], &dstTensors[0]); - return acl_op; - }; - break; - case Algorithm::EltwiseRelu: - case Algorithm::EltwiseGeluErf: - case Algorithm::EltwiseElu: - case Algorithm::EltwiseTanh: - case Algorithm::EltwiseSigmoid: - case Algorithm::EltwiseSqrt: - case Algorithm::EltwiseSoftRelu: - case Algorithm::EltwiseClamp: - case Algorithm::EltwiseSwish: - case Algorithm::EltwiseHswish: - if (!NEActivationLayer::validate(&srcTensorsInfo[0], &dstTensorsInfo[0], getActivationLayerInfo(aclEltwiseAttrs.algorithm, - aclEltwiseAttrs.alpha, - aclEltwiseAttrs.beta, - aclEltwiseAttrs.gamma))) - return false; - exec_func = [this]() -> std::unique_ptr { - auto acl_op = std::make_unique(); - acl_op->configure(&srcTensors[0], &dstTensors[0], getActivationLayerInfo(aclEltwiseAttrs.algorithm, - aclEltwiseAttrs.alpha, - aclEltwiseAttrs.beta, - aclEltwiseAttrs.gamma)); - return acl_op; - }; - break; - case Algorithm::EltwiseLog: - if (!NELogLayer::validate(&srcTensorsInfo[0], &dstTensorsInfo[0])) - return false; - exec_func = [this]() -> std::unique_ptr { - auto acl_op = std::make_unique(); - acl_op->configure(&srcTensors[0], &dstTensors[0]); - return acl_op; - }; - break; - default: - OPENVINO_THROW("Unsupported operation type for ACL Eltwise executor: ", - static_cast(aclEltwiseAttrs.algorithm)); + case Algorithm::EltwiseAdd: + if (!NEArithmeticAddition::validate(&srcTensorsInfo[0], + &srcTensorsInfo[1], + &dstTensorsInfo[0], + ConvertPolicy::SATURATE)) + return false; + exec_func = [this]() -> std::unique_ptr { + auto acl_op = std::make_unique(); + acl_op->configure(&srcTensors[0], &srcTensors[1], &dstTensors[0], ConvertPolicy::SATURATE); + return acl_op; + }; + break; + case Algorithm::EltwiseMultiply: + if (!NEPixelWiseMultiplication::validate(&srcTensorsInfo[0], + &srcTensorsInfo[1], + &dstTensorsInfo[0], + 1.0f, + ConvertPolicy::SATURATE, + RoundingPolicy::TO_ZERO)) + 
return false; + exec_func = [this]() -> std::unique_ptr { + auto acl_op = std::make_unique(); + acl_op->configure(&srcTensors[0], + &srcTensors[1], + &dstTensors[0], + 1.0f, + ConvertPolicy::SATURATE, + RoundingPolicy::TO_ZERO); + return acl_op; + }; + break; + case Algorithm::EltwiseSubtract: + if (!NEArithmeticSubtraction::validate(&srcTensorsInfo[0], + &srcTensorsInfo[1], + &dstTensorsInfo[0], + ConvertPolicy::SATURATE)) + return false; + exec_func = [this]() -> std::unique_ptr { + auto acl_op = std::make_unique(); + acl_op->configure(&srcTensors[0], &srcTensors[1], &dstTensors[0], ConvertPolicy::SATURATE); + return acl_op; + }; + break; + case Algorithm::EltwiseDivide: + if (!NEElementwiseDivision::validate(&srcTensorsInfo[0], &srcTensorsInfo[1], &dstTensorsInfo[0])) + return false; + exec_func = [this]() -> std::unique_ptr { + auto acl_op = std::make_unique(); + acl_op->configure(&srcTensors[0], &srcTensors[1], &dstTensors[0]); + return acl_op; + }; + break; + case Algorithm::EltwiseMaximum: + if (!NEElementwiseMax::validate(&srcTensorsInfo[0], &srcTensorsInfo[1], &dstTensorsInfo[0])) + return false; + exec_func = [this]() -> std::unique_ptr { + auto acl_op = std::make_unique(); + acl_op->configure(&srcTensors[0], &srcTensors[1], &dstTensors[0]); + return acl_op; + }; + break; + case Algorithm::EltwiseMinimum: + if (!NEElementwiseMin::validate(&srcTensorsInfo[0], &srcTensorsInfo[1], &dstTensorsInfo[0])) + return false; + exec_func = [this]() -> std::unique_ptr { + auto acl_op = std::make_unique(); + acl_op->configure(&srcTensors[0], &srcTensors[1], &dstTensors[0]); + return acl_op; + }; + break; + case Algorithm::EltwiseSquaredDifference: + if (!NEElementwiseSquaredDiff::validate(&srcTensorsInfo[0], &srcTensorsInfo[1], &dstTensorsInfo[0])) + return false; + exec_func = [this]() -> std::unique_ptr { + auto acl_op = std::make_unique(); + acl_op->configure(&srcTensors[0], &srcTensors[1], &dstTensors[0]); + return acl_op; + }; + break; + case Algorithm::EltwiseEqual: + if (!NEElementwiseComparison::validate(&srcTensorsInfo[0], + &srcTensorsInfo[1], + &dstTensorsInfo[0], + ComparisonOperation::Equal)) + return false; + exec_func = [this]() -> std::unique_ptr { + auto acl_op = std::make_unique(); + acl_op->configure(&srcTensors[0], &srcTensors[1], &dstTensors[0], ComparisonOperation::Equal); + return acl_op; + }; + break; + case Algorithm::EltwiseNotEqual: + if (!NEElementwiseComparison::validate(&srcTensorsInfo[0], + &srcTensorsInfo[1], + &dstTensorsInfo[0], + ComparisonOperation::NotEqual)) + return false; + exec_func = [this]() -> std::unique_ptr { + auto acl_op = std::make_unique(); + acl_op->configure(&srcTensors[0], &srcTensors[1], &dstTensors[0], ComparisonOperation::NotEqual); + return acl_op; + }; + break; + case Algorithm::EltwiseGreater: + if (!NEElementwiseComparison::validate(&srcTensorsInfo[0], + &srcTensorsInfo[1], + &dstTensorsInfo[0], + ComparisonOperation::Greater)) + return false; + exec_func = [this]() -> std::unique_ptr { + auto acl_op = std::make_unique(); + acl_op->configure(&srcTensors[0], &srcTensors[1], &dstTensors[0], ComparisonOperation::Greater); + return acl_op; + }; + break; + case Algorithm::EltwiseGreaterEqual: + if (!NEElementwiseComparison::validate(&srcTensorsInfo[0], + &srcTensorsInfo[1], + &dstTensorsInfo[0], + ComparisonOperation::GreaterEqual)) + return false; + exec_func = [this]() -> std::unique_ptr { + auto acl_op = std::make_unique(); + acl_op->configure(&srcTensors[0], &srcTensors[1], &dstTensors[0], ComparisonOperation::GreaterEqual); + return 
acl_op; + }; + break; + case Algorithm::EltwiseLess: + if (!NEElementwiseComparison::validate(&srcTensorsInfo[0], + &srcTensorsInfo[1], + &dstTensorsInfo[0], + ComparisonOperation::Less)) + return false; + exec_func = [this]() -> std::unique_ptr { + auto acl_op = std::make_unique(); + acl_op->configure(&srcTensors[0], &srcTensors[1], &dstTensors[0], ComparisonOperation::Less); + return acl_op; + }; + break; + case Algorithm::EltwiseLessEqual: + if (!NEElementwiseComparison::validate(&srcTensorsInfo[0], + &srcTensorsInfo[1], + &dstTensorsInfo[0], + ComparisonOperation::LessEqual)) + return false; + exec_func = [this]() -> std::unique_ptr { + auto acl_op = std::make_unique(); + acl_op->configure(&srcTensors[0], &srcTensors[1], &dstTensors[0], ComparisonOperation::LessEqual); + return acl_op; + }; + break; + case Algorithm::EltwiseAbs: + if (!NEAbsLayer::validate(&srcTensorsInfo[0], &dstTensorsInfo[0])) + return false; + exec_func = [this]() -> std::unique_ptr { + auto acl_op = std::make_unique(); + acl_op->configure(&srcTensors[0], &dstTensors[0]); + return acl_op; + }; + break; + case Algorithm::EltwiseExp: + if (!NEExpLayer::validate(&srcTensorsInfo[0], &dstTensorsInfo[0])) + return false; + exec_func = [this]() -> std::unique_ptr { + auto acl_op = std::make_unique(); + acl_op->configure(&srcTensors[0], &dstTensors[0]); + return acl_op; + }; + break; + case Algorithm::EltwisePrelu: + if (!NEPReluLayer::validate(&srcTensorsInfo[0], &srcTensorsInfo[1], &dstTensorsInfo[0])) + return false; + exec_func = [this]() -> std::unique_ptr { + auto acl_op = std::make_unique(); + acl_op->configure(&srcTensors[0], &srcTensors[1], &dstTensors[0]); + return acl_op; + }; + break; + case Algorithm::EltwiseRelu: + case Algorithm::EltwiseGeluErf: + case Algorithm::EltwiseElu: + case Algorithm::EltwiseTanh: + case Algorithm::EltwiseSigmoid: + case Algorithm::EltwiseSqrt: + case Algorithm::EltwiseSoftRelu: + case Algorithm::EltwiseClamp: + case Algorithm::EltwiseSwish: + case Algorithm::EltwiseHswish: + if (!NEActivationLayer::validate(&srcTensorsInfo[0], + &dstTensorsInfo[0], + getActivationLayerInfo(aclEltwiseAttrs.algorithm, + aclEltwiseAttrs.alpha, + aclEltwiseAttrs.beta, + aclEltwiseAttrs.gamma))) + return false; + exec_func = [this]() -> std::unique_ptr { + auto acl_op = std::make_unique(); + acl_op->configure(&srcTensors[0], + &dstTensors[0], + getActivationLayerInfo(aclEltwiseAttrs.algorithm, + aclEltwiseAttrs.alpha, + aclEltwiseAttrs.beta, + aclEltwiseAttrs.gamma)); + return acl_op; + }; + break; + case Algorithm::EltwiseLog: + if (!NELogLayer::validate(&srcTensorsInfo[0], &dstTensorsInfo[0])) + return false; + exec_func = [this]() -> std::unique_ptr { + auto acl_op = std::make_unique(); + acl_op->configure(&srcTensors[0], &dstTensors[0]); + return acl_op; + }; + break; + default: + OPENVINO_THROW("Unsupported operation type for ACL Eltwise executor: ", + static_cast(aclEltwiseAttrs.algorithm)); } - configureThreadSafe([&] { ifunc = exec_func(); }); + configureThreadSafe([&] { + ifunc = exec_func(); + }); return true; } -void AclEltwiseExecutor::exec(const std::vector &src, const std::vector &dst, - const void *post_ops_data_) { +void AclEltwiseExecutor::exec(const std::vector& src, + const std::vector& dst, + const void* post_ops_data_) { for (size_t i = 0; i < src.size(); i++) { srcTensors[i].allocator()->import_memory(src[i]->getData()); } @@ -448,5 +503,5 @@ void AclEltwiseExecutor::exec(const std::vector &src, const std::vec dstTensors[i].allocator()->free(); } } -} // namespace intel_cpu -} // 
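exec() wraps the caller's buffers into the preconfigured ACL tensors without copying: import_memory() points each tensor at external data before run(), and free() detaches it afterwards. Below is a minimal stand-alone sketch of that zero-copy lifecycle against the Arm Compute Library API; the helper name, single-input setup and buffer shape are assumptions made for the example.

#include "arm_compute/runtime/NEON/NEFunctions.h"
#include "arm_compute/runtime/Tensor.h"

void run_abs_on_external_buffers(float* src_data, float* dst_data, size_t len) {
    arm_compute::TensorInfo info(arm_compute::TensorShape(len), 1, arm_compute::DataType::F32);
    arm_compute::Tensor src, dst;
    src.allocator()->init(info);
    dst.allocator()->init(info);

    arm_compute::NEAbsLayer abs_op;
    abs_op.configure(&src, &dst);              // done once, at init() time in the executor

    src.allocator()->import_memory(src_data);  // wrap caller buffers, no copy
    dst.allocator()->import_memory(dst_data);
    abs_op.run();
    src.allocator()->free();                   // detach the external memory again
    dst.allocator()->free();
}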
namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_eltwise.hpp b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_eltwise.hpp index 6daf9e606c461b..1aae396f25a0fe 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_eltwise.hpp +++ b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_eltwise.hpp @@ -5,8 +5,8 @@ #pragma once #include "../eltwise.hpp" -#include "arm_compute/runtime/NEON/NEFunctions.h" #include "acl_utils.hpp" +#include "arm_compute/runtime/NEON/NEFunctions.h" namespace ov { namespace intel_cpu { @@ -23,11 +23,12 @@ class AclEltwiseExecutor : public EltwiseExecutor { void exec(const std::vector& src, const std::vector& dst, - const void *post_ops_data_) override; + const void* post_ops_data_) override; impl_desc_type getImplType() const override { return implType; } + private: EltwiseAttrs aclEltwiseAttrs{}; impl_desc_type implType = impl_desc_type::acl; @@ -46,5 +47,5 @@ class AclEltwiseExecutorBuilder : public EltwiseExecutorBuilder { } }; -} // namespace intel_cpu -} // namespace ov \ No newline at end of file +} // namespace intel_cpu +} // namespace ov \ No newline at end of file diff --git a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_fullyconnected.cpp b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_fullyconnected.cpp index cc42691950a3ff..e4dbb1a3a37940 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_fullyconnected.cpp +++ b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_fullyconnected.cpp @@ -2,253 +2,29 @@ // SPDX-License-Identifier: Apache-2.0 // -#include +#include "acl_fullyconnected.hpp" + #include +#include -#include "ov_optional.hpp" -#include "acl_fullyconnected.hpp" #include "acl_utils.hpp" -#include "nodes/executors/executor.hpp" -#include "nodes/executors/memory_arguments.hpp" -#include "utils/debug_capabilities.h" -#include "nodes/executors/debug_messages.hpp" -#include "nodes/executors/implementation_utils.hpp" -#include "nodes/convert.h" +#include "memory_desc/cpu_memory_desc_utils.h" #include "nodes/common/cpu_convert.h" #include "nodes/common/cpu_memcpy.h" #include "nodes/common/reorder_prim.h" -#include "memory_desc/cpu_memory_desc_utils.h" +#include "nodes/convert.h" +#include "nodes/executors/debug_messages.hpp" +#include "nodes/executors/executor.hpp" +#include "nodes/executors/implementation_utils.hpp" +#include "nodes/executors/memory_arguments.hpp" +#include "ov_optional.hpp" +#include "utils/cpu_utils.hpp" +#include "utils/debug_capabilities.h" namespace ov { namespace intel_cpu { -static VectorDims makeDummyInputDims(const Shape& inShape, const Shape& wShape) { - const auto& weightDims = wShape.getStaticDims(); - - auto inMinDims = inShape.getMinDims(); - auto inMaxDims = inShape.getMaxDims(); - inMinDims.back() = weightDims.back(); - inMaxDims.back() = weightDims.back(); - - return MemoryDescUtils::makeDummyShape(Shape(inMinDims, inMaxDims)).getStaticDims(); -} - -static VectorDims makeDummyOutputDims(const VectorDims& inShape, const VectorDims& wShape, const size_t out_rank) { - size_t activationRank = inShape.size(); - size_t channelRank = wShape.size() - 1; - // activation weight output_shape - // NCHW CoCHW NCo - // TNC CoC TNCo - // NC CoC NCo - VectorDims outputShape(out_rank, 1); - // set Co - outputShape.back() = wShape[0]; - // set batch dims - size_t batchRank = activationRank - channelRank; - size_t startIdx = out_rank - batchRank - 1; - for (size_t i = 0; i < batchRank; i++) { - outputShape[i + startIdx] = inShape[i]; - } - 
- return outputShape; -} - -static DnnlMemoryDescPtr makeTransposedWeightDescriptor(const DnnlMemoryDescPtr srcDesc, - const DnnlMemoryDescPtr dstDesc) { - const auto& weiDesc = srcDesc->getDnnlDesc(); - const auto reorderedWeiDesc = dnnl::memory::desc{weiDesc.get_dims(), weiDesc.get_data_type(), dnnl::memory::format_tag::ba}; - const auto transposedWeiDesc = reorderedWeiDesc.reshape(dstDesc->getDnnlDesc().get_dims()); - - return DnnlExtensionUtils::makeDescriptor(transposedWeiDesc); -} - -static ov::optional convertWeightPrecision(MemoryPtr input, MemoryPtr output, ov::element::Type weightPrecision) { - MemoryArgs memoryArgs; - memoryArgs[ARG_SRC] = input; - memoryArgs[ARG_DST] = output; - - auto aclWeightsConverter = std::make_shared(); - if (aclWeightsConverter->update(memoryArgs)) { - aclWeightsConverter->execute(memoryArgs); - return ov::optional(memoryArgs.at(ARG_DST)); - } - - if (!node::Convert::isSupportedDesc(input->getDesc()) || - !node::Convert::isSupportedDesc(output->getDesc())) { - return {}; - } - - auto data = static_cast(input->getData()); - std::vector tmpBuff; - tmpBuff.resize(output->getSize()); - cpu_convert(data, tmpBuff.data(), DnnlExtensionUtils::DataTypeToElementType(input->getDataType()), - weightPrecision, input->getSize() / input->getDesc().getPrecision().size()); - - return ov::optional(std::make_shared(output->getPrimitive().get_engine(), - output->getDesc().cloneWithNewPrecision(weightPrecision), - tmpBuff.data())); -} - -static ov::optional reorderDataFallback(MemoryPtr input, MemoryPtr output, ExecutorContext::CPtr context) { - if (output->getDataType() == input->getDataType()) { - return {}; - } - const auto inPrc = DnnlExtensionUtils::DataTypeToElementType(input->getDataType()); - auto convertedDstMemoryDesc = output->getDesc().cloneWithNewPrecision(inPrc); - dnnl::reorder reorderWithoutConvert = getReorderPrim(context->getRuntimeCache(), - output->getPrimitive().get_engine(), - input->getPrimitive().get_desc(), - MemoryDescUtils::convertToDnnlMemoryDesc(convertedDstMemoryDesc)->getDnnlDesc()); - - if (reorderWithoutConvert && parse_impl_name(reorderWithoutConvert.get_primitive_desc()->impl()->name()) != ref_any) { - auto convertOutput = convertWeightPrecision(input, output, inPrc); - if (!convertOutput) { - return {}; - } - input = *convertOutput; - - if (reorderWithoutConvert) { - dnnl::stream loc_stream(output->getPrimitive().get_engine(), dnnl::stream::flags::in_order); - reorderWithoutConvert.execute(loc_stream, {{DNNL_ARG_FROM, input->getPrimitive()}, {DNNL_ARG_TO, output->getPrimitive()}}); - return ov::optional(output); - } - } - return {}; -} - -static MemoryPtr reorderData(DnnlMemoryDescPtr srcWeightDesc, - DnnlMemoryDescPtr dstWeightDesc, - MemoryCPtr weightsMem, - ExecutorContext::CPtr context) { - MemoryPtr input = std::make_shared(context->getEngine(), srcWeightDesc, weightsMem->getData()); - MemoryPtr output = std::make_shared(context->getEngine(), dstWeightDesc); - if (!input->getDesc().isDefined() || !output->getDesc().isDefined()) - OPENVINO_THROW("Can't reorder data with dynamic shapes"); - - if (input->getShape().hasZeroDims() || output->getShape().hasZeroDims()) { - return output; - } - - if (input->getDesc().isCompatible(output->getDesc())) { - auto srcPtr = static_cast(input->getData()); - auto dstPtr = static_cast(output->getData()); - auto copySize = output->getSize(); - cpu_memcpy(dstPtr, srcPtr, copySize); - return output; - } - - // try directly reorder - auto engine = output->getPrimitive().get_engine(); - dnnl::reorder 
directReorder = getReorderPrim(context->getRuntimeCache(), - engine, - input->getPrimitive().get_desc(), - output->getPrimitive().get_desc()); - - if (!directReorder || parse_impl_name(directReorder.get_primitive_desc()->impl()->name()) == ref_any) { - // try precision conversion then do the reorder - auto fallbackOutput = reorderDataFallback(input, output, context); - if (fallbackOutput) { - return *fallbackOutput; - } - } - // if precision conversion does not work then do direct reference reorder - if (directReorder) { - dnnl::stream loc_stream(engine, dnnl::stream::flags::in_order); - directReorder.execute(loc_stream, {{DNNL_ARG_FROM, input->getPrimitive()}, {DNNL_ARG_TO, output->getPrimitive()}}); - } else { - OPENVINO_THROW("Could not make onednn reorder."); - } - return output; -} - -static MemoryPtr reorderWeights(const MemoryArgs &memory, - const ExecutorContext::CPtr context, - ACLFCAttrs& aclfcAttrs, - DnnlMemoryDescPtr dnnlSrcDesc, - DnnlMemoryDescPtr dnnlDstDesc) { - auto create = [&]() { - MemoryPtr weightsMemory = memory.at(ARG_WEI); - if (aclfcAttrs.isWeightsRepacked || aclfcAttrs.isConvertedWeights) { - weightsMemory = reorderData(dnnlSrcDesc, dnnlDstDesc, memory.at(ARG_WEI), context); - DEBUG_LOG("ACLFullyConnectedExecutor: cache miss, perform packing"); - } - return weightsMemory; - }; - - auto weightCache = context->getWeightsCache(); - if (weightCache != nullptr) { - const auto& wgtDims = memory.at(ARG_WEI)->getStaticDims(); - const auto N = wgtDims[0]; - const auto K = wgtDims[1]; - std::string format = "fc_acl_" + std::to_string(N) + "_" + std::to_string(K); - const std::string string_hash = format + "_" + std::to_string(memory.at(ARG_WEI)->getSize()) + "_" + - std::to_string(reinterpret_cast(memory.at(ARG_WEI)->getData())); - DEBUG_LOG("ACLFullyConnectedExecutor: findOrCreate, string_hash: ", string_hash); - return *weightCache->findOrCreate(string_hash, create); - } - - DEBUG_LOG("ACLFullyConnectedExecutor: Weights cache is not available"); - return create(); -} - -static MemoryPtr prepareWeightMemory(const MemoryArgs &memory, - const ExecutorContext::CPtr context, - const FCAttrs &attrs, - ACLFCAttrs& aclfcAttrs, - const PostOps &postOps, - arm_compute::WeightFormat& expectedWeightFormat, - arm_compute::TensorInfo& weiTensorInfo) { - MemoryArgs memoryArgs; - memoryArgs[ARG_BIAS] = memory.at(ARG_BIAS); - memoryArgs[ARG_WEI] = memory.at(ARG_WEI); - if (memory.at(ARG_SRC_0)->getShape().isDynamic()) { - const auto& inShape = memory.at(ARG_SRC_0)->getShape(); - const auto& wShape = memory.at(ARG_WEI)->getShape(); - const auto& inDymmyDims = makeDummyInputDims(inShape, wShape); - const auto& outDymmyDims = makeDummyOutputDims(inDymmyDims, wShape.getStaticDims(), memory.at(ARG_DST)->getShape().getRank()); - memoryArgs[ARG_SRC_0] = std::make_shared(context->getEngine(), - memory.at(ARG_SRC_0)->getDescPtr()->cloneWithNewDims(inDymmyDims)); - memoryArgs[ARG_DST] = std::make_shared(context->getEngine(), - memory.at(ARG_DST)->getDescPtr()->cloneWithNewDims(outDymmyDims)); - } else { - memoryArgs[ARG_SRC_0] = memory.at(ARG_SRC_0); - memoryArgs[ARG_DST] = memory.at(ARG_DST); - } - // TODO: ACLWeightFormatGenerator should be replaced with Reorder executor - // that calls ACL NEReorder + NETranspose or dnnl::reorder depending on backend availability - auto aclWeightsRepack = std::make_shared(attrs, postOps, memoryArgs); - bool isNeededReorder = aclWeightsRepack->update(memoryArgs); - expectedWeightFormat = isNeededReorder ? 
aclWeightsRepack->getOptImplWeightFormat() : arm_compute::WeightFormat::UNSPECIFIED; - weiTensorInfo = aclWeightsRepack->getTensorInfo(ACLArgs::ACL_WEI); - - MemoryPtr dstMemPtr = std::make_shared(context->getEngine(), - memory.at(ARG_WEI)->getDescPtr()->cloneWithNewPrecision(aclfcAttrs.inputPrecision)); - auto dstDesc = dstMemPtr->getDescPtr(); - auto dnnlDstDesc = MemoryDescUtils::convertToDnnlMemoryDesc(dstDesc); - auto weiDesc = memory.at(ARG_WEI)->getDescPtr(); - auto dnnlSrcDesc = MemoryDescUtils::convertToDnnlMemoryDesc(weiDesc); - - if (isNeededReorder) { - dnnl::impl::dim_t o_dim = 0; - dnnl::impl::dim_t inner_dim = 1; - std::vector remaining_dims = {}; - auto weights_md_ = dnnlDstDesc->getDnnlDesc().get(); - dnnl::impl::cpu::acl::acl_utils::reorder_to_weight_format(weiTensorInfo, *weights_md_, expectedWeightFormat, - inner_dim, o_dim, remaining_dims, {}); - if (aclfcAttrs.weightsNonTransposed) { - dnnlSrcDesc = makeTransposedWeightDescriptor(dnnlSrcDesc, dnnlDstDesc); - } - aclfcAttrs.isWeightsRepacked = true; - return reorderWeights(memory, context, aclfcAttrs, dnnlSrcDesc, dnnlDstDesc); - } - if (!aclfcAttrs.weightsNonTransposed) { - dnnlDstDesc = makeTransposedWeightDescriptor(dnnlDstDesc, dnnlSrcDesc); - aclfcAttrs.isWeightsRepacked = true; - } - return reorderWeights(memory, context, aclfcAttrs, dnnlSrcDesc, dnnlDstDesc); -} - -static bool checkPostOps(const PostOps &postOps) { +static bool checkPostOps(const PostOps& postOps) { if (postOps.empty()) { return true; } @@ -263,12 +39,12 @@ static bool checkPostOps(const PostOps &postOps) { return false; } -static void initFCAttrs(const FCAttrs &attrs, +static void initFCAttrs(const FCAttrs& attrs, ACLTensorAttrs& aclTensorAttrs, ACLFCAttrs& aclfcAttrs, - const MemoryArgs &memory, + const MemoryArgs& memory, arm_compute::FullyConnectedLayerInfo& fullyConnectedLayerInfo, - const PostOps &postOps) { + const PostOps& postOps) { aclTensorAttrs.hasLayoutTypeNHWC = memory.at(ARG_SRC)->getDescPtr()->hasLayoutType(LayoutType::nspc); fullyConnectedLayerInfo.weights_trained_layout = getAclDataLayoutByMemoryDesc(memory.at(ARG_WEI)->getDescPtr()); aclfcAttrs.inputPrecision = memory.at(ARG_SRC)->getDescPtr()->getPrecision(); @@ -277,9 +53,10 @@ static void initFCAttrs(const FCAttrs &attrs, if (!postOps.empty() && checkPostOps(postOps)) { auto activation = std::dynamic_pointer_cast(postOps[0]); - fullyConnectedLayerInfo.activation_info = getActivationLayerInfo( - convertToEltwiseAlgorithm(activation->type()), - activation->alpha(), activation->beta(), activation->gamma()); + fullyConnectedLayerInfo.activation_info = getActivationLayerInfo(convertToEltwiseAlgorithm(activation->type()), + activation->alpha(), + activation->beta(), + activation->gamma()); } if (memory.at(ARG_SRC)->getPrecision() != memory.at(ARG_WEI)->getPrecision()) { @@ -287,69 +64,62 @@ static void initFCAttrs(const FCAttrs &attrs, } } -ACLFullyConnectedExecutor::ACLFullyConnectedExecutor(const FCAttrs &attrs, - const PostOps &postOps, - const MemoryArgs &memory, +ACLFullyConnectedExecutor::ACLFullyConnectedExecutor(const FCAttrs& attrs, + const PostOps& postOps, + const MemoryArgs& memory, const ExecutorContext::CPtr context) { initFCAttrs(attrs, aclTensorAttrs, aclfcAttrs, memory, fullyConnectedLayerInfo, postOps); - packedWeights = prepareWeightMemory(memory, context, attrs, aclfcAttrs, postOps, expectedWeightFormat, weiTensorInfo); + packedWeights = acl_fc_executor::prepareWeightMemory(memory, + context, + attrs, + aclfcAttrs, + postOps, + expectedWeightFormat, + 
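checkPostOps() above gates post-op fusion: the fully connected executor accepts at most one post-op, and only when it is an activation that ACL can express as an ActivationLayerInfo. A simplified sketch of that gate with stand-in types for the post-op classes:

#include <memory>
#include <vector>

struct PostOp { virtual ~PostOp() = default; };
struct ActivationPostOp : PostOp { /* type(), alpha(), beta(), gamma() in the real class */ };
using PostOps = std::vector<std::shared_ptr<PostOp>>;

// Accept an empty post-op list, or exactly one activation that can be fused.
bool checkPostOpsSketch(const PostOps& postOps) {
    if (postOps.empty())
        return true;
    if (postOps.size() != 1)
        return false;
    return std::dynamic_pointer_cast<ActivationPostOp>(postOps[0]) != nullptr;
}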
weiTensorInfo); } -bool ACLFullyConnectedExecutor::supports(const FCConfig &config) { +bool ACLFullyConnectedExecutor::supports(const FCConfig& config) { VERIFY(one_of(srcType(config), ov::element::f16, ov::element::f32), UNSUPPORTED_SRC_PRECISIONS); VERIFY(one_of(weiType(config), ov::element::f16, ov::element::f32), UNSUPPORTED_WEI_PRECISIONS); - VERIFY(postOpsNumbers(config) < 2, UNSUPPORTED_NUMBER_OF_POSTOPS); - VERIFY(checkPostOps(config.postOps), UNSUPPORTED_TYPE_OF_POSTOPS); - VERIFY(one_of(srcRank(config), 2U, 3U, 4U), UNSUPPORTED_SRC_RANK); - VERIFY(one_of(weiRank(config), 2U, 3U), UNSUPPORTED_WEI_RANK); + VERIFY(postOpsNumbers(config) < 2, UNSUPPORTED_NUMBER_OF_POSTOPS); + VERIFY(checkPostOps(config.postOps), UNSUPPORTED_TYPE_OF_POSTOPS); + VERIFY(one_of(srcRank(config), 2U, 3U, 4U), UNSUPPORTED_SRC_RANK); + VERIFY(one_of(weiRank(config), 2U, 3U), UNSUPPORTED_WEI_RANK); return true; } -static arm_compute::TensorShape normalizeDimsTo2D(const arm_compute::TensorShape shape) { - size_t norm_dim = std::accumulate(shape.begin() + 1, shape.end(), 1, std::multiplies()); - return arm_compute::TensorShape(shape[0], norm_dim); -} - -static void updateFCTensorsShapes(ACLShapes& aclMemoryShapes) { - aclMemoryShapes[ACLArgs::ACL_WEI] = normalizeDimsTo2D(aclMemoryShapes[ACLArgs::ACL_WEI]); - aclMemoryShapes[ACLArgs::ACL_SRC_0] = normalizeDimsTo2D(aclMemoryShapes[ACLArgs::ACL_SRC_0]); - aclMemoryShapes[ACLArgs::ACL_DST] = normalizeDimsTo2D(aclMemoryShapes[ACLArgs::ACL_DST]); - std::swap(aclMemoryShapes[ACLArgs::ACL_WEI][0], aclMemoryShapes[ACLArgs::ACL_WEI][1]); -} - void ACLFullyConnectedExecutor::updateTensorsShapes(ACLShapes& aclMemoryShapes) { - updateFCTensorsShapes(aclMemoryShapes); + acl_fc_executor::updateFCTensorsShapes(aclMemoryShapes); } -arm_compute::Status ACLFullyConnectedExecutor::validateTensorsInfo(const ACLInfos & aclMemoryInfos) { +arm_compute::Status ACLFullyConnectedExecutor::validateTensorsInfo(const ACLInfos& aclMemoryInfos) { if (aclfcAttrs.isConvertedWeights) { aclMemoryInfos[ACLArgs::ACL_WEI]->set_data_type(aclMemoryInfos[ACLArgs::ACL_SRC_0]->data_type()); } int ic_total = aclMemoryInfos[ACLArgs::ACL_SRC_0]->dimension(0); return arm_compute::NEFullyConnectedLayer::validate( - aclMemoryInfos[ACLArgs::ACL_SRC_0].get(), - &weiTensorInfo, - aclMemoryInfos[ACLArgs::ACL_BIAS].get(), - aclMemoryInfos[ACLArgs::ACL_DST].get(), - fullyConnectedLayerInfo, - expectedWeightFormat == arm_compute::WeightFormat::UNSPECIFIED ? - arm_compute::WeightsInfo() : - arm_compute::WeightsInfo(false, 1, 1, ic_total, false, expectedWeightFormat)); + aclMemoryInfos[ACLArgs::ACL_SRC_0].get(), + &weiTensorInfo, + aclMemoryInfos[ACLArgs::ACL_BIAS].get(), + aclMemoryInfos[ACLArgs::ACL_DST].get(), + fullyConnectedLayerInfo, + expectedWeightFormat == arm_compute::WeightFormat::UNSPECIFIED + ? 
arm_compute::WeightsInfo() + : arm_compute::WeightsInfo(false, 1, 1, ic_total, false, expectedWeightFormat)); } -ACLFunction ACLFullyConnectedExecutor::configureFunction(const ACLTensors & aclMemoryTensors) { +ACLFunction ACLFullyConnectedExecutor::configureFunction(const ACLTensors& aclMemoryTensors) { auto neFC = std::make_unique(); aclMemoryTensors[ACLArgs::ACL_WEI]->allocator()->init(weiTensorInfo); int icTotal = aclMemoryTensors[ACLArgs::ACL_WEI]->info()->dimension(0); - neFC->configure( - aclMemoryTensors[ACLArgs::ACL_SRC_0].get(), - aclMemoryTensors[ACLArgs::ACL_WEI].get(), - aclMemoryTensors[ACLArgs::ACL_BIAS].get(), - aclMemoryTensors[ACLArgs::ACL_DST].get(), - fullyConnectedLayerInfo, - expectedWeightFormat == arm_compute::WeightFormat::UNSPECIFIED ? - arm_compute::WeightsInfo() : - arm_compute::WeightsInfo(false, 1, 1, icTotal, false, expectedWeightFormat)); + neFC->configure(aclMemoryTensors[ACLArgs::ACL_SRC_0].get(), + aclMemoryTensors[ACLArgs::ACL_WEI].get(), + aclMemoryTensors[ACLArgs::ACL_BIAS].get(), + aclMemoryTensors[ACLArgs::ACL_DST].get(), + fullyConnectedLayerInfo, + expectedWeightFormat == arm_compute::WeightFormat::UNSPECIFIED + ? arm_compute::WeightsInfo() + : arm_compute::WeightsInfo(false, 1, 1, icTotal, false, expectedWeightFormat)); // TODO: get rid of those flags and decide whether to import memory or not just based on input type if (aclfcAttrs.isWeightsRepacked || aclfcAttrs.isConvertedWeights) { aclTensorAttrs.memoryUsageIndicator[ACLArgs::ACL_WEI] = false; @@ -358,48 +128,5 @@ ACLFunction ACLFullyConnectedExecutor::configureFunction(const ACLTensors & aclM return neFC; } -arm_compute::Status acl_fc_executor::ACLWeightsConverter::validateTensorsInfo(const ACLInfos &aclMemoryInfos) { - return arm_compute::NECast::validate(aclMemoryInfos[ACLArgs::ACL_SRC_0].get(), - aclMemoryInfos[ACLArgs::ACL_DST].get(), - arm_compute::ConvertPolicy::SATURATE); -} - -ACLFunction acl_fc_executor::ACLWeightsConverter::configureFunction(const ACLTensors &aclMemoryTensors) { - auto neCast = std::make_unique(); - neCast->configure(aclMemoryTensors[ACLArgs::ACL_SRC_0].get(), - aclMemoryTensors[ACLArgs::ACL_DST].get(), - arm_compute::ConvertPolicy::SATURATE); - return neCast; -} - -acl_fc_executor::ACLWeightFormatGenerator::ACLWeightFormatGenerator(const FCAttrs &attrs, - const PostOps &postOps, - const MemoryArgs &memory) { - initFCAttrs(attrs, aclTensorAttrs, aclfcAttrs, memory, fullyConnectedLayerInfo, postOps); -} - -void acl_fc_executor::ACLWeightFormatGenerator::updateTensorsShapes(ACLShapes &aclMemoryShapes) { - updateFCTensorsShapes(aclMemoryShapes); -} - -arm_compute::Status acl_fc_executor::ACLWeightFormatGenerator::validateTensorsInfo(const ACLInfos &aclMemoryInfos) { - if (aclfcAttrs.isConvertedWeights) { - aclMemoryInfos[ACLArgs::ACL_WEI]->set_data_type(aclMemoryInfos[ACLArgs::ACL_SRC_0]->data_type()); - } - int icTotal = aclMemoryInfos[ACLArgs::ACL_SRC_0]->dimension(0); - return arm_compute::NEFullyConnectedLayer::has_opt_impl( - expectedWeightFormat, - aclMemoryInfos[ACLArgs::ACL_SRC_0].get(), - aclMemoryInfos[ACLArgs::ACL_WEI].get(), - aclMemoryInfos[ACLArgs::ACL_BIAS].get(), - aclMemoryInfos[ACLArgs::ACL_DST].get(), - fullyConnectedLayerInfo, - arm_compute::WeightsInfo(false, 1, 1, icTotal, false, arm_compute::WeightFormat::ANY)); -} - -ACLFunction acl_fc_executor::ACLWeightFormatGenerator::configureFunction(const ACLTensors &aclMemoryTensors) { - return std::make_unique(); -} - -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace 
ov diff --git a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_fullyconnected.hpp b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_fullyconnected.hpp index fcbcb1475efa15..6c1a2f0576e283 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_fullyconnected.hpp +++ b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_fullyconnected.hpp @@ -1,65 +1,30 @@ -// Copyright (C) 2018-2024 Intel Corporation +// Copyright (C) 2024 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // #pragma once #include "acl_common_executor.hpp" +#include "acl_fullyconnected_utils.hpp" #include "nodes/executors/fullyconnected_config.hpp" namespace ov { namespace intel_cpu { -struct ACLFCAttrs { - ov::element::Type inputPrecision; - bool isConvertedWeights = false; - bool isWeightsRepacked = false; - bool weightsNonTransposed; -}; - -namespace acl_fc_executor { - -class ACLWeightsConverter : public ACLCommonExecutor { -public: - ACLWeightsConverter() = default; - void updateTensorsShapes(ACLShapes& aclMemoryShapes) override {} - arm_compute::Status validateTensorsInfo(const ACLInfos & aclMemoryInfos) override; - ACLFunction configureFunction(const ACLTensors & aclMemoryTensors) override; -}; - -class ACLWeightFormatGenerator : public ACLCommonExecutor { -public: - ACLWeightFormatGenerator(const FCAttrs& attrs, - const PostOps& postOps, - const MemoryArgs& memory); - void updateTensorsShapes(ACLShapes& aclMemoryShapes) override; - arm_compute::Status validateTensorsInfo(const ACLInfos & aclMemoryInfos) override; - ACLFunction configureFunction(const ACLTensors & aclMemoryTensors) override; - arm_compute::WeightFormat getOptImplWeightFormat() { - return expectedWeightFormat; - } -private: - arm_compute::FullyConnectedLayerInfo fullyConnectedLayerInfo; - ACLFCAttrs aclfcAttrs; - arm_compute::WeightFormat expectedWeightFormat; -}; - -} // namespace acl_fc_executor - class ACLFullyConnectedExecutor : public ACLCommonExecutor { public: ACLFullyConnectedExecutor(const FCAttrs& attrs, - const PostOps& postOps, - const MemoryArgs& memory, - const ExecutorContext::CPtr context); + const PostOps& postOps, + const MemoryArgs& memory, + const ExecutorContext::CPtr context); static bool supports(const FCConfig& config); void updateTensorsShapes(ACLShapes& aclMemoryShapes) override; - arm_compute::Status validateTensorsInfo(const ACLInfos & aclMemoryInfos) override; + arm_compute::Status validateTensorsInfo(const ACLInfos& aclMemoryInfos) override; - ACLFunction configureFunction(const ACLTensors & aclMemoryTensors) override; + ACLFunction configureFunction(const ACLTensors& aclMemoryTensors) override; private: arm_compute::FullyConnectedLayerInfo fullyConnectedLayerInfo; diff --git a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_fullyconnected_utils.cpp b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_fullyconnected_utils.cpp new file mode 100644 index 00000000000000..0c3e208381497f --- /dev/null +++ b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_fullyconnected_utils.cpp @@ -0,0 +1,367 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// +#include +#include + +#include "acl_fullyconnected.hpp" +#include "acl_utils.hpp" +#include "memory_desc/cpu_memory_desc_utils.h" +#include "nodes/common/cpu_convert.h" +#include "nodes/common/cpu_memcpy.h" +#include "nodes/common/reorder_prim.h" +#include "nodes/convert.h" +#include "nodes/executors/executor.hpp" +#include "nodes/executors/memory_arguments.hpp" +#include "utils/cpu_utils.hpp" +#include 
"utils/debug_capabilities.h" + +namespace ov { +namespace intel_cpu { + +VectorDims acl_fc_executor::makeDummyInputDims(const Shape& inShape, const Shape& wShape) { + const auto& weightDims = wShape.getStaticDims(); + + auto inMinDims = inShape.getMinDims(); + auto inMaxDims = inShape.getMaxDims(); + inMinDims.back() = weightDims.back(); + inMaxDims.back() = weightDims.back(); + + return MemoryDescUtils::makeDummyShape(Shape(inMinDims, inMaxDims)).getStaticDims(); +} + +VectorDims acl_fc_executor::makeDummyOutputDims(const VectorDims& inShape, + const VectorDims& wShape, + const size_t out_rank) { + size_t activationRank = inShape.size(); + size_t channelRank = wShape.size() - 1; + // activation weight output_shape + // NCHW CoCHW NCo + // TNC CoC TNCo + // NC CoC NCo + VectorDims outputShape(out_rank, 1); + // set Co + outputShape.back() = wShape[0]; + // set batch dims + size_t batchRank = activationRank - channelRank; + size_t startIdx = out_rank - batchRank - 1; + for (size_t i = 0; i < batchRank; i++) { + outputShape[i + startIdx] = inShape[i]; + } + + return outputShape; +} + +DnnlMemoryDescPtr acl_fc_executor::makeTransposedWeightDescriptor(const DnnlMemoryDescPtr srcDesc, + const DnnlMemoryDescPtr dstDesc) { + const auto& weiDesc = srcDesc->getDnnlDesc(); + dnnl::memory::dims wgtDims2D = reshapeDownToRank<2>(weiDesc.get_dims()); + const auto reorderedWeiDesc = dnnl::memory::desc{wgtDims2D, weiDesc.get_data_type(), dnnl::memory::format_tag::ba}; + const auto transposedWeiDesc = reorderedWeiDesc.reshape(dstDesc->getDnnlDesc().get_dims()); + + return DnnlExtensionUtils::makeDescriptor(transposedWeiDesc); +} + +ov::optional acl_fc_executor::convertWeightPrecision(MemoryPtr input, + MemoryPtr output, + ov::element::Type weightPrecision) { + MemoryArgs memoryArgs; + memoryArgs[ARG_SRC] = input; + memoryArgs[ARG_DST] = output; + + auto aclWeightsConverter = std::make_shared(); + if (aclWeightsConverter->update(memoryArgs)) { + aclWeightsConverter->execute(memoryArgs); + return ov::optional(memoryArgs.at(ARG_DST)); + } + + if (!node::Convert::isSupportedDesc(input->getDesc()) || !node::Convert::isSupportedDesc(output->getDesc())) { + return {}; + } + + auto data = static_cast(input->getData()); + std::vector tmpBuff; + tmpBuff.resize(output->getSize()); + cpu_convert(data, + tmpBuff.data(), + DnnlExtensionUtils::DataTypeToElementType(input->getDataType()), + weightPrecision, + input->getSize() / input->getDesc().getPrecision().size()); + + return ov::optional(std::make_shared(output->getPrimitive().get_engine(), + output->getDesc().cloneWithNewPrecision(weightPrecision), + tmpBuff.data())); +} + +ov::optional acl_fc_executor::reorderDataFallback(MemoryPtr input, + MemoryPtr output, + ExecutorContext::CPtr context) { + if (output->getDataType() == input->getDataType()) { + return {}; + } + const auto inPrc = DnnlExtensionUtils::DataTypeToElementType(input->getDataType()); + auto convertedDstMemoryDesc = output->getDesc().cloneWithNewPrecision(inPrc); + dnnl::reorder reorderWithoutConvert = + getReorderPrim(context->getRuntimeCache(), + output->getPrimitive().get_engine(), + input->getPrimitive().get_desc(), + MemoryDescUtils::convertToDnnlMemoryDesc(convertedDstMemoryDesc)->getDnnlDesc()); + + if (reorderWithoutConvert && + parse_impl_name(reorderWithoutConvert.get_primitive_desc()->impl()->name()) != ref_any) { + auto convertOutput = convertWeightPrecision(input, output, inPrc); + if (!convertOutput) { + return {}; + } + input = *convertOutput; + + if (reorderWithoutConvert) { + 
dnnl::stream loc_stream(output->getPrimitive().get_engine(), dnnl::stream::flags::in_order); + reorderWithoutConvert.execute( + loc_stream, + {{DNNL_ARG_FROM, input->getPrimitive()}, {DNNL_ARG_TO, output->getPrimitive()}}); + return ov::optional(output); + } + } + return {}; +} + +MemoryPtr acl_fc_executor::reorderData(DnnlMemoryDescPtr srcWeightDesc, + DnnlMemoryDescPtr dstWeightDesc, + MemoryCPtr weightsMem, + ExecutorContext::CPtr context) { + MemoryPtr input = std::make_shared(context->getEngine(), srcWeightDesc, weightsMem->getData()); + MemoryPtr output = std::make_shared(context->getEngine(), dstWeightDesc); + if (!input->getDesc().isDefined() || !output->getDesc().isDefined()) + OPENVINO_THROW("Can't reorder data with dynamic shapes"); + + if (input->getShape().hasZeroDims() || output->getShape().hasZeroDims()) { + return output; + } + + if (input->getDesc().isCompatible(output->getDesc())) { + auto srcPtr = static_cast(input->getData()); + auto dstPtr = static_cast(output->getData()); + auto copySize = output->getSize(); + cpu_memcpy(dstPtr, srcPtr, copySize); + return output; + } + + // try directly reorder + auto engine = output->getPrimitive().get_engine(); + dnnl::reorder directReorder = getReorderPrim(context->getRuntimeCache(), + engine, + input->getPrimitive().get_desc(), + output->getPrimitive().get_desc()); + + if (!directReorder || parse_impl_name(directReorder.get_primitive_desc()->impl()->name()) == ref_any) { + // try precision conversion then do the reorder + auto fallbackOutput = reorderDataFallback(input, output, context); + if (fallbackOutput) { + return *fallbackOutput; + } + } + // if precision conversion does not work then do direct reference reorder + if (directReorder) { + dnnl::stream loc_stream(engine, dnnl::stream::flags::in_order); + directReorder.execute(loc_stream, + {{DNNL_ARG_FROM, input->getPrimitive()}, {DNNL_ARG_TO, output->getPrimitive()}}); + } else { + OPENVINO_THROW("Could not make onednn reorder."); + } + return output; +} + +MemoryPtr acl_fc_executor::reorderWeights(const MemoryArgs& memory, + const ExecutorContext::CPtr context, + ACLFCAttrs& aclfcAttrs, + DnnlMemoryDescPtr dnnlSrcDesc, + DnnlMemoryDescPtr dnnlDstDesc) { + auto create = [&]() { + MemoryPtr weightsMemory = memory.at(ARG_WEI); + if (aclfcAttrs.isWeightsRepacked || aclfcAttrs.isConvertedWeights) { + weightsMemory = reorderData(dnnlSrcDesc, dnnlDstDesc, memory.at(ARG_WEI), context); + DEBUG_LOG("ACLFullyConnectedExecutor: cache miss, perform packing"); + } + return weightsMemory; + }; + + auto weightCache = context->getWeightsCache(); + if (weightCache != nullptr) { + const auto& wgtDims = memory.at(ARG_WEI)->getStaticDims(); + const auto N = wgtDims[0]; + const auto K = wgtDims[1]; + std::string format = "fc_acl_" + std::to_string(N) + "_" + std::to_string(K); + const std::string string_hash = format + "_" + std::to_string(memory.at(ARG_WEI)->getSize()) + "_" + + std::to_string(reinterpret_cast(memory.at(ARG_WEI)->getData())); + DEBUG_LOG("ACLFullyConnectedExecutor: findOrCreate, string_hash: ", string_hash); + return *weightCache->findOrCreate(string_hash, create); + } + + DEBUG_LOG("ACLFullyConnectedExecutor: Weights cache is not available"); + return create(); +} + +MemoryPtr acl_fc_executor::prepareWeightMemory(const MemoryArgs& memory, + const ExecutorContext::CPtr context, + const FCAttrs& attrs, + ACLFCAttrs& aclfcAttrs, + const PostOps& postOps, + arm_compute::WeightFormat& expectedWeightFormat, + arm_compute::TensorInfo& weiTensorInfo) { + MemoryArgs memoryArgs; + 
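The direct-reorder branch of reorderData() relies on the standard oneDNN reorder primitive executed on an in-order stream with DNNL_ARG_FROM/DNNL_ARG_TO arguments. A self-contained example of that primitive in isolation; the dimensions and ab/ba format tags are made up for illustration:

#include <vector>
#include "oneapi/dnnl/dnnl.hpp"

int main() {
    dnnl::engine eng(dnnl::engine::kind::cpu, 0);
    dnnl::stream strm(eng, dnnl::stream::flags::in_order);

    dnnl::memory::dims dims = {4, 8};
    auto src_md = dnnl::memory::desc(dims, dnnl::memory::data_type::f32, dnnl::memory::format_tag::ab);
    auto dst_md = dnnl::memory::desc(dims, dnnl::memory::data_type::f32, dnnl::memory::format_tag::ba);

    std::vector<float> src_buf(32, 1.0f), dst_buf(32, 0.0f);
    dnnl::memory src(src_md, eng, src_buf.data());
    dnnl::memory dst(dst_md, eng, dst_buf.data());

    dnnl::reorder reorder_prim(src, dst);  // transposes ab -> ba while copying
    reorder_prim.execute(strm, {{DNNL_ARG_FROM, src}, {DNNL_ARG_TO, dst}});
    strm.wait();
    return 0;
}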
memoryArgs[ARG_BIAS] = memory.at(ARG_BIAS); + memoryArgs[ARG_WEI] = memory.at(ARG_WEI); + + auto originalWeightsDesc = memory.at(ARG_WEI)->getDescPtr(); + // normalize weights to 2D + const auto& wgtDims = originalWeightsDesc->getShape().getStaticDims(); + const VectorDims wgtDims2D = reshapeDownToRank<2>(wgtDims); + originalWeightsDesc = std::make_shared(originalWeightsDesc->getPrecision(), Shape{wgtDims2D}); + auto dnnlSrcDesc = MemoryDescUtils::convertToDnnlMemoryDesc(originalWeightsDesc); + auto dstDesc = originalWeightsDesc->cloneWithNewPrecision(aclfcAttrs.inputPrecision); + auto dnnlDstDesc = MemoryDescUtils::convertToDnnlMemoryDesc(dstDesc); + + if (memory.at(ARG_SRC_0)->getShape().isDynamic()) { + const auto& inShape = memory.at(ARG_SRC_0)->getShape(); + const auto& wShape = originalWeightsDesc->getShape(); + const auto& inDymmyDims = makeDummyInputDims(inShape, wShape); + const auto& outDymmyDims = + makeDummyOutputDims(inDymmyDims, wShape.getStaticDims(), memory.at(ARG_DST)->getShape().getRank()); + memoryArgs[ARG_SRC_0] = + std::make_shared(context->getEngine(), + memory.at(ARG_SRC_0)->getDescPtr()->cloneWithNewDims(inDymmyDims)); + memoryArgs[ARG_DST] = + std::make_shared(context->getEngine(), + memory.at(ARG_DST)->getDescPtr()->cloneWithNewDims(outDymmyDims)); + } else { + memoryArgs[ARG_SRC_0] = memory.at(ARG_SRC_0); + memoryArgs[ARG_DST] = memory.at(ARG_DST); + } + // TODO: ACLWeightFormatGenerator should be replaced with Reorder executor + // that calls ACL NEReorder + NETranspose or dnnl::reorder depending on backend availability + auto aclWeightsRepack = std::make_shared(attrs, postOps, memoryArgs); + bool isNeededReorder = aclWeightsRepack->update(memoryArgs); + expectedWeightFormat = + isNeededReorder ? aclWeightsRepack->getOptImplWeightFormat() : arm_compute::WeightFormat::UNSPECIFIED; + weiTensorInfo = aclWeightsRepack->getTensorInfo(ACLArgs::ACL_WEI); + + if (isNeededReorder) { + dnnl::impl::dim_t o_dim = 0; + dnnl::impl::dim_t inner_dim = 1; + std::vector remaining_dims = {}; + auto weights_md_ = dnnlDstDesc->getDnnlDesc().get(); + dnnl::impl::cpu::acl::acl_utils::reorder_to_weight_format(weiTensorInfo, + *weights_md_, + expectedWeightFormat, + inner_dim, + o_dim, + remaining_dims, + {}); + if (aclfcAttrs.weightsNonTransposed) { + dnnlSrcDesc = makeTransposedWeightDescriptor(dnnlSrcDesc, dnnlDstDesc); + } + aclfcAttrs.isWeightsRepacked = true; + return reorderWeights(memory, context, aclfcAttrs, dnnlSrcDesc, dnnlDstDesc); + } + if (!aclfcAttrs.weightsNonTransposed) { + dnnlDstDesc = makeTransposedWeightDescriptor(dnnlDstDesc, dnnlSrcDesc); + aclfcAttrs.isWeightsRepacked = true; + } + return reorderWeights(memory, context, aclfcAttrs, dnnlSrcDesc, dnnlDstDesc); +} + +static bool checkPostOps(const PostOps& postOps) { + // Add postops + if (!postOps.empty() && postOps.size() == 1) { + if (const auto activation = std::dynamic_pointer_cast(postOps[0])) { + if (checkActivationLayerInfo(convertToEltwiseAlgorithm(activation->type()))) { + return true; + } + } + } + return false; +} + +static void initFCAttrs(const FCAttrs& attrs, + ACLTensorAttrs& aclTensorAttrs, + ACLFCAttrs& aclfcAttrs, + const MemoryArgs& memory, + arm_compute::FullyConnectedLayerInfo& fullyConnectedLayerInfo, + const PostOps& postOps) { + aclTensorAttrs.hasLayoutTypeNHWC = memory.at(ARG_SRC)->getDescPtr()->hasLayoutType(LayoutType::nspc); + fullyConnectedLayerInfo.weights_trained_layout = getAclDataLayoutByMemoryDesc(memory.at(ARG_WEI)->getDescPtr()); + aclfcAttrs.inputPrecision = 
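reorderWeights() above deduplicates repacked weights through the context's weights cache: the key combines an "fc_acl_N_K" format tag with the blob size and the source data pointer, and findOrCreate() runs the packing lambda only on a cache miss. The key construction reduces to string formatting like this (a sketch; N, K, size and pointer come from the weight memory object):

#include <cstdint>
#include <string>

// Illustrative reconstruction of the cache-key format used above.
std::string makeFcAclWeightsKey(size_t N, size_t K, size_t blobSize, const void* dataPtr) {
    std::string format = "fc_acl_" + std::to_string(N) + "_" + std::to_string(K);
    return format + "_" + std::to_string(blobSize) + "_" +
           std::to_string(reinterpret_cast<uint64_t>(dataPtr));
}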
memory.at(ARG_SRC)->getDescPtr()->getPrecision(); + fullyConnectedLayerInfo.transpose_weights = false; + aclfcAttrs.weightsNonTransposed = attrs.weightsNonTransposed; + + if (checkPostOps(postOps)) { + auto activation = std::dynamic_pointer_cast(postOps[0]); + fullyConnectedLayerInfo.activation_info = getActivationLayerInfo(convertToEltwiseAlgorithm(activation->type()), + activation->alpha(), + activation->beta(), + activation->gamma()); + } + + if (memory.at(ARG_SRC)->getPrecision() != memory.at(ARG_WEI)->getPrecision()) { + aclfcAttrs.isConvertedWeights = true; + } +} + +arm_compute::TensorShape acl_fc_executor::normalizeDimsTo2D(const arm_compute::TensorShape shape) { + size_t norm_dim = std::accumulate(shape.begin() + 1, shape.end(), 1, std::multiplies()); + return arm_compute::TensorShape(shape[0], norm_dim); +} + +void acl_fc_executor::updateFCTensorsShapes(ACLShapes& aclMemoryShapes) { + aclMemoryShapes[ACLArgs::ACL_WEI] = normalizeDimsTo2D(aclMemoryShapes[ACLArgs::ACL_WEI]); + aclMemoryShapes[ACLArgs::ACL_SRC_0] = normalizeDimsTo2D(aclMemoryShapes[ACLArgs::ACL_SRC_0]); + aclMemoryShapes[ACLArgs::ACL_DST] = normalizeDimsTo2D(aclMemoryShapes[ACLArgs::ACL_DST]); + std::swap(aclMemoryShapes[ACLArgs::ACL_WEI][0], aclMemoryShapes[ACLArgs::ACL_WEI][1]); +} + +arm_compute::Status acl_fc_executor::ACLWeightsConverter::validateTensorsInfo(const ACLInfos& aclMemoryInfos) { + return arm_compute::NECast::validate(aclMemoryInfos[ACLArgs::ACL_SRC_0].get(), + aclMemoryInfos[ACLArgs::ACL_DST].get(), + arm_compute::ConvertPolicy::SATURATE); +} + +ACLFunction acl_fc_executor::ACLWeightsConverter::configureFunction(const ACLTensors& aclMemoryTensors) { + auto neCast = std::make_unique(); + neCast->configure(aclMemoryTensors[ACLArgs::ACL_SRC_0].get(), + aclMemoryTensors[ACLArgs::ACL_DST].get(), + arm_compute::ConvertPolicy::SATURATE); + return neCast; +} + +acl_fc_executor::ACLWeightFormatGenerator::ACLWeightFormatGenerator(const FCAttrs& attrs, + const PostOps& postOps, + const MemoryArgs& memory) { + initFCAttrs(attrs, aclTensorAttrs, aclfcAttrs, memory, fullyConnectedLayerInfo, postOps); +} + +void acl_fc_executor::ACLWeightFormatGenerator::updateTensorsShapes(ACLShapes& aclMemoryShapes) { + updateFCTensorsShapes(aclMemoryShapes); +} + +arm_compute::Status acl_fc_executor::ACLWeightFormatGenerator::validateTensorsInfo(const ACLInfos& aclMemoryInfos) { + if (aclfcAttrs.isConvertedWeights) { + aclMemoryInfos[ACLArgs::ACL_WEI]->set_data_type(aclMemoryInfos[ACLArgs::ACL_SRC_0]->data_type()); + } + int icTotal = aclMemoryInfos[ACLArgs::ACL_SRC_0]->dimension(0); + return arm_compute::NEFullyConnectedLayer::has_opt_impl( + expectedWeightFormat, + aclMemoryInfos[ACLArgs::ACL_SRC_0].get(), + aclMemoryInfos[ACLArgs::ACL_WEI].get(), + aclMemoryInfos[ACLArgs::ACL_BIAS].get(), + aclMemoryInfos[ACLArgs::ACL_DST].get(), + fullyConnectedLayerInfo, + arm_compute::WeightsInfo(false, 1, 1, icTotal, false, arm_compute::WeightFormat::ANY)); +} + +ACLFunction acl_fc_executor::ACLWeightFormatGenerator::configureFunction(const ACLTensors& aclMemoryTensors) { + return std::make_unique(); +} + +} // namespace intel_cpu +} // namespace ov \ No newline at end of file diff --git a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_fullyconnected_utils.hpp b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_fullyconnected_utils.hpp new file mode 100644 index 00000000000000..686042f6067433 --- /dev/null +++ b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_fullyconnected_utils.hpp @@ -0,0 +1,81 @@ +// Copyright (C) 2024 Intel 
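normalizeDimsTo2D() above keeps the leading dimension and folds everything after it into one dimension via std::accumulate with std::multiplies. A compact stand-alone version of the same pattern, using std::vector instead of arm_compute::TensorShape:

#include <cstddef>
#include <functional>
#include <iostream>
#include <numeric>
#include <vector>

std::vector<size_t> normalizeTo2D(const std::vector<size_t>& shape) {
    if (shape.size() <= 2)
        return shape;
    const size_t folded =
        std::accumulate(shape.begin() + 1, shape.end(), static_cast<size_t>(1), std::multiplies<size_t>());
    return {shape[0], folded};
}

int main() {
    std::vector<size_t> shape = {8, 16, 32};  // e.g. [batch, tokens, channels]
    auto flat = normalizeTo2D(shape);         // -> {8, 512}
    std::cout << flat[0] << " x " << flat[1] << '\n';
}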
Corporation +// SPDX-License-Identifier: Apache-2.0 +// +#pragma once +#include "acl_common_executor.hpp" +#include "nodes/executors/fullyconnected_config.hpp" +#include "ov_optional.hpp" + +namespace ov { +namespace intel_cpu { + +struct ACLFCAttrs { + ov::element::Type inputPrecision; + bool isConvertedWeights = false; + bool isWeightsRepacked = false; + bool weightsNonTransposed; +}; + +namespace acl_fc_executor { + +VectorDims makeDummyInputDims(const Shape& inShape, const Shape& wShape); + +VectorDims makeDummyOutputDims(const VectorDims& inShape, const VectorDims& wShape, const size_t out_rank); + +DnnlMemoryDescPtr makeTransposedWeightDescriptor(const DnnlMemoryDescPtr srcDesc, const DnnlMemoryDescPtr dstDesc); + +ov::optional convertWeightPrecision(MemoryPtr input, MemoryPtr output, ov::element::Type weightPrecision); + +ov::optional reorderDataFallback(MemoryPtr input, MemoryPtr output, ExecutorContext::CPtr context); + +MemoryPtr reorderData(DnnlMemoryDescPtr srcWeightDesc, + DnnlMemoryDescPtr dstWeightDesc, + MemoryCPtr weightsMem, + ExecutorContext::CPtr context); + +MemoryPtr reorderWeights(const MemoryArgs& memory, + const ExecutorContext::CPtr context, + ACLFCAttrs& aclfcAttrs, + DnnlMemoryDescPtr dnnlSrcDesc, + DnnlMemoryDescPtr dnnlDstDesc); + +MemoryPtr prepareWeightMemory(const MemoryArgs& memory, + const ExecutorContext::CPtr context, + const FCAttrs& attrs, + ACLFCAttrs& aclfcAttrs, + const PostOps& postOps, + arm_compute::WeightFormat& expectedWeightFormat, + arm_compute::TensorInfo& weiTensorInfo); + +arm_compute::TensorShape normalizeDimsTo2D(const arm_compute::TensorShape shape); + +void updateFCTensorsShapes(ACLShapes& aclMemoryShapes); + +class ACLWeightsConverter : public ACLCommonExecutor { +public: + ACLWeightsConverter() = default; + void updateTensorsShapes(ACLShapes& aclMemoryShapes) override {} + arm_compute::Status validateTensorsInfo(const ACLInfos& aclMemoryInfos) override; + ACLFunction configureFunction(const ACLTensors& aclMemoryTensors) override; +}; + +class ACLWeightFormatGenerator : public ACLCommonExecutor { +public: + ACLWeightFormatGenerator(const FCAttrs& attrs, const PostOps& postOps, const MemoryArgs& memory); + void updateTensorsShapes(ACLShapes& aclMemoryShapes) override; + arm_compute::Status validateTensorsInfo(const ACLInfos& aclMemoryInfos) override; + ACLFunction configureFunction(const ACLTensors& aclMemoryTensors) override; + arm_compute::WeightFormat getOptImplWeightFormat() { + return expectedWeightFormat; + } + +private: + arm_compute::FullyConnectedLayerInfo fullyConnectedLayerInfo; + arm_compute::WeightsInfo weightsInfo; + ACLFCAttrs aclfcAttrs; + arm_compute::WeightFormat expectedWeightFormat; +}; + +} // namespace acl_fc_executor +} // namespace intel_cpu +} // namespace ov \ No newline at end of file diff --git a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_ie_scheduler.cpp b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_ie_scheduler.cpp index b6fa129974107f..2af982024b5637 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_ie_scheduler.cpp +++ b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_ie_scheduler.cpp @@ -22,23 +22,23 @@ unsigned int ACLScheduler::num_threads() const { void ACLScheduler::set_num_threads(unsigned int num_threads) {} -void ACLScheduler::schedule_custom(ICPPKernel *kernel, const Hints &hints, const Window &window, ITensorPack &tensors) { - const Window & max_window = window; +void ACLScheduler::schedule_custom(ICPPKernel* kernel, const Hints& hints, const Window& window, 
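The header above declares makeTransposedWeightDescriptor(), which only rewrites the memory descriptor when the weights are stored in the opposite orientation from what the kernel expects. A toy illustration of that descriptor-only transpose (no data movement), with Desc2D as a hypothetical stand-in for the real descriptor type:

#include <cstddef>
#include <iostream>

struct Desc2D {
    size_t rows, cols;
};

Desc2D makeTransposedDesc(const Desc2D& d) {
    return {d.cols, d.rows};  // swap logical dimensions, the underlying data stays in place
}

int main() {
    Desc2D weights{1024, 512};                       // stored as K x N
    Desc2D asExpected = makeTransposedDesc(weights); // described as N x K
    std::cout << asExpected.rows << " x " << asExpected.cols << '\n';  // 512 x 1024
}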
ITensorPack& tensors) { + const Window& max_window = window; const unsigned int num_iterations = max_window.num_iterations(hints.split_dimension()); #if OV_THREAD == OV_THREAD_OMP - //In OpenMP case parallel_get_num_threads() method returns 1 here because it's called outside parallel section - //This is the reason why this method isn't used to initialize _num_threads + // In OpenMP case parallel_get_num_threads() method returns 1 here because it's called outside parallel section + // This is the reason why this method isn't used to initialize _num_threads const auto _num_threads = num_iterations; #else const auto _num_threads = std::min(num_iterations, static_cast(parallel_get_num_threads())); #endif - std::function main_run; + std::function main_run; if (tensors.empty()) { - main_run = [&](const Window &window, const ThreadInfo &info) { + main_run = [&](const Window& window, const ThreadInfo& info) { kernel->run(window, info); }; } else { - main_run = [&](const Window &window, const ThreadInfo &info) { + main_run = [&](const Window& window, const ThreadInfo& info) { kernel->run_op(tensors, window, info); }; } @@ -59,20 +59,20 @@ void ACLScheduler::schedule_custom(ICPPKernel *kernel, const Hints &hints, const } } -void ACLScheduler::schedule(ICPPKernel *kernel, const Hints &hints) { +void ACLScheduler::schedule(ICPPKernel* kernel, const Hints& hints) { ITensorPack tensors; schedule_custom(kernel, hints, kernel->window(), tensors); } -void ACLScheduler::schedule_op(ICPPKernel *kernel, const Hints &hints, const Window &window, ITensorPack &tensors) { +void ACLScheduler::schedule_op(ICPPKernel* kernel, const Hints& hints, const Window& window, ITensorPack& tensors) { schedule_custom(kernel, hints, window, tensors); } -void ACLScheduler::run_workloads(std::vector &workloads) { +void ACLScheduler::run_workloads(std::vector& workloads) { ov::parallel_for(workloads.size(), [&](int wid) { workloads[wid]({wid, static_cast(parallel_get_num_threads()), &cpu_info()}); }); } -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_ie_scheduler.hpp b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_ie_scheduler.hpp index 1148f4ad5edd69..c94f0aa3abce3a 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_ie_scheduler.hpp +++ b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_ie_scheduler.hpp @@ -4,9 +4,10 @@ #pragma once -#include #include #include +#include + #include "support/Mutex.h" namespace ov { @@ -20,12 +21,14 @@ class ACLScheduler final : public IScheduler { ~ACLScheduler() override = default; std::uint32_t num_threads() const override; void set_num_threads(unsigned int num_threads) override; - void schedule(ICPPKernel *kernel, const Hints &hints) override; - void schedule_op(ICPPKernel *kernel, const Hints &hints, const Window &window, ITensorPack &tensors) override; + void schedule(ICPPKernel* kernel, const Hints& hints) override; + void schedule_op(ICPPKernel* kernel, const Hints& hints, const Window& window, ITensorPack& tensors) override; + protected: - void run_workloads(std::vector &workloads) override; + void run_workloads(std::vector& workloads) override; + private: - void schedule_custom(ICPPKernel *kernel, const Hints &hints, const Window &window, ITensorPack &tensors); + void schedule_custom(ICPPKernel* kernel, const Hints& hints, const Window& window, ITensorPack& tensors); }; } // namespace intel_cpu } // namespace ov diff --git 
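ACLScheduler::run_workloads() above fans each workload out through ov::parallel_for, handing every callable its worker id and the total worker count. A rough sketch of that contract, with ov::parallel_for replaced by plain std::thread and ThreadInfoLite standing in for ACL's ThreadInfo so the example compiles on its own:

#include <cstdio>
#include <functional>
#include <thread>
#include <vector>

struct ThreadInfoLite {
    int thread_id;
    int num_threads;
};

using Workload = std::function<void(const ThreadInfoLite&)>;

void runWorkloads(std::vector<Workload>& workloads) {
    const int total = static_cast<int>(workloads.size());
    std::vector<std::thread> pool;
    pool.reserve(total);
    for (int wid = 0; wid < total; ++wid)
        pool.emplace_back([&, wid] { workloads[wid]({wid, total}); });
    for (auto& t : pool)
        t.join();
}

int main() {
    std::vector<Workload> work(4, [](const ThreadInfoLite& info) {
        std::printf("workload %d of %d\n", info.thread_id, info.num_threads);
    });
    runWorkloads(work);
}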
a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_interpolate.cpp b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_interpolate.cpp index 33bd49e2f04d9b..077759193d1c30 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_interpolate.cpp +++ b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_interpolate.cpp @@ -3,13 +3,14 @@ // #include "acl_interpolate.hpp" + #include "acl_utils.hpp" #include "utils/debug_capabilities.h" -bool ov::intel_cpu::ACLInterpolateExecutor::init(const InterpolateAttrs &interpolateAttrs, - const std::vector &srcDescs, - const std::vector &dstDescs, - const dnnl::primitive_attr &attr) { +bool ov::intel_cpu::ACLInterpolateExecutor::init(const InterpolateAttrs& interpolateAttrs, + const std::vector& srcDescs, + const std::vector& dstDescs, + const dnnl::primitive_attr& attr) { aclInterpolateAttrs = interpolateAttrs; InterpolateExecutor::init(aclInterpolateAttrs, srcDescs, dstDescs, attr); acl_coord = arm_compute::SamplingPolicy::TOP_LEFT; @@ -17,22 +18,23 @@ bool ov::intel_cpu::ACLInterpolateExecutor::init(const InterpolateAttrs &interpo static const size_t index_h = 2; static const size_t index_w = 3; - if ((aclInterpolateAttrs.coordTransMode == InterpolateCoordTransMode::pytorch_half_pixel && out_shape[index_h] > 1 && out_shape[index_w] > 1) || + if ((aclInterpolateAttrs.coordTransMode == InterpolateCoordTransMode::pytorch_half_pixel && + out_shape[index_h] > 1 && out_shape[index_w] > 1) || aclInterpolateAttrs.coordTransMode == InterpolateCoordTransMode::half_pixel) { acl_coord = arm_compute::SamplingPolicy::CENTER; } switch (aclInterpolateAttrs.mode) { - case InterpolateMode::linear: - case InterpolateMode::linear_onnx: - acl_policy = arm_compute::InterpolationPolicy::BILINEAR; - break; - case InterpolateMode::nearest: - acl_policy = arm_compute::InterpolationPolicy::NEAREST_NEIGHBOR; - break; - default: - DEBUG_LOG("Unsupported interpolate mode: ", static_cast(aclInterpolateAttrs.mode)); - return false; + case InterpolateMode::linear: + case InterpolateMode::linear_onnx: + acl_policy = arm_compute::InterpolationPolicy::BILINEAR; + break; + case InterpolateMode::nearest: + acl_policy = arm_compute::InterpolationPolicy::NEAREST_NEIGHBOR; + break; + default: + DEBUG_LOG("Unsupported interpolate mode: ", static_cast(aclInterpolateAttrs.mode)); + return false; } auto srcDims = shapeCast(srcDescs[0]->getShape().getDims()); @@ -42,22 +44,25 @@ bool ov::intel_cpu::ACLInterpolateExecutor::init(const InterpolateAttrs &interpo changeLayoutToNH_C({&srcDims, &dstDims}); } - auto srcTensorInfo = arm_compute::TensorInfo(srcDims, 1, + auto srcTensorInfo = arm_compute::TensorInfo(srcDims, + 1, precisionToAclDataType(srcDescs[0]->getPrecision()), getAclDataLayoutByMemoryDesc(srcDescs[0])); - auto dstTensorInfo = arm_compute::TensorInfo(dstDims, 1, + auto dstTensorInfo = arm_compute::TensorInfo(dstDims, + 1, precisionToAclDataType(dstDescs[0]->getPrecision()), getAclDataLayoutByMemoryDesc(dstDescs[0])); - arm_compute::Status status = arm_compute::NEScale::validate(&srcTensorInfo, - &dstTensorInfo, - arm_compute::ScaleKernelInfo(acl_policy, - arm_compute::BorderMode::REPLICATE, - arm_compute::PixelValue(), - acl_coord, - false, - aclInterpolateAttrs.coordTransMode == InterpolateCoordTransMode::align_corners, - getAclDataLayoutByMemoryDesc(srcDescs[0]))); + arm_compute::Status status = arm_compute::NEScale::validate( + &srcTensorInfo, + &dstTensorInfo, + arm_compute::ScaleKernelInfo(acl_policy, + arm_compute::BorderMode::REPLICATE, + arm_compute::PixelValue(), + 
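The interpolate init() above picks CENTER sampling for half_pixel, and for pytorch_half_pixel only when both output spatial dimensions exceed 1; everything else stays on TOP_LEFT. A stand-alone restatement of that rule (the enums are local stand-ins, not the ACL types):

#include <cstddef>
#include <iostream>

enum class CoordMode { half_pixel, pytorch_half_pixel, asymmetric, align_corners };
enum class Sampling { TOP_LEFT, CENTER };

Sampling pickSamplingPolicy(CoordMode mode, size_t out_h, size_t out_w) {
    if (mode == CoordMode::half_pixel ||
        (mode == CoordMode::pytorch_half_pixel && out_h > 1 && out_w > 1))
        return Sampling::CENTER;
    return Sampling::TOP_LEFT;
}

int main() {
    std::cout << (pickSamplingPolicy(CoordMode::pytorch_half_pixel, 1, 10) == Sampling::TOP_LEFT) << '\n';  // 1
    std::cout << (pickSamplingPolicy(CoordMode::half_pixel, 1, 10) == Sampling::CENTER) << '\n';            // 1
}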
acl_coord, + false, + aclInterpolateAttrs.coordTransMode == InterpolateCoordTransMode::align_corners, + getAclDataLayoutByMemoryDesc(srcDescs[0]))); if (!status) { DEBUG_LOG("NEScale validation failed: ", status.error_description()); return false; @@ -68,21 +73,25 @@ bool ov::intel_cpu::ACLInterpolateExecutor::init(const InterpolateAttrs &interpo acl_scale = std::make_unique(); configureThreadSafe([&] { - acl_scale->configure(&srcTensor, &dstTensor, arm_compute::ScaleKernelInfo(acl_policy, - arm_compute::BorderMode::REPLICATE, - arm_compute::PixelValue(), - acl_coord, - false, - aclInterpolateAttrs.coordTransMode == - InterpolateCoordTransMode::align_corners, - getAclDataLayoutByMemoryDesc(srcDescs[0]))); + acl_scale->configure( + &srcTensor, + &dstTensor, + arm_compute::ScaleKernelInfo(acl_policy, + arm_compute::BorderMode::REPLICATE, + arm_compute::PixelValue(), + acl_coord, + false, + aclInterpolateAttrs.coordTransMode == InterpolateCoordTransMode::align_corners, + getAclDataLayoutByMemoryDesc(srcDescs[0]))); }); return true; } -void ov::intel_cpu::ACLInterpolateExecutor::exec(const std::vector& src, const std::vector& dst, const void *post_ops_data_) { +void ov::intel_cpu::ACLInterpolateExecutor::exec(const std::vector& src, + const std::vector& dst, + const void* post_ops_data_) { auto in_ptr_ = padPreprocess(src, dst); - srcTensor.allocator()->import_memory(const_cast(reinterpret_cast(in_ptr_))); + srcTensor.allocator()->import_memory(const_cast(reinterpret_cast(in_ptr_))); dstTensor.allocator()->import_memory(dst[0]->getData()); acl_scale->run(); @@ -92,8 +101,9 @@ void ov::intel_cpu::ACLInterpolateExecutor::exec(const std::vector& } bool ov::intel_cpu::ACLInterpolateExecutorBuilder::isSupportedConfiguration( - const ov::intel_cpu::InterpolateAttrs &interpolateAttrs, const std::vector &srcDescs, - const std::vector &dstDescs) { + const ov::intel_cpu::InterpolateAttrs& interpolateAttrs, + const std::vector& srcDescs, + const std::vector& dstDescs) { OPENVINO_ASSERT(srcDescs[0]->getShape().getDims().size() == 4); auto& inp_shape = srcDescs[0]->getShape().getDims(); @@ -116,7 +126,8 @@ bool ov::intel_cpu::ACLInterpolateExecutorBuilder::isSupportedConfiguration( if (coord_mode == InterpolateCoordTransMode::half_pixel && (nearest_mode == InterpolateNearestMode::simple || nearest_mode == InterpolateNearestMode::round_prefer_ceil)) { - DEBUG_LOG("InterpolateCoordTransMode half_pixel is not supported for InterpolateNearestMode simple and round_prefer_ceil"); + DEBUG_LOG("InterpolateCoordTransMode half_pixel is not supported for InterpolateNearestMode simple and " + "round_prefer_ceil"); return false; } @@ -129,15 +140,17 @@ bool ov::intel_cpu::ACLInterpolateExecutorBuilder::isSupportedConfiguration( if (is_upsample) { bool int_factor = scale_h == static_cast(scale_h) && scale_w == static_cast(scale_w); if (int_factor && coord_mode != InterpolateCoordTransMode::asymmetric && - (nearest_mode == InterpolateNearestMode::round_prefer_ceil - || nearest_mode == InterpolateNearestMode::round_prefer_floor)) { - DEBUG_LOG("upsample && int_factor && !asymmetric && (round_prefer_ceil || round_prefer_floor) case is supported"); + (nearest_mode == InterpolateNearestMode::round_prefer_ceil || + nearest_mode == InterpolateNearestMode::round_prefer_floor)) { + DEBUG_LOG( + "upsample && int_factor && !asymmetric && (round_prefer_ceil || round_prefer_floor) case is supported"); return true; } } else if (scale_h < 1 && scale_w < 1) { float down_scale_h = static_cast(inp_shape[index_h]) / out_shape[index_h]; 
float down_scale_w = static_cast(inp_shape[index_w]) / out_shape[index_w]; - bool int_factor = down_scale_h == static_cast(down_scale_h) && down_scale_w == static_cast(down_scale_w); + bool int_factor = + down_scale_h == static_cast(down_scale_h) && down_scale_w == static_cast(down_scale_w); if (int_factor && coord_mode != InterpolateCoordTransMode::align_corners && nearest_mode == InterpolateNearestMode::simple) { @@ -146,29 +159,45 @@ bool ov::intel_cpu::ACLInterpolateExecutorBuilder::isSupportedConfiguration( } if (int_factor && nearest_mode == InterpolateNearestMode::round_prefer_ceil && - ((out_shape[index_h] > 1 && out_shape[index_w] > 1) || coord_mode != InterpolateCoordTransMode::half_pixel)) { - DEBUG_LOG("!upsample && int_factor && round_prefer_ceil && (out_shape > 1 || half_pixel) case is supported"); + ((out_shape[index_h] > 1 && out_shape[index_w] > 1) || + coord_mode != InterpolateCoordTransMode::half_pixel)) { + DEBUG_LOG( + "!upsample && int_factor && round_prefer_ceil && (out_shape > 1 || half_pixel) case is supported"); return true; } } - DEBUG_LOG("ACL Interpolate executor does not support such configuration: coord_mode=", static_cast(coord_mode), - " nearest_mode=", static_cast(nearest_mode), " upsample=", is_upsample, " scale_h=", scale_h, " scale_w=", scale_w); + DEBUG_LOG("ACL Interpolate executor does not support such configuration: coord_mode=", + static_cast(coord_mode), + " nearest_mode=", + static_cast(nearest_mode), + " upsample=", + is_upsample, + " scale_h=", + scale_h, + " scale_w=", + scale_w); return false; } -bool ov::intel_cpu::ACLInterpolateExecutorBuilder::isSupported(const ov::intel_cpu::InterpolateAttrs &interpolateAttrs, - const std::vector &srcDescs, - const std::vector &dstDescs) const { +bool ov::intel_cpu::ACLInterpolateExecutorBuilder::isSupported(const ov::intel_cpu::InterpolateAttrs& interpolateAttrs, + const std::vector& srcDescs, + const std::vector& dstDescs) const { if (srcDescs[0]->getShape().getDims().size() != 4u) { DEBUG_LOG("ACL Interpolate does not support src shape rank: ", srcDescs[0]->getShape().getDims().size()); return false; } auto& pads_begin = interpolateAttrs.padBegin; - auto& pads_end = interpolateAttrs.padEnd; - - if (!std::all_of(pads_begin.begin(), pads_begin.end(), [](int i){return i == 0;}) || - !std::all_of(pads_end.begin(), pads_end.end(), [](int i){return i == 0;})) { + auto& pads_end = interpolateAttrs.padEnd; + + if (!std::all_of(pads_begin.begin(), + pads_begin.end(), + [](int i) { + return i == 0; + }) || + !std::all_of(pads_end.begin(), pads_end.end(), [](int i) { + return i == 0; + })) { DEBUG_LOG("ACL Interpolate does not support padding"); return false; } @@ -180,15 +209,16 @@ bool ov::intel_cpu::ACLInterpolateExecutorBuilder::isSupported(const ov::intel_c return false; } - if (interpolateAttrs.mode == InterpolateMode::cubic || - interpolateAttrs.mode == InterpolateMode::bilinear_pillow || + if (interpolateAttrs.mode == InterpolateMode::cubic || interpolateAttrs.mode == InterpolateMode::bilinear_pillow || interpolateAttrs.mode == InterpolateMode::bicubic_pillow) { DEBUG_LOG("ACL Interpolate does not support cubic, bilinear_pillow, bicubic_pillow modes"); return false; } if (interpolateAttrs.shapeCalcMode == InterpolateShapeCalcMode::scales && - one_of(interpolateAttrs.coordTransMode, InterpolateCoordTransMode::half_pixel, InterpolateCoordTransMode::asymmetric) && + one_of(interpolateAttrs.coordTransMode, + InterpolateCoordTransMode::half_pixel, + InterpolateCoordTransMode::asymmetric) && 
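The "int_factor" tests above treat a scale as integral when it equals its truncated value, which is how the executor decides whether an up/downscale ratio is a whole number. Minimal equivalent check:

#include <iostream>

bool isIntegralScale(float scale) {
    // Intentional exact float comparison, mirroring the check in the executor.
    return scale == static_cast<float>(static_cast<int>(scale));
}

int main() {
    std::cout << isIntegralScale(2.0f) << ' ' << isIntegralScale(1.5f) << '\n';  // 1 0
}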
one_of(interpolateAttrs.mode, InterpolateMode::linear, InterpolateMode::linear_onnx)) { DEBUG_LOG("ACL Interpolate does not support scales mode with linear/linear_onnx and half_pixel/asymmetric"); return false; diff --git a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_interpolate.hpp b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_interpolate.hpp index 17cdfec5928544..c141fa132a31ff 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_interpolate.hpp +++ b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_interpolate.hpp @@ -4,9 +4,9 @@ #pragma once -#include "nodes/executors/interpolate.hpp" #include "arm_compute/runtime/NEON/functions/NEScale.h" #include "arm_compute/runtime/Tensor.h" +#include "nodes/executors/interpolate.hpp" namespace ov { namespace intel_cpu { @@ -18,9 +18,11 @@ class ACLInterpolateExecutor : public InterpolateExecutor { bool init(const InterpolateAttrs& interpolateAttrs, const std::vector& srcDescs, const std::vector& dstDescs, - const dnnl::primitive_attr &attr) override; + const dnnl::primitive_attr& attr) override; - void exec(const std::vector& src, const std::vector& dst, const void *post_ops_data_) override; + void exec(const std::vector& src, + const std::vector& dst, + const void* post_ops_data_) override; impl_desc_type getImplType() const override { return implType; @@ -44,10 +46,11 @@ class ACLInterpolateExecutorBuilder : public InterpolateExecutorBuilder { InterpolateExecutorPtr makeExecutor(const ExecutorContext::CPtr context) const override { return std::make_shared(context); } + private: static bool isSupportedConfiguration(const InterpolateAttrs& interpolateAttrs, - const std::vector& srcDescs, - const std::vector& dstDescs); + const std::vector& srcDescs, + const std::vector& dstDescs); }; -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_lowp_fullyconnected.cpp b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_lowp_fullyconnected.cpp new file mode 100644 index 00000000000000..1604c4fff2f585 --- /dev/null +++ b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_lowp_fullyconnected.cpp @@ -0,0 +1,151 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "acl_lowp_fullyconnected.hpp" + +#include "acl_fullyconnected_utils.hpp" +#include "arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h" +#include "memory_desc/cpu_memory_desc_utils.h" +#include "nodes/common/cpu_convert.h" +#include "nodes/executors/acl/acl_utils.hpp" +#include "nodes/executors/common/common_utils.hpp" +#include "nodes/executors/debug_messages.hpp" +#include "nodes/executors/executor.hpp" +#include "nodes/executors/implementation_utils.hpp" +#include "nodes/executors/memory_arguments.hpp" +#include "utils/debug_capabilities.h" + +namespace ov { +namespace intel_cpu { + +static bool checkPostOps(const PostOps& postOps) { + if (postOps.empty()) { + return true; + } + + if (postOps.size() != 1) { + return false; + } + + const auto activation = std::dynamic_pointer_cast(postOps[0]); + return checkActivationLayerInfo(convertToEltwiseAlgorithm(activation->type())); +} + +static void initFCAttrs(const FCAttrs& attrs, + ACLTensorAttrs& aclTensorAttrs, + ACLFCAttrs& aclfcAttrs, + const MemoryArgs& memory, + arm_compute::GEMMInfo& fullyConnectedLayerInfo, + const PostOps& postOps) { + aclTensorAttrs.hasLayoutTypeNHWC = memory.at(ARG_SRC)->getDescPtr()->hasLayoutType(LayoutType::nspc); + aclfcAttrs.inputPrecision = 
memory.at(ARG_SRC)->getDescPtr()->getPrecision(); + aclfcAttrs.weightsNonTransposed = attrs.weightsNonTransposed; + + if (!postOps.empty()) { + auto activation = std::dynamic_pointer_cast(postOps[0]); + fullyConnectedLayerInfo.set_activation_info( + getActivationLayerInfo(convertToEltwiseAlgorithm(activation->type()), + activation->alpha(), + activation->beta(), + activation->gamma())); + } + + if (memory.at(ARG_SRC)->getPrecision() != memory.at(ARG_WEI)->getPrecision()) { + aclfcAttrs.isConvertedWeights = true; + } +} + +ACLLowpFullyConnectedExecutor::ACLLowpFullyConnectedExecutor(const FCAttrs& attrs, + const PostOps& postOps, + const MemoryArgs& memory, + const ExecutorContext::CPtr& context) { + dequantizationScales = getDeQuantizedScales(memory); + initFCAttrs(attrs, aclTensorAttrs, aclfcAttrs, memory, gemmInfo, postOps); + packedWeights = acl_fc_executor::prepareWeightMemory(memory, + context, + attrs, + aclfcAttrs, + postOps, + expectedWeightFormat, + weiTensorInfo); +} + +bool ACLLowpFullyConnectedExecutor::supports(const FCConfig& config) { + const auto src0 = srcType(config); + const auto src1 = weiType(config); + const auto dst = dstType(config); + if ((src0 != ov::element::i8) || (src1 != ov::element::i8) || (dst != ov::element::f32)) { + return false; + } + + VERIFY(checkPostOps(config.postOps), UNSUPPORTED_TYPE_OF_POSTOPS); + VERIFY(one_of(srcRank(config), 2U, 3U, 4U), UNSUPPORTED_SRC_RANK); + VERIFY(one_of(weiRank(config), 2U, 3U, 4U), UNSUPPORTED_WEI_RANK); + return true; +} + +void ACLLowpFullyConnectedExecutor::updateTensorsShapes(ACLShapes& aclMemoryShapes) { + acl_fc_executor::updateFCTensorsShapes(aclMemoryShapes); +} + +arm_compute::Status ACLLowpFullyConnectedExecutor::validateTensorsInfo(const ACLInfos& aclMemoryInfos) { + auto& tensor_info = aclMemoryInfos[ACLArgs::ACL_SRC_0]; + if (dequantizationScales.empty()) { + tensor_info->set_quantization_info(arm_compute::QuantizationInfo(1.f)); + } else { + tensor_info->set_quantization_info(arm_compute::QuantizationInfo(dequantizationScales[0])); + } + + auto& tensor_info_weights = aclMemoryInfos[ACLArgs::ACL_WEI]; + tensor_info_weights->set_quantization_info(arm_compute::QuantizationInfo(1.f)); + + const auto matMulValid = + arm_compute::NEGEMMLowpMatrixMultiplyCore::validate(aclMemoryInfos[ACLArgs::ACL_SRC_0].get(), + aclMemoryInfos[ACLArgs::ACL_WEI].get(), + aclMemoryInfos[ACLArgs::ACL_BIAS].get(), + aclMemoryInfos[ACLArgs::ACL_DST].get(), + gemmInfo); + return matMulValid; +} + +ACLFunction ACLLowpFullyConnectedExecutor::configureFunction(const ACLTensors& aclMemoryTensors) { + auto gemm = std::make_unique(); + gemm->configure(aclMemoryTensors[ACLArgs::ACL_SRC_0].get(), + aclMemoryTensors[ACLArgs::ACL_WEI].get(), + aclMemoryTensors[ACLArgs::ACL_BIAS].get(), + aclMemoryTensors.at(ACLArgs::ACL_DST).get(), + gemmInfo); + + if (aclfcAttrs.isConvertedWeights || !aclfcAttrs.weightsNonTransposed) { + aclTensorAttrs.memoryUsageIndicator[ACLArgs::ACL_WEI] = false; + aclMemoryTensors[ACLArgs::ACL_WEI]->allocator()->import_memory(packedWeights->getData()); + } + return gemm; +} + +std::shared_ptr ACLLowpFullyConnectedExecutor::initTensorInfo( + const arm_compute::TensorShape& tensorShape, + const arm_compute::DataType& dataType, + const arm_compute::DataLayout& dataLayout) { + arm_compute::DataType result; + switch (dataType) { + case arm_compute::DataType::S8: { + result = arm_compute::DataType::QASYMM8_SIGNED; + break; + } + case arm_compute::DataType::U8: { + result = arm_compute::DataType::QASYMM8; + break; + } + default: { 
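ACLLowpFullyConnectedExecutor::supports() above only engages the low-precision path for int8 activations and weights with an fp32 destination. A small stand-alone version of that precision gate (Elem is a local stand-in for ov::element):

#include <iostream>

enum class Elem { i8, u8, f16, f32 };

bool lowpFcSupported(Elem src, Elem wei, Elem dst) {
    return src == Elem::i8 && wei == Elem::i8 && dst == Elem::f32;
}

int main() {
    std::cout << lowpFcSupported(Elem::i8, Elem::i8, Elem::f32) << '\n';   // 1
    std::cout << lowpFcSupported(Elem::f32, Elem::i8, Elem::f32) << '\n';  // 0
}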
+ result = dataType; + break; + } + } + + return ACLCommonExecutor::initTensorInfo(tensorShape, result, dataLayout); +} + +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_lowp_fullyconnected.hpp b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_lowp_fullyconnected.hpp new file mode 100644 index 00000000000000..3912328077df63 --- /dev/null +++ b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_lowp_fullyconnected.hpp @@ -0,0 +1,51 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "acl_common_executor.hpp" +#include "acl_fullyconnected_utils.hpp" +#include "nodes/executors/fullyconnected_config.hpp" + +namespace ov { +namespace intel_cpu { + +class ACLLowpFullyConnectedExecutor : public ACLCommonExecutor { +public: + ACLLowpFullyConnectedExecutor(const FCAttrs& attrs, + const PostOps& postOps, + const MemoryArgs& memory, + const ExecutorContext::CPtr& context); + + static bool supports(const FCConfig& config); + + void updateTensorsShapes(ACLShapes& aclMemoryShapes) override; + + arm_compute::Status validateTensorsInfo(const ACLInfos& aclMemoryInfos) override; + + ACLFunction configureFunction(const ACLTensors& aclMemoryTensors) override; + + impl_desc_type implType() const override { + return impl_desc_type::gemm_acl; + } + +protected: + std::shared_ptr initTensorInfo(const arm_compute::TensorShape& tensorShape, + const arm_compute::DataType& dataType, + const arm_compute::DataLayout& dataLayout) override; + +private: + arm_compute::GEMMInfo gemmInfo; + arm_compute::WeightFormat expectedWeightFormat; + arm_compute::TensorInfo weiTensorInfo; + + MemoryCPtr packedWeights; + ACLFCAttrs aclfcAttrs; + std::vector dequantizationScales; +}; + +using ACLLowpFullyConnectedExecutorPtr = std::shared_ptr; + +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_mvn.cpp b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_mvn.cpp index 6fde4bb0db5604..290cd3c9dbcce9 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_mvn.cpp +++ b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_mvn.cpp @@ -14,7 +14,7 @@ AclMVNExecutor::AclMVNExecutor(const ExecutorContext::CPtr context) : MVNExecuto bool AclMVNExecutor::init(const MVNAttrs& mvnAttrs, const std::vector& srcDescs, const std::vector& dstDescs, - const dnnl::primitive_attr &attr) { + const dnnl::primitive_attr& attr) { auto srcDims = srcDescs[0]->getShape().getStaticDims(); auto dstDims = dstDescs[0]->getShape().getStaticDims(); @@ -46,9 +46,14 @@ bool AclMVNExecutor::init(const MVNAttrs& mvnAttrs, } } - TensorInfo srcTensorInfo = TensorInfo(TensorShape(X, Y), 1, precisionToAclDataType(srcDescs[0]->getPrecision()), getAclDataLayoutByMemoryDesc(srcDescs[0])); - TensorInfo dstTensorInfo = TensorInfo(TensorShape(X, Y), 1, precisionToAclDataType(dstDescs[0]->getPrecision()), getAclDataLayoutByMemoryDesc(dstDescs[0])); - + TensorInfo srcTensorInfo = TensorInfo(TensorShape(X, Y), + 1, + precisionToAclDataType(srcDescs[0]->getPrecision()), + getAclDataLayoutByMemoryDesc(srcDescs[0])); + TensorInfo dstTensorInfo = TensorInfo(TensorShape(X, Y), + 1, + precisionToAclDataType(dstDescs[0]->getPrecision()), + getAclDataLayoutByMemoryDesc(dstDescs[0])); if (!arm_compute::NEMeanStdDevNormalizationLayer::validate(&srcTensorInfo, &dstTensorInfo, mvnAttrs.epsValue_)) return false; @@ -57,12 +62,16 @@ bool AclMVNExecutor::init(const MVNAttrs& mvnAttrs, 
dstTensor.allocator()->init(dstTensorInfo); mvn = std::make_unique(); - configureThreadSafe([&] { mvn->configure(&srcTensor, &dstTensor, mvnAttrs.epsValue_); }); + configureThreadSafe([&] { + mvn->configure(&srcTensor, &dstTensor, mvnAttrs.epsValue_); + }); return true; } -void AclMVNExecutor::exec(const std::vector& src, const std::vector& dst, const void *post_ops_data_) { +void AclMVNExecutor::exec(const std::vector& src, + const std::vector& dst, + const void* post_ops_data_) { srcTensor.allocator()->import_memory(src[0]->getData()); dstTensor.allocator()->import_memory(dst[0]->getData()); @@ -75,41 +84,41 @@ void AclMVNExecutor::exec(const std::vector& src, const std::vector< bool AclMVNExecutorBuilder::isSupported(const MVNAttrs& mvnAttrs, const std::vector& srcDescs, const std::vector& dstDescs) const { - if ((srcDescs[0]->getPrecision() != ov::element::f32 && - srcDescs[0]->getPrecision() != ov::element::f16) || - srcDescs[0]->getPrecision() != dstDescs[0]->getPrecision()) { - DEBUG_LOG("NEMeanStdDevNormalizationLayer does not support precisions:", - " src[0]=", srcDescs[0]->getPrecision(), - " dst[0]=", dstDescs[0]->getPrecision()); - return false; - } - - if (!(srcDescs[0]->hasLayoutType(LayoutType::ncsp) && - dstDescs[0]->hasLayoutType(LayoutType::ncsp)) && - !(srcDescs[0]->hasLayoutType(LayoutType::nspc) && - dstDescs[0]->hasLayoutType(LayoutType::nspc))) { - DEBUG_LOG("NEMeanStdDevNormalizationLayer does not support layout:", - " src: ", srcDescs[0]->serializeFormat(), - " dst: ", dstDescs[0]->serializeFormat()); - return false; - } + if ((srcDescs[0]->getPrecision() != ov::element::f32 && srcDescs[0]->getPrecision() != ov::element::f16) || + srcDescs[0]->getPrecision() != dstDescs[0]->getPrecision()) { + DEBUG_LOG("NEMeanStdDevNormalizationLayer does not support precisions:", + " src[0]=", + srcDescs[0]->getPrecision(), + " dst[0]=", + dstDescs[0]->getPrecision()); + return false; + } - if (mvnAttrs.epsMode_ == MVNEpsMode::OUTSIDE_SQRT) { - DEBUG_LOG("NEMeanStdDevNormalizationLayer does not support OUTSIDE_SQRT mode"); - return false; - } - if (!mvnAttrs.normalizeVariance_) { - DEBUG_LOG("NEMeanStdDevNormalizationLayer supports normalize_variance=true only"); - return false; - } - if (!mvnAttrs.initAcrossChannels_ && - srcDescs[0]->hasLayoutType(LayoutType::nspc)) { - DEBUG_LOG("initAcrossChannels = false is not supported by ACL for NHWC layout"); - return false; - } + if (!(srcDescs[0]->hasLayoutType(LayoutType::ncsp) && dstDescs[0]->hasLayoutType(LayoutType::ncsp)) && + !(srcDescs[0]->hasLayoutType(LayoutType::nspc) && dstDescs[0]->hasLayoutType(LayoutType::nspc))) { + DEBUG_LOG("NEMeanStdDevNormalizationLayer does not support layout:", + " src: ", + srcDescs[0]->serializeFormat(), + " dst: ", + dstDescs[0]->serializeFormat()); + return false; + } - return true; + if (mvnAttrs.epsMode_ == MVNEpsMode::OUTSIDE_SQRT) { + DEBUG_LOG("NEMeanStdDevNormalizationLayer does not support OUTSIDE_SQRT mode"); + return false; } + if (!mvnAttrs.normalizeVariance_) { + DEBUG_LOG("NEMeanStdDevNormalizationLayer supports normalize_variance=true only"); + return false; + } + if (!mvnAttrs.initAcrossChannels_ && srcDescs[0]->hasLayoutType(LayoutType::nspc)) { + DEBUG_LOG("initAcrossChannels = false is not supported by ACL for NHWC layout"); + return false; + } + + return true; +} -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_mvn.hpp 
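The MVN executor above follows the validate-then-configure flow used throughout these ACL executors: a static validate() call is made first, and the executor returns false (so another backend can take over) before configure() is ever reached with rejected parameters. A generic sketch of that flow; Status and Layer are local stubs, not ACL classes:

#include <iostream>
#include <memory>
#include <string>

struct Status {
    bool ok;
    std::string error;
    explicit operator bool() const { return ok; }
};

struct Layer {
    static Status validate(float eps) {
        return eps > 0.f ? Status{true, ""} : Status{false, "epsilon must be positive"};
    }
    void configure(float eps) { std::cout << "configured with eps=" << eps << '\n'; }
};

bool initExecutor(float eps, std::unique_ptr<Layer>& layer) {
    if (!Layer::validate(eps))  // mirrors the NE*::validate(...) checks above
        return false;           // caller falls back to a different executor
    layer = std::make_unique<Layer>();
    layer->configure(eps);
    return true;
}

int main() {
    std::unique_ptr<Layer> layer;
    std::cout << initExecutor(1e-9f, layer) << '\n';  // configured, prints 1
    std::cout << initExecutor(-1.f, layer) << '\n';   // rejected, prints 0
}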
b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_mvn.hpp index 7ba445253f8d02..02521551509366 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_mvn.hpp +++ b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_mvn.hpp @@ -5,8 +5,8 @@ #pragma once #include "acl_utils.hpp" -#include "nodes/executors/mvn.hpp" #include "arm_compute/runtime/NEON/NEFunctions.h" +#include "nodes/executors/mvn.hpp" #include "utils/debug_capabilities.h" namespace ov { @@ -19,10 +19,10 @@ class AclMVNExecutor : public MVNExecutor { bool init(const MVNAttrs& mvnAttrs, const std::vector& srcDescs, const std::vector& dstDescs, - const dnnl::primitive_attr &attr) override; + const dnnl::primitive_attr& attr) override; void exec(const std::vector& src, const std::vector& dst, - const void *post_ops_data_) override; + const void* post_ops_data_) override; impl_desc_type getImplType() const override { return implType; @@ -47,5 +47,5 @@ class AclMVNExecutorBuilder : public MVNExecutorBuilder { } }; -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_pooling.cpp b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_pooling.cpp index a49a5cea4ef26e..2e4aed30d7b33e 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_pooling.cpp +++ b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_pooling.cpp @@ -3,6 +3,7 @@ // #include "acl_pooling.hpp" + #include "acl_utils.hpp" namespace ov { @@ -22,10 +23,12 @@ bool AclPoolingExecutor::isSupported(const TensorInfo& srcTensorInfo, PoolingLayerInfo* pool_info, Pooling3dLayerInfo* pool3d_info, bool ignoreOutShapeErrors) { - unsigned int pad_left = (poolingAttrs.data_pad_begin.size() >= 2u) ? poolingAttrs.data_pad_begin[1] : poolingAttrs.data_pad_begin[0]; - unsigned int pad_right = (poolingAttrs.data_pad_end.size() >= 2u) ? poolingAttrs.data_pad_end[1] : poolingAttrs.data_pad_end[0]; - unsigned int pad_top = (poolingAttrs.data_pad_begin.size() >= 2u) ? poolingAttrs.data_pad_begin[0] : 0; - unsigned int pad_bottom = (poolingAttrs.data_pad_end.size() >= 2u) ? poolingAttrs.data_pad_end[0] : 0; + unsigned int pad_left = + (poolingAttrs.data_pad_begin.size() >= 2u) ? poolingAttrs.data_pad_begin[1] : poolingAttrs.data_pad_begin[0]; + unsigned int pad_right = + (poolingAttrs.data_pad_end.size() >= 2u) ? poolingAttrs.data_pad_end[1] : poolingAttrs.data_pad_end[0]; + unsigned int pad_top = (poolingAttrs.data_pad_begin.size() >= 2u) ? poolingAttrs.data_pad_begin[0] : 0; + unsigned int pad_bottom = (poolingAttrs.data_pad_end.size() >= 2u) ? poolingAttrs.data_pad_end[0] : 0; unsigned int kernel_w = (poolingAttrs.kernel.size() >= 2u) ? poolingAttrs.kernel[1] : poolingAttrs.kernel[0]; unsigned int kernel_h = (poolingAttrs.kernel.size() >= 2u) ? poolingAttrs.kernel[0] : 1; unsigned int stride_x = (poolingAttrs.stride.size() >= 2u) ? poolingAttrs.stride[1] : poolingAttrs.stride[0]; @@ -47,45 +50,48 @@ bool AclPoolingExecutor::isSupported(const TensorInfo& srcTensorInfo, // The combination of parameters: NCHW + CEIL gives an accuracy problem in AvgPool. // One workaround is to disable the ACL executor for these parameters. // Then OneDNN will run this case in ACL backend as reorder -> NHWC -> reorder - if (pool_type == PoolingType::AVG && - dataLayout == arm_compute::DataLayout::NCHW && + if (pool_type == PoolingType::AVG && dataLayout == arm_compute::DataLayout::NCHW && poolingAttrs.rounding == op::RoundingType::CEIL) { DEBUG_LOG("NCHW + CEIL gives an accuracy problem in ACL AvgPool. 
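AclPoolingExecutor::isSupported() above promotes 1D pooling attributes to 2D: a single padding value is treated as the width component and the height component defaults to zero. A small restatement of that normalization with a local Pads2D struct:

#include <iostream>
#include <vector>

struct Pads2D {
    unsigned left, right, top, bottom;
};

Pads2D normalizePads(const std::vector<unsigned>& begin, const std::vector<unsigned>& end) {
    Pads2D p{};
    p.left = (begin.size() >= 2) ? begin[1] : begin[0];
    p.right = (end.size() >= 2) ? end[1] : end[0];
    p.top = (begin.size() >= 2) ? begin[0] : 0;
    p.bottom = (end.size() >= 2) ? end[0] : 0;
    return p;
}

int main() {
    auto p = normalizePads({1}, {2});  // 1D case: only left/right are taken from the attributes
    std::cout << p.left << ' ' << p.right << ' ' << p.top << ' ' << p.bottom << '\n';  // 1 2 0 0
}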
ACL executor will not be created."); return false; } - DimensionRoundingType round = (poolingAttrs.rounding == op::RoundingType::CEIL) ? - DimensionRoundingType::CEIL : DimensionRoundingType::FLOOR; + DimensionRoundingType round = + (poolingAttrs.rounding == op::RoundingType::CEIL) ? DimensionRoundingType::CEIL : DimensionRoundingType::FLOOR; if (srcDimsSize == 5) { if (dstDescsSize > 1) { DEBUG_LOG("NEPooling3dLayer does not support indices"); return false; } else { - unsigned int kernel_d = poolingAttrs.kernel[2]; - unsigned int stride_z = poolingAttrs.stride[2]; + unsigned int kernel_d = poolingAttrs.kernel[2]; + unsigned int stride_z = poolingAttrs.stride[2]; unsigned int pad_front = poolingAttrs.data_pad_begin[2]; - unsigned int pad_back = poolingAttrs.data_pad_end[2]; - pool3d_info->pool_type = pool_type; + unsigned int pad_back = poolingAttrs.data_pad_end[2]; + pool3d_info->pool_type = pool_type; pool3d_info->exclude_padding = exclude_padding; - pool3d_info->pool_size = arm_compute::Size3D(kernel_w, kernel_h, kernel_d); - pool3d_info->stride = arm_compute::Size3D(stride_x, stride_y, stride_z); - pool3d_info->padding = arm_compute::Padding3D(pad_left, pad_right, pad_top, pad_bottom, pad_front, pad_back); - pool3d_info->round_type = round; - arm_compute::Status s = arm_compute::NEPooling3dLayer::validate(&srcTensorInfo, &dstTensorInfo, *pool3d_info); + pool3d_info->pool_size = arm_compute::Size3D(kernel_w, kernel_h, kernel_d); + pool3d_info->stride = arm_compute::Size3D(stride_x, stride_y, stride_z); + pool3d_info->padding = + arm_compute::Padding3D(pad_left, pad_right, pad_top, pad_bottom, pad_front, pad_back); + pool3d_info->round_type = round; + arm_compute::Status s = + arm_compute::NEPooling3dLayer::validate(&srcTensorInfo, &dstTensorInfo, *pool3d_info); if (!s) { DEBUG_LOG("NEPooling3dLayer validation failed: ", s.error_description()); return false; } } } else { - pool_info->data_layout = dataLayout; - pool_info->pool_size = arm_compute::Size2D(kernel_w, kernel_h); - pool_info->pad_stride_info = arm_compute::PadStrideInfo(stride_x, stride_y, pad_left, pad_right, pad_top, pad_bottom, round); - pool_info->pool_type = pool_type; - pool_info->exclude_padding = exclude_padding; + pool_info->data_layout = dataLayout; + pool_info->pool_size = arm_compute::Size2D(kernel_w, kernel_h); + pool_info->pad_stride_info = + arm_compute::PadStrideInfo(stride_x, stride_y, pad_left, pad_right, pad_top, pad_bottom, round); + pool_info->pool_type = pool_type; + pool_info->exclude_padding = exclude_padding; if (dstDescsSize > 1) { TensorInfo indTensorInfo = TensorInfo(shapeCast(*indDims), 1, arm_compute::DataType::U32, dataLayout); - arm_compute::Status s = arm_compute::NEPoolingLayer::validate(&srcTensorInfo, &dstTensorInfo, *pool_info, &indTensorInfo); + arm_compute::Status s = + arm_compute::NEPoolingLayer::validate(&srcTensorInfo, &dstTensorInfo, *pool_info, &indTensorInfo); if (!s) { DEBUG_LOG("NEPoolingLayer validation with indices failed: ", s.error_description()); if (ignoreOutShapeErrors && @@ -112,9 +118,9 @@ bool AclPoolingExecutor::isSupported(const TensorInfo& srcTensorInfo, } bool AclPoolingExecutor::init(const PoolingAttrs& poolingAttrs, - const std::vector& srcDescs, - const std::vector& dstDescs, - const dnnl::primitive_attr &attr) { + const std::vector& srcDescs, + const std::vector& dstDescs, + const dnnl::primitive_attr& attr) { auto srcDims = srcDescs[0]->getShape().getStaticDims(); auto dstDims = dstDescs[0]->getShape().getStaticDims(); @@ -124,10 +130,14 @@ bool 
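The workaround described in the comment above can be read as a simple gate: the ACL pooling path is skipped for the AVG + NCHW + CEIL combination so the oneDNN fallback (reorder -> NHWC -> reorder) handles it instead. Compact restatement, with local enums standing in for the real types:

#include <iostream>

enum class PoolType { MAX, AVG };
enum class Layout { NCHW, NHWC };
enum class Rounding { FLOOR, CEIL };

bool aclPoolingAllowed(PoolType type, Layout layout, Rounding rounding) {
    if (type == PoolType::AVG && layout == Layout::NCHW && rounding == Rounding::CEIL)
        return false;  // known accuracy issue: let the oneDNN path take this case
    return true;
}

int main() {
    std::cout << aclPoolingAllowed(PoolType::AVG, Layout::NCHW, Rounding::CEIL) << '\n';  // 0
    std::cout << aclPoolingAllowed(PoolType::MAX, Layout::NCHW, Rounding::CEIL) << '\n';  // 1
}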
AclPoolingExecutor::init(const PoolingAttrs& poolingAttrs, changeLayoutToNH_C({&srcShape, &dstShape}); } - TensorInfo srcTensorInfo = TensorInfo(srcShape, 1, - precisionToAclDataType(srcDescs[0]->getPrecision()), getAclDataLayoutByMemoryDesc(srcDescs[0])); - TensorInfo dstTensorInfo = TensorInfo(dstShape, 1, - precisionToAclDataType(dstDescs[0]->getPrecision()), getAclDataLayoutByMemoryDesc(dstDescs[0])); + TensorInfo srcTensorInfo = TensorInfo(srcShape, + 1, + precisionToAclDataType(srcDescs[0]->getPrecision()), + getAclDataLayoutByMemoryDesc(srcDescs[0])); + TensorInfo dstTensorInfo = TensorInfo(dstShape, + 1, + precisionToAclDataType(dstDescs[0]->getPrecision()), + getAclDataLayoutByMemoryDesc(dstDescs[0])); srcTensor.allocator()->init(srcTensorInfo); dstTensor.allocator()->init(dstTensorInfo); @@ -166,7 +176,9 @@ bool AclPoolingExecutor::init(const PoolingAttrs& poolingAttrs, nullptr)) return false; auto indDims = dstDescs[1]->getShape().getStaticDims(); - TensorInfo indTensorInfo = TensorInfo(shapeCast(indDims), 1, precisionToAclDataType(dstDescs[1]->getPrecision()), + TensorInfo indTensorInfo = TensorInfo(shapeCast(indDims), + 1, + precisionToAclDataType(dstDescs[1]->getPrecision()), getAclDataLayoutByMemoryDesc(dstDescs[1])); indTensor.allocator()->init(indTensorInfo); exec_func = [this, pool_info]() -> std::unique_ptr { @@ -192,21 +204,27 @@ bool AclPoolingExecutor::init(const PoolingAttrs& poolingAttrs, }; } } - configureThreadSafe([&] { ifunc = exec_func(); }); + configureThreadSafe([&] { + ifunc = exec_func(); + }); return true; } -void AclPoolingExecutor::exec(const std::vector& src, const std::vector& dst, std::unordered_map postOpsArgs) { +void AclPoolingExecutor::exec(const std::vector& src, + const std::vector& dst, + std::unordered_map postOpsArgs) { srcTensor.allocator()->import_memory(src[0]->getData()); dstTensor.allocator()->import_memory(dst[0]->getData()); - if (dst.size() > 1u) indTensor.allocator()->import_memory(dst[1]->getData()); + if (dst.size() > 1u) + indTensor.allocator()->import_memory(dst[1]->getData()); ifunc->run(); srcTensor.allocator()->free(); dstTensor.allocator()->free(); - if (dst.size() > 1u) indTensor.allocator()->free(); + if (dst.size() > 1u) + indTensor.allocator()->free(); } -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_pooling.hpp b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_pooling.hpp index 9f6b1bb0fcc668..75b3d28eecf4aa 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_pooling.hpp +++ b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_pooling.hpp @@ -4,8 +4,8 @@ #pragma once -#include "nodes/executors/pooling.hpp" #include "arm_compute/runtime/NEON/NEFunctions.h" +#include "nodes/executors/pooling.hpp" #include "utils/debug_capabilities.h" namespace ov { @@ -18,7 +18,7 @@ class AclPoolingExecutor : public PoolingExecutor { bool init(const PoolingAttrs& poolingAttrs, const std::vector& srcDescs, const std::vector& dstDescs, - const dnnl::primitive_attr &attr) override; + const dnnl::primitive_attr& attr) override; void exec(const std::vector& src, const std::vector& dst, std::unordered_map postOpsArgs) override; @@ -54,70 +54,72 @@ class AclPoolingExecutorBuilder : public PoolingExecutorBuilder { bool isSupported(const PoolingAttrs& poolingAttrs, const std::vector& srcDescs, const std::vector& dstDescs) const override { - if ((srcDescs[0]->getPrecision() != ov::element::f32 && - dstDescs[0]->getPrecision() != 
ov::element::f32) && - (srcDescs[0]->getPrecision() != ov::element::f16 && - dstDescs[0]->getPrecision() != ov::element::f16)) { + if ((srcDescs[0]->getPrecision() != ov::element::f32 && dstDescs[0]->getPrecision() != ov::element::f32) && + (srcDescs[0]->getPrecision() != ov::element::f16 && dstDescs[0]->getPrecision() != ov::element::f16)) { DEBUG_LOG("AclPoolingExecutor does not support precisions:", - " src[0]=", srcDescs[0]->getPrecision(), - " dst[0]=", dstDescs[0]->getPrecision()); + " src[0]=", + srcDescs[0]->getPrecision(), + " dst[0]=", + dstDescs[0]->getPrecision()); return false; } if (srcDescs.size() == 2u && - (srcDescs[1]->getPrecision() != ov::element::f32 && - srcDescs[0]->getPrecision() != ov::element::f32 && + (srcDescs[1]->getPrecision() != ov::element::f32 && srcDescs[0]->getPrecision() != ov::element::f32 && dstDescs[0]->getPrecision() != ov::element::f32) && - (srcDescs[1]->getPrecision() != ov::element::f16 && - srcDescs[0]->getPrecision() != ov::element::f16 && + (srcDescs[1]->getPrecision() != ov::element::f16 && srcDescs[0]->getPrecision() != ov::element::f16 && dstDescs[0]->getPrecision() != ov::element::f16)) { DEBUG_LOG("AclPoolingExecutor does not support precisions:", - " src[0]=", srcDescs[0]->getPrecision(), - " src[1]=", srcDescs[1]->getPrecision(), - " dst[0]=", dstDescs[0]->getPrecision()); + " src[0]=", + srcDescs[0]->getPrecision(), + " src[1]=", + srcDescs[1]->getPrecision(), + " dst[0]=", + dstDescs[0]->getPrecision()); return false; } - if (dstDescs.size() == 2u && - dstDescs[1]->getPrecision() != ov::element::u32) { + if (dstDescs.size() == 2u && dstDescs[1]->getPrecision() != ov::element::u32) { DEBUG_LOG("AclPoolingExecutor supports U32 as indices precisions only. ", - "Passed indices precision: ", dstDescs[1]->getPrecision()); - return false; - } + "Passed indices precision: ", + dstDescs[1]->getPrecision()); + return false; + } if (srcDescs[0]->getShape().getRank() < 5) { - if (!(srcDescs[0]->hasLayoutType(LayoutType::ncsp) && - dstDescs[0]->hasLayoutType(LayoutType::ncsp)) && - !(srcDescs[0]->hasLayoutType(LayoutType::nspc) && - dstDescs[0]->hasLayoutType(LayoutType::nspc))) { - DEBUG_LOG("NEPoolingLayer does not support layouts:", - " src=", srcDescs[0]->serializeFormat(), - " dst=", dstDescs[0]->serializeFormat()); - return false; - } + if (!(srcDescs[0]->hasLayoutType(LayoutType::ncsp) && dstDescs[0]->hasLayoutType(LayoutType::ncsp)) && + !(srcDescs[0]->hasLayoutType(LayoutType::nspc) && dstDescs[0]->hasLayoutType(LayoutType::nspc))) { + DEBUG_LOG("NEPoolingLayer does not support layouts:", + " src=", + srcDescs[0]->serializeFormat(), + " dst=", + dstDescs[0]->serializeFormat()); + return false; + } if (srcDescs.size() == 2u && - !(srcDescs[0]->hasLayoutType(LayoutType::ncsp) && - srcDescs[1]->hasLayoutType(LayoutType::ncsp) && - dstDescs[0]->hasLayoutType(LayoutType::ncsp)) && - !(srcDescs[0]->hasLayoutType(LayoutType::nspc) && - srcDescs[1]->hasLayoutType(LayoutType::nspc) && - dstDescs[0]->hasLayoutType(LayoutType::nspc))) { - DEBUG_LOG("NEPoolingLayer does not support layouts:", - " src[0]=", srcDescs[0]->serializeFormat(), - " src[1]=", srcDescs[1]->serializeFormat(), - " dst=", dstDescs[0]->serializeFormat()); - return false; - } + !(srcDescs[0]->hasLayoutType(LayoutType::ncsp) && srcDescs[1]->hasLayoutType(LayoutType::ncsp) && + dstDescs[0]->hasLayoutType(LayoutType::ncsp)) && + !(srcDescs[0]->hasLayoutType(LayoutType::nspc) && srcDescs[1]->hasLayoutType(LayoutType::nspc) && + dstDescs[0]->hasLayoutType(LayoutType::nspc))) { + 
DEBUG_LOG("NEPoolingLayer does not support layouts:", + " src[0]=", + srcDescs[0]->serializeFormat(), + " src[1]=", + srcDescs[1]->serializeFormat(), + " dst=", + dstDescs[0]->serializeFormat()); + return false; + } } else { - if (!(srcDescs[0]->hasLayoutType(LayoutType::nspc) && - dstDescs[0]->hasLayoutType(LayoutType::nspc)) && - !(srcDescs[0]->hasLayoutType(LayoutType::nspc) && - dstDescs[0]->hasLayoutType(LayoutType::nspc))) { - DEBUG_LOG("Pooling3dLayer does not support layouts:", - " src=", srcDescs[0]->serializeFormat(), - " dst=", dstDescs[0]->serializeFormat()); - return false; - } + if (!(srcDescs[0]->hasLayoutType(LayoutType::nspc) && dstDescs[0]->hasLayoutType(LayoutType::nspc)) && + !(srcDescs[0]->hasLayoutType(LayoutType::nspc) && dstDescs[0]->hasLayoutType(LayoutType::nspc))) { + DEBUG_LOG("Pooling3dLayer does not support layouts:", + " src=", + srcDescs[0]->serializeFormat(), + " dst=", + dstDescs[0]->serializeFormat()); + return false; + } } return true; @@ -128,5 +130,5 @@ class AclPoolingExecutorBuilder : public PoolingExecutorBuilder { } }; -} // namespace intel_cpu -} // namespace ov \ No newline at end of file +} // namespace intel_cpu +} // namespace ov \ No newline at end of file diff --git a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_reduce.cpp b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_reduce.cpp index e99747121cb623..5973027a0376cb 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_reduce.cpp +++ b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_reduce.cpp @@ -11,28 +11,31 @@ using namespace arm_compute; static arm_compute::ReductionOperation getAclReductionOperationByAlgorithm(Algorithm algorithm) { switch (algorithm) { - case Algorithm::ReduceMax: return arm_compute::ReductionOperation::MAX; - case Algorithm::ReduceMin: return arm_compute::ReductionOperation::MIN; - case Algorithm::ReduceSum: return arm_compute::ReductionOperation::SUM; - case Algorithm::ReduceProd: return arm_compute::ReductionOperation::PROD; - default: OPENVINO_THROW("Unsupported reduction operation: ", static_cast(algorithm)); + case Algorithm::ReduceMax: + return arm_compute::ReductionOperation::MAX; + case Algorithm::ReduceMin: + return arm_compute::ReductionOperation::MIN; + case Algorithm::ReduceSum: + return arm_compute::ReductionOperation::SUM; + case Algorithm::ReduceProd: + return arm_compute::ReductionOperation::PROD; + default: + OPENVINO_THROW("Unsupported reduction operation: ", static_cast(algorithm)); } } AclReduceExecutor::AclReduceExecutor(const ExecutorContext::CPtr context) : ReduceExecutor(context) {} bool AclReduceExecutor::init(const ReduceAttrs& reduceAttrs, - const std::vector& srcDescs, - const std::vector& dstDescs, - const dnnl::primitive_attr &attr) { - if (reduceAttrs.operation != Algorithm::ReduceMax && - reduceAttrs.operation != Algorithm::ReduceMin && - reduceAttrs.operation != Algorithm::ReduceSum && - reduceAttrs.operation != Algorithm::ReduceProd && + const std::vector& srcDescs, + const std::vector& dstDescs, + const dnnl::primitive_attr& attr) { + if (reduceAttrs.operation != Algorithm::ReduceMax && reduceAttrs.operation != Algorithm::ReduceMin && + reduceAttrs.operation != Algorithm::ReduceSum && reduceAttrs.operation != Algorithm::ReduceProd && reduceAttrs.operation != Algorithm::ReduceMean) { - DEBUG_LOG("Unknown reduce algorithm passed into AclReduceExecutor: ", static_cast(reduceAttrs.operation)); - return false; - } + DEBUG_LOG("Unknown reduce algorithm passed into AclReduceExecutor: ", 
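getAclReductionOperationByAlgorithm() above maps the reduce algorithm onto an ACL reduction operation and throws on anything it does not cover rather than silently defaulting. A stand-alone analogue (enum and exception types are local stand-ins):

#include <iostream>
#include <stdexcept>

enum class ReduceAlg { Max, Min, Sum, Prod, Mean };
enum class AclReduceOp { MAX, MIN, SUM, PROD };

AclReduceOp toAclReduceOp(ReduceAlg alg) {
    switch (alg) {
    case ReduceAlg::Max:
        return AclReduceOp::MAX;
    case ReduceAlg::Min:
        return AclReduceOp::MIN;
    case ReduceAlg::Sum:
        return AclReduceOp::SUM;
    case ReduceAlg::Prod:
        return AclReduceOp::PROD;
    default:
        throw std::runtime_error("Unsupported reduction operation");
    }
}

int main() {
    std::cout << static_cast<int>(toAclReduceOp(ReduceAlg::Sum)) << '\n';  // 2
    try {
        toAclReduceOp(ReduceAlg::Mean);  // Mean goes through NEReduceMean, not this mapping
    } catch (const std::exception& e) {
        std::cout << e.what() << '\n';
    }
}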
static_cast(reduceAttrs.operation)); + return false; + } this->reduceAttrs = reduceAttrs; @@ -46,10 +49,14 @@ bool AclReduceExecutor::init(const ReduceAttrs& reduceAttrs, changeLayoutToNH_C({&srcShape, &dstShape}); } - TensorInfo srcTensorInfo = TensorInfo(srcShape, 1, - precisionToAclDataType(srcDescs[0]->getPrecision()), getAclDataLayoutByMemoryDesc(srcDescs[0])); - TensorInfo dstTensorInfo = TensorInfo(dstShape, 1, - precisionToAclDataType(dstDescs[0]->getPrecision()), getAclDataLayoutByMemoryDesc(dstDescs[0])); + TensorInfo srcTensorInfo = TensorInfo(srcShape, + 1, + precisionToAclDataType(srcDescs[0]->getPrecision()), + getAclDataLayoutByMemoryDesc(srcDescs[0])); + TensorInfo dstTensorInfo = TensorInfo(dstShape, + 1, + precisionToAclDataType(dstDescs[0]->getPrecision()), + getAclDataLayoutByMemoryDesc(dstDescs[0])); srcTensor.allocator()->init(srcTensorInfo); dstTensor.allocator()->init(dstTensorInfo); @@ -57,54 +64,69 @@ bool AclReduceExecutor::init(const ReduceAttrs& reduceAttrs, std::function(void)> exec_func; std::vector castedAxes; for (size_t i = 0; i < reduceAttrs.axes.size(); ++i) { - int axis = axisCast(reduceAttrs.axes[i], srcDims.size(), hasSrcNspcLayout ? NHWC_TO_NCHW : NO_LAYOUT_CONVERSION); - if (hasSrcNspcLayout && axis == -1) return false; + int axis = + axisCast(reduceAttrs.axes[i], srcDims.size(), hasSrcNspcLayout ? NHWC_TO_NCHW : NO_LAYOUT_CONVERSION); + if (hasSrcNspcLayout && axis == -1) + return false; castedAxes.push_back(axis); } switch (reduceAttrs.operation) { - case Algorithm::ReduceMean: { - for (size_t i = 0; i < reduceAttrs.axes.size(); ++i) { - auto pos = axisCast(i, reduceAttrs.axes.size()); - axesMean.set(pos, castedAxes[i]); - } - Status reduceMeanStatus = NEReduceMean::validate(&srcTensorInfo, axesMean, reduceAttrs.keepDims, &dstTensorInfo); - if (!reduceMeanStatus) { - DEBUG_LOG("NEReduceMean validation failed: ", reduceMeanStatus.error_description()); - return false; - } - exec_func = [this]() -> std::unique_ptr { - auto acl_op = std::make_unique(); - acl_op->configure(&srcTensor, axesMean, this->reduceAttrs.keepDims, &dstTensor); - return acl_op; - }; - break; + case Algorithm::ReduceMean: { + for (size_t i = 0; i < reduceAttrs.axes.size(); ++i) { + auto pos = axisCast(i, reduceAttrs.axes.size()); + axesMean.set(pos, castedAxes[i]); } - case Algorithm::ReduceMax: - case Algorithm::ReduceMin: - case Algorithm::ReduceSum: - case Algorithm::ReduceProd: { - Status reductionOperationStatus = NEReductionOperation::validate(&srcTensorInfo, &dstTensorInfo, castedAxes[0], - getAclReductionOperationByAlgorithm(reduceAttrs.operation), reduceAttrs.keepDims); - if (!reductionOperationStatus) { - DEBUG_LOG("NEReductionOperation validation with indices failed: ", reductionOperationStatus.error_description()); - return false; - } - exec_func = [this, castedAxes]() -> std::unique_ptr { - auto acl_op = std::make_unique(); - acl_op->configure(&srcTensor, &dstTensor, castedAxes[0], - getAclReductionOperationByAlgorithm(this->reduceAttrs.operation), this->reduceAttrs.keepDims); - return acl_op; - }; - break; + Status reduceMeanStatus = + NEReduceMean::validate(&srcTensorInfo, axesMean, reduceAttrs.keepDims, &dstTensorInfo); + if (!reduceMeanStatus) { + DEBUG_LOG("NEReduceMean validation failed: ", reduceMeanStatus.error_description()); + return false; } - default: - OPENVINO_THROW("Unsupported operation type for ACL Reduce executor: ", static_cast(reduceAttrs.operation)); + exec_func = [this]() -> std::unique_ptr { + auto acl_op = std::make_unique(); + 
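The axisCast(..., NHWC_TO_NCHW) calls above translate axis indices between layouts and return -1 when the translation is not possible, which the executor treats as "unsupported". A hedged, direction-agnostic sketch of that idea: the same logical dimension sits at different positions in NCHW and NHWC, so an axis is translated by looking its dimension letter up in the target ordering; the layout strings and the exact failure convention here are illustrative, not the plugin's implementation.

#include <iostream>
#include <string>

int castAxis(int axis, const std::string& from, const std::string& to) {
    if (axis < 0 || axis >= static_cast<int>(from.size()) || from.size() != to.size())
        return -1;
    const auto pos = to.find(from[static_cast<size_t>(axis)]);
    return pos == std::string::npos ? -1 : static_cast<int>(pos);
}

int main() {
    std::cout << castAxis(1, "NCHW", "NHWC") << '\n';  // channel axis: 1 -> 3
    std::cout << castAxis(2, "NHWC", "NCHW") << '\n';  // width axis:   2 -> 3
}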
acl_op->configure(&srcTensor, axesMean, this->reduceAttrs.keepDims, &dstTensor); + return acl_op; + }; + break; + } + case Algorithm::ReduceMax: + case Algorithm::ReduceMin: + case Algorithm::ReduceSum: + case Algorithm::ReduceProd: { + Status reductionOperationStatus = + NEReductionOperation::validate(&srcTensorInfo, + &dstTensorInfo, + castedAxes[0], + getAclReductionOperationByAlgorithm(reduceAttrs.operation), + reduceAttrs.keepDims); + if (!reductionOperationStatus) { + DEBUG_LOG("NEReductionOperation validation with indices failed: ", + reductionOperationStatus.error_description()); + return false; + } + exec_func = [this, castedAxes]() -> std::unique_ptr { + auto acl_op = std::make_unique(); + acl_op->configure(&srcTensor, + &dstTensor, + castedAxes[0], + getAclReductionOperationByAlgorithm(this->reduceAttrs.operation), + this->reduceAttrs.keepDims); + return acl_op; + }; + break; + } + default: + OPENVINO_THROW("Unsupported operation type for ACL Reduce executor: ", static_cast(reduceAttrs.operation)); } - configureThreadSafe([&] { ifunc = exec_func(); }); + configureThreadSafe([&] { + ifunc = exec_func(); + }); return true; } -void AclReduceExecutor::exec(const std::vector& src, const std::vector& dst, const void *post_ops_data_) { +void AclReduceExecutor::exec(const std::vector& src, + const std::vector& dst, + const void* post_ops_data_) { srcTensor.allocator()->import_memory(src[0]->getData()); dstTensor.allocator()->import_memory(dst[0]->getData()); @@ -114,5 +136,5 @@ void AclReduceExecutor::exec(const std::vector& src, const std::vect dstTensor.allocator()->free(); } -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_reduce.hpp b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_reduce.hpp index 69bf6062918963..a121868bf80ba3 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_reduce.hpp +++ b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_reduce.hpp @@ -20,10 +20,10 @@ class AclReduceExecutor : public ReduceExecutor { bool init(const ReduceAttrs& reduceAttrs, const std::vector& srcDescs, const std::vector& dstDescs, - const dnnl::primitive_attr &attr) override; + const dnnl::primitive_attr& attr) override; void exec(const std::vector& src, const std::vector& dst, - const void *post_ops_data_) override; + const void* post_ops_data_) override; impl_desc_type getImplType() const override { return implType; @@ -46,33 +46,38 @@ class AclReduceExecutorBuilder : public ReduceExecutorBuilder { const std::vector& dstDescs) const override { if (reduceAttrs.operation == Algorithm::ReduceMean) { if (srcDescs[0]->getPrecision() != dstDescs[0]->getPrecision() || - (srcDescs[0]->getPrecision() != ov::element::f32 && - srcDescs[0]->getPrecision() != ov::element::f16)) { + (srcDescs[0]->getPrecision() != ov::element::f32 && srcDescs[0]->getPrecision() != ov::element::f16)) { DEBUG_LOG("NEReduceMean does not support precisions:", - " src[0]=", srcDescs[0]->getPrecision(), - " dst[0]=", dstDescs[0]->getPrecision()); + " src[0]=", + srcDescs[0]->getPrecision(), + " dst[0]=", + dstDescs[0]->getPrecision()); return false; } } else { if (srcDescs[0]->getPrecision() != dstDescs[0]->getPrecision() || - (srcDescs[0]->getPrecision() != ov::element::f32 && - srcDescs[0]->getPrecision() != ov::element::f16 && - srcDescs[0]->getPrecision() != ov::element::i32)) { + (srcDescs[0]->getPrecision() != ov::element::f32 && srcDescs[0]->getPrecision() != ov::element::f16 && + 
srcDescs[0]->getPrecision() != ov::element::i32)) { DEBUG_LOG("NEReductionOperation does not support precisions:", - " src[0]=", srcDescs[0]->getPrecision(), - " dst[0]=", dstDescs[0]->getPrecision()); + " src[0]=", + srcDescs[0]->getPrecision(), + " dst[0]=", + dstDescs[0]->getPrecision()); return false; } } if (srcDescs[0]->getShape().getRank() >= arm_compute::MAX_DIMS) { - DEBUG_LOG("ACL supports ", arm_compute::MAX_DIMS, - " dimensions maximum. src[0] shape rank is ", srcDescs[0]->getShape().getRank()); + DEBUG_LOG("ACL supports ", + arm_compute::MAX_DIMS, + " dimensions maximum. src[0] shape rank is ", + srcDescs[0]->getShape().getRank()); return false; } auto srcShapeRank = srcDescs[0]->getShape().getRank(); bool hasSrcNspcLayout = srcDescs[0]->hasLayoutType(LayoutType::nspc); for (size_t i = 0; i < reduceAttrs.axes.size(); ++i) { - int axis = axisCast(reduceAttrs.axes[i], srcShapeRank, hasSrcNspcLayout ? NHWC_TO_NCHW : NO_LAYOUT_CONVERSION); + int axis = + axisCast(reduceAttrs.axes[i], srcShapeRank, hasSrcNspcLayout ? NHWC_TO_NCHW : NO_LAYOUT_CONVERSION); if (axis == -1) { DEBUG_LOG("Layout conversion to NHWC has failed"); return false; @@ -82,14 +87,12 @@ class AclReduceExecutorBuilder : public ReduceExecutorBuilder { return false; } } - if ((reduceAttrs.operation == Algorithm::ReduceSum || - reduceAttrs.operation == Algorithm::ReduceMax || - reduceAttrs.operation == Algorithm::ReduceMin || - reduceAttrs.operation == Algorithm::ReduceProd) && - reduceAttrs.axes.size() != 1) { - DEBUG_LOG("ACL supports single axes reduce only. Number of axes: ", reduceAttrs.axes.size()); - return false; - } + if ((reduceAttrs.operation == Algorithm::ReduceSum || reduceAttrs.operation == Algorithm::ReduceMax || + reduceAttrs.operation == Algorithm::ReduceMin || reduceAttrs.operation == Algorithm::ReduceProd) && + reduceAttrs.axes.size() != 1) { + DEBUG_LOG("ACL supports single axes reduce only. 
Number of axes: ", reduceAttrs.axes.size()); + return false; + } return true; } @@ -99,5 +102,5 @@ class AclReduceExecutorBuilder : public ReduceExecutorBuilder { } }; -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_transpose.cpp b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_transpose.cpp index 801e50831b3bb1..dd16b333cb6b32 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_transpose.cpp +++ b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_transpose.cpp @@ -3,12 +3,13 @@ // #include "acl_transpose.hpp" + #include "acl_utils.hpp" -bool ov::intel_cpu::ACLTransposeExecutor::init(const ov::intel_cpu::TransposeParams &transposeParams, - const std::vector &srcDescs, - const std::vector &dstDescs, - const dnnl::primitive_attr &attr) { +bool ov::intel_cpu::ACLTransposeExecutor::init(const ov::intel_cpu::TransposeParams& transposeParams, + const std::vector& srcDescs, + const std::vector& dstDescs, + const dnnl::primitive_attr& attr) { auto inputOrder = transposeParams.permuteParams.order; if (inputOrder.empty()) { inputOrder.resize(srcDescs[0]->getShape().getRank()); @@ -24,7 +25,7 @@ bool ov::intel_cpu::ACLTransposeExecutor::init(const ov::intel_cpu::TransposePar }; auto srcDims = changeLayoutToNhwc(srcDescs[0]->getShape().getStaticDims()); auto dstDims = changeLayoutToNhwc(dstDescs[0]->getShape().getStaticDims()); - for (int i = inputOrder.size() - 1; i >= 0 ; --i) { + for (int i = inputOrder.size() - 1; i >= 0; --i) { auto it = find(srcDims.rbegin(), srcDims.rend(), dstDims[i]); int index = it - srcDims.rbegin(); vec.push_back(index); @@ -46,10 +47,12 @@ bool ov::intel_cpu::ACLTransposeExecutor::init(const ov::intel_cpu::TransposePar if (srcDescs[0]->hasLayoutType(LayoutType::nspc) && dstDescs[0]->hasLayoutType(LayoutType::nspc)) { changeLayoutToNH_C({&srcDims, &dstDims}); } - auto srcTensorInfo = arm_compute::TensorInfo(srcDims, 1, + auto srcTensorInfo = arm_compute::TensorInfo(srcDims, + 1, precisionToAclDataType(srcDescs[0]->getPrecision()), getAclDataLayoutByMemoryDesc(srcDescs[0])); - auto dstTensorInfo = arm_compute::TensorInfo(dstDims, 1, + auto dstTensorInfo = arm_compute::TensorInfo(dstDims, + 1, precisionToAclDataType(dstDescs[0]->getPrecision()), getAclDataLayoutByMemoryDesc(dstDescs[0])); arm_compute::Status status = arm_compute::NEPermute::validate(&srcTensorInfo, &dstTensorInfo, order); @@ -61,11 +64,13 @@ bool ov::intel_cpu::ACLTransposeExecutor::init(const ov::intel_cpu::TransposePar dstTensor.allocator()->init(dstTensorInfo); acl_permute = std::make_unique(); - configureThreadSafe([&] { acl_permute->configure(&srcTensor, &dstTensor, order); }); + configureThreadSafe([&] { + acl_permute->configure(&srcTensor, &dstTensor, order); + }); return true; } -void ov::intel_cpu::ACLTransposeExecutor::exec(const std::vector &src, const std::vector &dst) { +void ov::intel_cpu::ACLTransposeExecutor::exec(const std::vector& src, const std::vector& dst) { srcTensor.allocator()->import_memory(src[0]->getData()); dstTensor.allocator()->import_memory(dst[0]->getData()); diff --git a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_transpose.hpp b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_transpose.hpp index 02a190597531ea..c6765aa1ff25f0 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_transpose.hpp +++ b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_transpose.hpp @@ -4,10 +4,9 @@ #pragma once -#include "nodes/executors/transpose.hpp" - 
-#include "arm_compute/runtime/Tensor.h" #include "arm_compute/runtime/NEON/functions/NEPermute.h" +#include "arm_compute/runtime/Tensor.h" +#include "nodes/executors/transpose.hpp" #include "utils/debug_capabilities.h" namespace ov { @@ -20,9 +19,12 @@ class ACLTransposeExecutor : public TransposeExecutor { bool init(const TransposeParams& transposeParams, const std::vector& srcDescs, const std::vector& dstDescs, - const dnnl::primitive_attr &attr) override; + const dnnl::primitive_attr& attr) override; void exec(const std::vector& src, const std::vector& dst) override; - impl_desc_type implType() const override { return impl_desc_type::acl; } + impl_desc_type implType() const override { + return impl_desc_type::acl; + } + private: arm_compute::Tensor srcTensor, dstTensor; std::unique_ptr acl_permute; @@ -33,13 +35,13 @@ class ACLTransposeExecutorBuilder : public TransposeExecutorBuilder { bool isSupported(const TransposeParams& transposeParams, const std::vector& srcDescs, const std::vector& dstDescs) const override { - if (!(srcDescs[0]->hasLayoutType(LayoutType::ncsp) && - dstDescs[0]->hasLayoutType(LayoutType::ncsp)) && - !(srcDescs[0]->hasLayoutType(LayoutType::nspc) && - dstDescs[0]->hasLayoutType(LayoutType::nspc))) { + if (!(srcDescs[0]->hasLayoutType(LayoutType::ncsp) && dstDescs[0]->hasLayoutType(LayoutType::ncsp)) && + !(srcDescs[0]->hasLayoutType(LayoutType::nspc) && dstDescs[0]->hasLayoutType(LayoutType::nspc))) { DEBUG_LOG("NEPermute does not support layout:", - " src: ", srcDescs[0]->serializeFormat(), - " dst: ", dstDescs[0]->serializeFormat()); + " src: ", + srcDescs[0]->serializeFormat(), + " dst: ", + dstDescs[0]->serializeFormat()); return false; } if (srcDescs[0]->getShape().getRank() > 4) { @@ -59,5 +61,5 @@ class ACLTransposeExecutorBuilder : public TransposeExecutorBuilder { } }; -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_utils.cpp b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_utils.cpp index df57d29f4a44ec..6c3799da70bfda 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_utils.cpp +++ b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_utils.cpp @@ -3,6 +3,7 @@ // #include "acl_utils.hpp" + #include "support/Mutex.h" #include "utils/debug_capabilities.h" @@ -18,55 +19,55 @@ void configureThreadSafe(const std::function& config) { arm_compute::ActivationLayerInfo getActivationLayerInfo(Algorithm algorithm, float alpha = 0.0, - float beta = 0.0, + float beta = 0.0, float gamma = 0.0) { switch (algorithm) { - case Algorithm::EltwiseRelu: - if (alpha == 0) { - return arm_compute::ActivationLayerInfo::ActivationFunction::RELU; - } else { - return {arm_compute::ActivationLayerInfo::ActivationFunction::LEAKY_RELU, alpha}; - } - case Algorithm::EltwiseGeluErf: - return arm_compute::ActivationLayerInfo::ActivationFunction::GELU; - case Algorithm::EltwiseElu: - return {arm_compute::ActivationLayerInfo::ActivationFunction::ELU, alpha}; - case Algorithm::EltwiseTanh: - return {arm_compute::ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f}; - case Algorithm::EltwiseSigmoid: - return arm_compute::ActivationLayerInfo::ActivationFunction::LOGISTIC; - case Algorithm::EltwiseSqrt: - return arm_compute::ActivationLayerInfo::ActivationFunction::SQRT; - case Algorithm::EltwiseSoftRelu: - return arm_compute::ActivationLayerInfo::ActivationFunction::SOFT_RELU; - case Algorithm::EltwiseClamp: - return 
{arm_compute::ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, beta, alpha}; - case Algorithm::EltwiseSwish: - return {arm_compute::ActivationLayerInfo::ActivationFunction::SWISH, alpha}; - case Algorithm::EltwiseHswish: - return arm_compute::ActivationLayerInfo::ActivationFunction::HARD_SWISH; - default: - OPENVINO_THROW("Unsupported operation type for ACL Eltwise executor: ", static_cast(algorithm)); + case Algorithm::EltwiseRelu: + if (alpha == 0) { + return arm_compute::ActivationLayerInfo::ActivationFunction::RELU; + } else { + return {arm_compute::ActivationLayerInfo::ActivationFunction::LEAKY_RELU, alpha}; + } + case Algorithm::EltwiseGeluErf: + return arm_compute::ActivationLayerInfo::ActivationFunction::GELU; + case Algorithm::EltwiseElu: + return {arm_compute::ActivationLayerInfo::ActivationFunction::ELU, alpha}; + case Algorithm::EltwiseTanh: + return {arm_compute::ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f}; + case Algorithm::EltwiseSigmoid: + return arm_compute::ActivationLayerInfo::ActivationFunction::LOGISTIC; + case Algorithm::EltwiseSqrt: + return arm_compute::ActivationLayerInfo::ActivationFunction::SQRT; + case Algorithm::EltwiseSoftRelu: + return arm_compute::ActivationLayerInfo::ActivationFunction::SOFT_RELU; + case Algorithm::EltwiseClamp: + return {arm_compute::ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, beta, alpha}; + case Algorithm::EltwiseSwish: + return {arm_compute::ActivationLayerInfo::ActivationFunction::SWISH, alpha}; + case Algorithm::EltwiseHswish: + return arm_compute::ActivationLayerInfo::ActivationFunction::HARD_SWISH; + default: + OPENVINO_THROW("Unsupported operation type for ACL Eltwise executor: ", static_cast(algorithm)); } } bool checkActivationLayerInfo(Algorithm algorithm) { switch (algorithm) { - case Algorithm::EltwiseRelu: - case Algorithm::EltwiseGeluErf: - case Algorithm::EltwiseElu: - case Algorithm::EltwiseTanh: - case Algorithm::EltwiseSigmoid: - case Algorithm::EltwiseSqrt: - case Algorithm::EltwiseSoftRelu: - case Algorithm::EltwiseClamp: - case Algorithm::EltwiseSwish: - case Algorithm::EltwiseHswish: - return true; - default: - return false; + case Algorithm::EltwiseRelu: + case Algorithm::EltwiseGeluErf: + case Algorithm::EltwiseElu: + case Algorithm::EltwiseTanh: + case Algorithm::EltwiseSigmoid: + case Algorithm::EltwiseSqrt: + case Algorithm::EltwiseSoftRelu: + case Algorithm::EltwiseClamp: + case Algorithm::EltwiseSwish: + case Algorithm::EltwiseHswish: + return true; + default: + return false; } } -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_utils.hpp b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_utils.hpp index a3d151192e601b..1d30736353b878 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_utils.hpp +++ b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_utils.hpp @@ -3,19 +3,19 @@ // #pragma once -#include "memory_desc/cpu_memory_desc.h" #include "arm_compute/core/Types.h" #include "cpu_types.h" +#include "memory_desc/cpu_memory_desc.h" namespace ov { namespace intel_cpu { /** -* @brief ACL supports arm_compute::MAX_DIMS maximum. The method squashes the last -* dimensions in order to comply with this limitation -* @param dims vector of dimensions to squash -* @return vector of dimensions that complies to ACL -*/ + * @brief ACL supports arm_compute::MAX_DIMS maximum. 
The method squashes the last + * dimensions in order to comply with this limitation + * @param dims vector of dimensions to squash + * @return vector of dimensions that complies to ACL + */ inline VectorDims collapse_dims_to_max_rank(VectorDims dims, size_t max_num_shape = arm_compute::MAX_DIMS) { VectorDims result_dims(max_num_shape - 1); if (dims.size() >= max_num_shape) { @@ -32,17 +32,23 @@ inline VectorDims collapse_dims_to_max_rank(VectorDims dims, size_t max_num_shap } /** -* @brief ACL handles NH_C specifically, it thinks it is NC_W, so we need to change layout manually: -* e.g. NCHW (0, 1, 2, 3) -> NHWC (0, 2, 3, 1) -* @param _listDims list of dimensions to convert -* @return none -*/ - -inline void changeLayoutToNH_C(const std::vector &_listDims) { - auto mover = [](arm_compute::TensorShape &_shape) { - if (_shape.num_dimensions() > 4) { std::swap(_shape[2], _shape[3]); } - if (_shape.num_dimensions() > 3) { std::swap(_shape[1], _shape[2]); } - if (_shape.num_dimensions() > 2) { std::swap(_shape[0], _shape[1]); } + * @brief ACL handles NH_C specifically, it thinks it is NC_W, so we need to change layout manually: + * e.g. NCHW (0, 1, 2, 3) -> NHWC (0, 2, 3, 1) + * @param _listDims list of dimensions to convert + * @return none + */ + +inline void changeLayoutToNH_C(const std::vector& _listDims) { + auto mover = [](arm_compute::TensorShape& _shape) { + if (_shape.num_dimensions() > 4) { + std::swap(_shape[2], _shape[3]); + } + if (_shape.num_dimensions() > 3) { + std::swap(_shape[1], _shape[2]); + } + if (_shape.num_dimensions() > 2) { + std::swap(_shape[0], _shape[1]); + } }; for (auto& dims : _listDims) { @@ -51,10 +57,10 @@ inline void changeLayoutToNH_C(const std::vector &_li } /** -* @brief Return ComputeLibrary TensorShape with reverted layout schema used in ACL -* @param dims vector of dimensions to convert -* @return ComputeLibrary TensorShape object -*/ + * @brief Return ComputeLibrary TensorShape with reverted layout schema used in ACL + * @param dims vector of dimensions to convert + * @return ComputeLibrary TensorShape object + */ inline arm_compute::TensorShape shapeCast(const VectorDims& dims) { arm_compute::TensorShape tensorShape; for (std::size_t i = 0; i < dims.size(); ++i) { @@ -67,20 +73,18 @@ inline arm_compute::TensorShape shapeCast(const VectorDims& dims) { return tensorShape; } -enum ACLAxisCastMode { - NO_LAYOUT_CONVERSION, - NHWC_TO_NCHW, - NCHW_TO_NHWC -}; +enum ACLAxisCastMode { NO_LAYOUT_CONVERSION, NHWC_TO_NCHW, NCHW_TO_NHWC }; /** -* @brief Return reverted axis used in ACL. If axis cast mode is -* @param axis axis that needs to be converted -* @param shapeSize size of the shape, which axis needs to be converted -* @param axisCastMode specifies whether layout conversion is required or not -* @return reverted axis -*/ -inline int axisCast(const std::size_t axis, const std::size_t shapeSize, ACLAxisCastMode axisCastMode = NO_LAYOUT_CONVERSION) { + * @brief Return reverted axis used in ACL. 
If axis cast mode is + * @param axis axis that needs to be converted + * @param shapeSize size of the shape, which axis needs to be converted + * @param axisCastMode specifies whether layout conversion is required or not + * @return reverted axis + */ +inline int axisCast(const std::size_t axis, + const std::size_t shapeSize, + ACLAxisCastMode axisCastMode = NO_LAYOUT_CONVERSION) { // CWHN (reverted NHWC) (0, 1, 2, 3) into WHCN (reverted NCHW) (1, 2, 0, 3) static const std::array nhwcToNchw = {1, 2, 0, 3}; // WHCN (reverted NCHW) (0, 1, 2, 3) into CWHN (reverted NHWC) (2, 0, 1, 3) @@ -92,80 +96,100 @@ inline int axisCast(const std::size_t axis, const std::size_t shapeSize, ACLAxis size_t revertedAxis = shapeSize - axis - 1; switch (axisCastMode) { - case NO_LAYOUT_CONVERSION: - return revertedAxis; - case NHWC_TO_NCHW: - if (shapeSize == 4) return nhwcToNchw[revertedAxis]; - if (shapeSize == 5) return ndhwcToNcdhw[revertedAxis]; - case NCHW_TO_NHWC: - if (shapeSize == 4) return nchwToNhwc[revertedAxis]; - if (shapeSize == 5) return ncdhwToNdhwc[revertedAxis]; - default: - return -1; + case NO_LAYOUT_CONVERSION: + return revertedAxis; + case NHWC_TO_NCHW: + if (shapeSize == 4) + return nhwcToNchw[revertedAxis]; + if (shapeSize == 5) + return ndhwcToNcdhw[revertedAxis]; + case NCHW_TO_NHWC: + if (shapeSize == 4) + return nchwToNhwc[revertedAxis]; + if (shapeSize == 5) + return ncdhwToNdhwc[revertedAxis]; + default: + return -1; } } /** -* @brief Return ComputeLibrary DataType that corresponds to the given precision -* @param precision precision to be converted -* @return ComputeLibrary DataType or UNKNOWN if precision is not mapped to DataType -*/ + * @brief Return ComputeLibrary DataType that corresponds to the given precision + * @param precision precision to be converted + * @return ComputeLibrary DataType or UNKNOWN if precision is not mapped to DataType + */ inline arm_compute::DataType precisionToAclDataType(ov::element::Type precision) { switch (precision) { - case ov::element::i8: return arm_compute::DataType::S8; - case ov::element::u8: return arm_compute::DataType::U8; - case ov::element::i16: return arm_compute::DataType::S16; - case ov::element::u16: return arm_compute::DataType::U16; - case ov::element::i32: return arm_compute::DataType::S32; - case ov::element::u32: return arm_compute::DataType::U32; - case ov::element::f16: return arm_compute::DataType::F16; - case ov::element::f32: return arm_compute::DataType::F32; - case ov::element::f64: return arm_compute::DataType::F64; - case ov::element::i64: return arm_compute::DataType::S64; - case ov::element::bf16: return arm_compute::DataType::BFLOAT16; - default: return arm_compute::DataType::UNKNOWN; + case ov::element::i8: + return arm_compute::DataType::S8; + case ov::element::u8: + return arm_compute::DataType::U8; + case ov::element::i16: + return arm_compute::DataType::S16; + case ov::element::u16: + return arm_compute::DataType::U16; + case ov::element::i32: + return arm_compute::DataType::S32; + case ov::element::u32: + return arm_compute::DataType::U32; + case ov::element::f16: + return arm_compute::DataType::F16; + case ov::element::f32: + return arm_compute::DataType::F32; + case ov::element::f64: + return arm_compute::DataType::F64; + case ov::element::i64: + return arm_compute::DataType::S64; + case ov::element::bf16: + return arm_compute::DataType::BFLOAT16; + default: + return arm_compute::DataType::UNKNOWN; } } /** -* @brief Return ComputeLibrary DataLayout that corresponds to MemoryDecs layout -* @param desc 
MemoryDecs from which layout is retrieved -* @param treatAs4D the flag that treats MemoryDecs as 4D shape -* @return ComputeLibrary DataLayout or UNKNOWN if MemoryDecs layout is not mapped to DataLayout -*/ + * @brief Return ComputeLibrary DataLayout that corresponds to MemoryDecs layout + * @param desc MemoryDecs from which layout is retrieved + * @param treatAs4D the flag that treats MemoryDecs as 4D shape + * @return ComputeLibrary DataLayout or UNKNOWN if MemoryDecs layout is not mapped to DataLayout + */ inline arm_compute::DataLayout getAclDataLayoutByMemoryDesc(MemoryDescCPtr desc) { if (desc->hasLayoutType(LayoutType::ncsp)) { - if (desc->getShape().getRank() <= 4) return arm_compute::DataLayout::NCHW; - if (desc->getShape().getRank() == 5) return arm_compute::DataLayout::NCDHW; + if (desc->getShape().getRank() <= 4) + return arm_compute::DataLayout::NCHW; + if (desc->getShape().getRank() == 5) + return arm_compute::DataLayout::NCDHW; } else if (desc->hasLayoutType(LayoutType::nspc)) { - if (desc->getShape().getRank() <= 4) return arm_compute::DataLayout::NHWC; - if (desc->getShape().getRank() == 5) return arm_compute::DataLayout::NDHWC; + if (desc->getShape().getRank() <= 4) + return arm_compute::DataLayout::NHWC; + if (desc->getShape().getRank() == 5) + return arm_compute::DataLayout::NDHWC; } return arm_compute::DataLayout::UNKNOWN; } /** -* @brief run thread-safe configure for ComputeLibrary configuration function. -* Arm Compute Library 23.08 does not officially support thread-safe configure() calls. -* For example, calling configure for Eltwise operations from multiple streams leads to a data race and seg fault. -* @param config ComputeLibrary configuration function -*/ + * @brief run thread-safe configure for ComputeLibrary configuration function. + * Arm Compute Library 23.08 does not officially support thread-safe configure() calls. + * For example, calling configure for Eltwise operations from multiple streams leads to a data race and seg fault. + * @param config ComputeLibrary configuration function + */ void configureThreadSafe(const std::function& config); /** -* @brief get ARM Compute Library ActivationLayerInfo for Eltwise or PostOps. -* @param algorithm activation function of openvino representation -* @param alpha alpha coefficient for algorithm -* @param beta beta coefficient for algorithm -* @param gamma gamma coefficient for algorithm -*/ + * @brief get ARM Compute Library ActivationLayerInfo for Eltwise or PostOps. + * @param algorithm activation function of openvino representation + * @param alpha alpha coefficient for algorithm + * @param beta beta coefficient for algorithm + * @param gamma gamma coefficient for algorithm + */ arm_compute::ActivationLayerInfo getActivationLayerInfo(Algorithm algorithm, float alpha, float beta, float gamma); /** -* @brief check ARM Compute Library ActivationLayerInfo for Eltwise or PostOps. -* @param algorithm activation function of openvino representation -*/ + * @brief check ARM Compute Library ActivationLayerInfo for Eltwise or PostOps. 
+ * @param algorithm activation function of openvino representation + */ bool checkActivationLayerInfo(Algorithm algorithm); -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/executors/common/common_utils.hpp b/src/plugins/intel_cpu/src/nodes/executors/common/common_utils.hpp new file mode 100644 index 00000000000000..614caead1a39b1 --- /dev/null +++ b/src/plugins/intel_cpu/src/nodes/executors/common/common_utils.hpp @@ -0,0 +1,66 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// +// @file common_utils.hpp +// Contains utility methods used by all executors +// + +#pragma once + +#include + +#include "nodes/executors/memory_arguments.hpp" +#include "utils/cpp/maybe_unused.hpp" +#include "utils/cpu_utils.hpp" + +namespace ov { +namespace intel_cpu { + +OV_CPU_MAYBE_UNUSED_FUNCTION static std::vector getDeQuantizedScales(const MemoryArgs& memory) { + if (!memory.count(ARG_DST_DEQ_SCALE)) + return {}; + + auto scalesMemory = memory.at(ARG_DST_DEQ_SCALE); + + auto scalesData = static_cast(scalesMemory->getData()); + + if (!scalesData) + return {}; + + auto dstShape = memory.at(ARG_DST)->getShape(); + auto dqScalesShape = scalesMemory->getShape(); + + auto scalesDims = getNormalizedDimsBySize(dqScalesShape.getDims(), dstShape.getDims().size()); + + auto scaleSize = std::accumulate(scalesDims.begin(), scalesDims.end(), std::size_t(1), std::multiplies()); + + std::vector DQScales(scaleSize, 1.0); + + OPENVINO_ASSERT(scaleSize == 1 || DQScales.size() == 1 || DQScales.size() == scaleSize, + "set invalid scales size , DQScales vector size: ", + DQScales.size(), + ", scale data size: ", + scaleSize); + + // @todo do we really need to broadcast dq scales and then resize them back? 
+ if (scaleSize > DQScales.size()) + DQScales.resize(scaleSize, DQScales[0]); + if (1 == scaleSize) { + std::transform(DQScales.begin(), DQScales.end(), DQScales.begin(), [=](float val) { + return (scalesData[0] * val); + }); + } else { + for (size_t i = 0; i < DQScales.size(); i++) { + DQScales[i] *= scalesData[i]; + } + } + if (std::all_of(DQScales.begin(), DQScales.end(), [&](float val) { + return (val == DQScales[0]); + })) + DQScales.resize(1); + + return DQScales; +} + +} // namespace intel_cpu +} // namespace ov \ No newline at end of file diff --git a/src/plugins/intel_cpu/src/nodes/executors/common/ref_convert.cpp b/src/plugins/intel_cpu/src/nodes/executors/common/ref_convert.cpp index 2bba0f5e73c0fe..de65176fb72235 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/common/ref_convert.cpp +++ b/src/plugins/intel_cpu/src/nodes/executors/common/ref_convert.cpp @@ -3,6 +3,7 @@ // #include "ref_convert.hpp" + #include "nodes/common/cpu_convert.h" namespace ov { @@ -13,9 +14,9 @@ bool CommonConvertExecutor::isSupported(ov::element::Type srcPrc, ov::element::T } bool CommonConvertExecutor::init(const ConvertParams& convertParams, - const MemoryDescPtr& srcDesc, - const MemoryDescPtr& dstDesc, - const dnnl::primitive_attr& attr) { + const MemoryDescPtr& srcDesc, + const MemoryDescPtr& dstDesc, + const dnnl::primitive_attr& attr) { commonConvertParams = convertParams; return true; } @@ -32,5 +33,5 @@ void CommonConvertExecutor::exec(const std::vector& src, const std:: commonConvertParams.size); } -} // namespace intel_cpu -} // namespace ov \ No newline at end of file +} // namespace intel_cpu +} // namespace ov \ No newline at end of file diff --git a/src/plugins/intel_cpu/src/nodes/executors/common/ref_convert.hpp b/src/plugins/intel_cpu/src/nodes/executors/common/ref_convert.hpp index 337d377f3b3339..4bc3a709d2bcd2 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/common/ref_convert.hpp +++ b/src/plugins/intel_cpu/src/nodes/executors/common/ref_convert.hpp @@ -15,9 +15,11 @@ class CommonConvertExecutor : public ConvertExecutor { bool init(const ConvertParams& convertParams, const MemoryDescPtr& srcDesc, const MemoryDescPtr& dstDesc, - const dnnl::primitive_attr &attr) override; + const dnnl::primitive_attr& attr) override; void exec(const std::vector& src, const std::vector& dst) override; - impl_desc_type implType() const override { return implDescType; }; + impl_desc_type implType() const override { + return implDescType; + }; static bool isSupported(ov::element::Type srcPrc, ov::element::Type dstPrc); protected: @@ -26,7 +28,6 @@ class CommonConvertExecutor : public ConvertExecutor { const ExecutorContext::CPtr convertContext; }; - class CommonConvertExecutorBuilder : public ConvertExecutorBuilder { public: ~CommonConvertExecutorBuilder() = default; @@ -40,5 +41,5 @@ class CommonConvertExecutorBuilder : public ConvertExecutorBuilder { } }; -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/executors/common/ref_opt_transpose.cpp b/src/plugins/intel_cpu/src/nodes/executors/common/ref_opt_transpose.cpp index 0e1d43b48f6224..dd0cea3d238a4e 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/common/ref_opt_transpose.cpp +++ b/src/plugins/intel_cpu/src/nodes/executors/common/ref_opt_transpose.cpp @@ -3,6 +3,7 @@ // #include "ref_opt_transpose.hpp" + #include "openvino/core/parallel.hpp" namespace ov { @@ -26,21 +27,15 @@ void transpose_to_0312(const int MB, const MemoryCPtr& srcMemPtr, 
MemoryPtr& dst parallel_for3d(MB, DIM1, DIM2, [&](const int n, const int dim1, const int dim2) { for (int dim3 = 0; dim3 < DIM3; ++dim3) { - const int src_off = n * DIM1 * DIM2 * DIM3 + - dim1 * DIM2 * DIM3 + - dim2 * DIM3 + - dim3; - const int dst_off = n * DIM1 * DIM2 * DIM3 + - dim3 * DIM1 * DIM2 + - dim1 * DIM2 + - dim2; + const int src_off = n * DIM1 * DIM2 * DIM3 + dim1 * DIM2 * DIM3 + dim2 * DIM3 + dim3; + const int dst_off = n * DIM1 * DIM2 * DIM3 + dim3 * DIM1 * DIM2 + dim1 * DIM2 + dim2; dst_data[dst_off] = src_data[src_off]; } }); } -template +template void transpose_to_04123(const int MB, const MemoryCPtr& srcMemPtr, MemoryPtr& dstMemPtr) { const auto src_data = srcMemPtr->getDataAs(); auto dst_data = dstMemPtr->getDataAs(); @@ -52,23 +47,17 @@ void transpose_to_04123(const int MB, const MemoryCPtr& srcMemPtr, MemoryPtr& ds parallel_for4d(MB, DIM1, DIM2, DIM3, [&](const int n, const int dim1, const int dim2, const int dim3) { for (int dim4 = 0; dim4 < DIM4; ++dim4) { - const int src_off = n * DIM1 * DIM2 * DIM3 * DIM4 + - dim1 * DIM2 * DIM3 * DIM4 + - dim2 * DIM3 * DIM4 + - dim3 * DIM4 + - dim4; - const int dst_off = n * DIM1 * DIM2 * DIM3 * DIM4 + - dim4 * DIM1 * DIM2 * DIM3 + - dim1 * DIM2 * DIM3 + - dim2 * DIM3 + - dim3; + const int src_off = + n * DIM1 * DIM2 * DIM3 * DIM4 + dim1 * DIM2 * DIM3 * DIM4 + dim2 * DIM3 * DIM4 + dim3 * DIM4 + dim4; + const int dst_off = + n * DIM1 * DIM2 * DIM3 * DIM4 + dim4 * DIM1 * DIM2 * DIM3 + dim1 * DIM2 * DIM3 + dim2 * DIM3 + dim3; dst_data[dst_off] = src_data[src_off]; } }); } -template +template void transpose_to_051234(const int MB, const MemoryCPtr& srcMemPtr, MemoryPtr& dstMemPtr) { const auto src_data = srcMemPtr->getDataAs(); auto dst_data = dstMemPtr->getDataAs(); @@ -79,61 +68,61 @@ void transpose_to_051234(const int MB, const MemoryCPtr& srcMemPtr, MemoryPtr& d const int DIM4 = srcMemPtr->getStaticDims()[4]; const int DIM5 = srcMemPtr->getStaticDims()[5]; - parallel_for5d(MB, DIM1, DIM2, DIM3, DIM4, [&](const int n, const int dim1, const int dim2, const int dim3, const int dim4) { - for (int dim5 = 0; dim5 < DIM5; ++dim5) { - const int src_off = n * DIM1 * DIM2 * DIM3 * DIM4 * DIM5 + - dim1 * DIM2 * DIM3 * DIM4 * DIM5 + - dim2 * DIM3 * DIM4 * DIM5 + - dim3 * DIM4 * DIM5 + - dim4 * DIM5 + - dim5; - const int dst_off = n * DIM5 * DIM1 * DIM2 * DIM3 * DIM4 + - dim5 * DIM1 * DIM2 * DIM3 * DIM4 + - dim1 * DIM2 * DIM3 * DIM4 + - dim2 * DIM3 * DIM4 + - dim3 * DIM4 + - dim4; - - dst_data[dst_off] = src_data[src_off]; - } - }); + parallel_for5d(MB, + DIM1, + DIM2, + DIM3, + DIM4, + [&](const int n, const int dim1, const int dim2, const int dim3, const int dim4) { + for (int dim5 = 0; dim5 < DIM5; ++dim5) { + const int src_off = n * DIM1 * DIM2 * DIM3 * DIM4 * DIM5 + dim1 * DIM2 * DIM3 * DIM4 * DIM5 + + dim2 * DIM3 * DIM4 * DIM5 + dim3 * DIM4 * DIM5 + dim4 * DIM5 + dim5; + const int dst_off = n * DIM5 * DIM1 * DIM2 * DIM3 * DIM4 + dim5 * DIM1 * DIM2 * DIM3 * DIM4 + + dim1 * DIM2 * DIM3 * DIM4 + dim2 * DIM3 * DIM4 + dim3 * DIM4 + dim4; + + dst_data[dst_off] = src_data[src_off]; + } + }); } -template +template struct TransposeOptimizedEmitter { void operator()(TransposeContext& ctx) { switch (ctx.srcMemPtr->getStaticDims().size()) { - case 4: - transpose_to_0312(ctx.MB, ctx.srcMemPtr, ctx.dstMemPtr); - break; - case 5: - transpose_to_04123(ctx.MB, ctx.srcMemPtr, ctx.dstMemPtr); - break; - case 6: - transpose_to_051234(ctx.MB, ctx.srcMemPtr, ctx.dstMemPtr); - break; - default: - OPENVINO_THROW("Transpose supports optimized execution with only 
4D, 5D and 6D shapes"); + case 4: + transpose_to_0312(ctx.MB, ctx.srcMemPtr, ctx.dstMemPtr); + break; + case 5: + transpose_to_04123(ctx.MB, ctx.srcMemPtr, ctx.dstMemPtr); + break; + case 6: + transpose_to_051234(ctx.MB, ctx.srcMemPtr, ctx.dstMemPtr); + break; + default: + OPENVINO_THROW("Transpose supports optimized execution with only 4D, 5D and 6D shapes"); } } }; -} // namespace +} // namespace void RefOptimizedTransposeExecutor::exec(const std::vector& src, const std::vector& dst) { const size_t dataSize = src[0]->getDesc().getPrecision().size(); const int MB = src[0]->getStaticDims()[0]; TransposeContext ctx = {src[0], dst[0], MB}; - OV_SWITCH(intel_cpu, TransposeOptimizedEmitter, ctx, dataSize, + OV_SWITCH(intel_cpu, + TransposeOptimizedEmitter, + ctx, + dataSize, OV_CASE(1u, element_type_traits::value_type), OV_CASE(2u, element_type_traits::value_type), OV_CASE(4u, element_type_traits::value_type)); } -bool RefOptimizedTransposeExecutor::init(const TransposeParams &transposeParams, - const std::vector &srcDescs, - const std::vector &dstDescs, - const dnnl::primitive_attr &attr) { +bool RefOptimizedTransposeExecutor::init(const TransposeParams& transposeParams, + const std::vector& srcDescs, + const std::vector& dstDescs, + const dnnl::primitive_attr& attr) { return true; } -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/executors/common/ref_opt_transpose.hpp b/src/plugins/intel_cpu/src/nodes/executors/common/ref_opt_transpose.hpp index be420bfb009e5a..65da099caa0f33 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/common/ref_opt_transpose.hpp +++ b/src/plugins/intel_cpu/src/nodes/executors/common/ref_opt_transpose.hpp @@ -13,12 +13,14 @@ class RefOptimizedTransposeExecutor : public TransposeExecutor { public: using TransposeExecutor::TransposeExecutor; - bool init(const TransposeParams &transposeParams, - const std::vector &srcDescs, - const std::vector &dstDescs, - const dnnl::primitive_attr &attr) override; - void exec(const std::vector &src, const std::vector &dst) override; - impl_desc_type implType() const override { return impl_desc_type::ref; } + bool init(const TransposeParams& transposeParams, + const std::vector& srcDescs, + const std::vector& dstDescs, + const dnnl::primitive_attr& attr) override; + void exec(const std::vector& src, const std::vector& dst) override; + impl_desc_type implType() const override { + return impl_desc_type::ref; + } }; class RefOptimizedTransposeExecutorBuilder : public TransposeExecutorBuilder { @@ -27,12 +29,13 @@ class RefOptimizedTransposeExecutorBuilder : public TransposeExecutorBuilder { const std::vector& srcDescs, const std::vector& dstDescs) const override { static const std::vector> optimizedOrders = { - std::vector{0, 3, 1, 2}, - std::vector{0, 4, 1, 2, 3}, - std::vector{0, 5, 1, 2, 3, 4}, + std::vector{0, 3, 1, 2}, + std::vector{0, 4, 1, 2, 3}, + std::vector{0, 5, 1, 2, 3, 4}, }; if (srcDescs[0]->hasLayoutType(LayoutType::ncsp) && - std::find(optimizedOrders.begin(), optimizedOrders.end(), transposeParams.permuteParams.order) != optimizedOrders.end()) { + std::find(optimizedOrders.begin(), optimizedOrders.end(), transposeParams.permuteParams.order) != + optimizedOrders.end()) { return true; } DEBUG_LOG("RefOptimizedTransposeExecutor is not supported, because passed order is not optimized"); @@ -44,5 +47,5 @@ class RefOptimizedTransposeExecutorBuilder : public TransposeExecutorBuilder { } }; -} // namespace intel_cpu -} // namespace ov +} // 
namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/executors/common/ref_transpose.cpp b/src/plugins/intel_cpu/src/nodes/executors/common/ref_transpose.cpp index 8db8798ef8eaff..1716f008027fe9 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/common/ref_transpose.cpp +++ b/src/plugins/intel_cpu/src/nodes/executors/common/ref_transpose.cpp @@ -3,8 +3,9 @@ // #include "ref_transpose.hpp" -#include "openvino/core/parallel.hpp" + #include "nodes/common/cpu_memcpy.h" +#include "openvino/core/parallel.hpp" namespace ov { namespace intel_cpu { @@ -27,7 +28,10 @@ static inline void parallel_step(size_t nDims, const VectorDims& dims, VectorDim } } -void RefTransposeExecutor::referenceExecute(const uint8_t* src_data, uint8_t* dst_data, jit_permute_config_params jcp, const int mb) { +void RefTransposeExecutor::referenceExecute(const uint8_t* src_data, + uint8_t* dst_data, + jit_permute_config_params jcp, + const int mb) { VectorDims dst_dims = jcp.dst_block_dims; const VectorDims dst_strides = jcp.dst_strides; const VectorDims src_strides = jcp.src_strides; @@ -70,13 +74,13 @@ void RefTransposeExecutor::exec(const std::vector& src, const std::v referenceExecute(src_data, dst_data, jcp, MB); } -bool RefTransposeExecutor::init(const TransposeParams &transposeParams, - const std::vector &srcDescs, - const std::vector &dstDescs, - const dnnl::primitive_attr &attr) { +bool RefTransposeExecutor::init(const TransposeParams& transposeParams, + const std::vector& srcDescs, + const std::vector& dstDescs, + const dnnl::primitive_attr& attr) { jcp = TransposeExecutor::prepareParams(transposeParams.permuteParams); return true; } -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/executors/common/ref_transpose.hpp b/src/plugins/intel_cpu/src/nodes/executors/common/ref_transpose.hpp index 206d610368a9df..00c1602c0bd119 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/common/ref_transpose.hpp +++ b/src/plugins/intel_cpu/src/nodes/executors/common/ref_transpose.hpp @@ -11,13 +11,19 @@ namespace intel_cpu { class RefTransposeExecutor : public TransposeExecutor { public: using TransposeExecutor::TransposeExecutor; - static void referenceExecute(const uint8_t* src_data, uint8_t* dst_data, jit_permute_config_params jcp, const int mb); - bool init(const TransposeParams &transposeParams, - const std::vector &srcDescs, - const std::vector &dstDescs, - const dnnl::primitive_attr &attr) override; - void exec(const std::vector &src, const std::vector &dst) override; - impl_desc_type implType() const override { return impl_desc_type::ref; } + static void referenceExecute(const uint8_t* src_data, + uint8_t* dst_data, + jit_permute_config_params jcp, + const int mb); + bool init(const TransposeParams& transposeParams, + const std::vector& srcDescs, + const std::vector& dstDescs, + const dnnl::primitive_attr& attr) override; + void exec(const std::vector& src, const std::vector& dst) override; + impl_desc_type implType() const override { + return impl_desc_type::ref; + } + private: jit_permute_config_params jcp; }; @@ -35,5 +41,5 @@ class RefTransposeExecutorBuilder : public TransposeExecutorBuilder { } }; -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/executors/convert.cpp b/src/plugins/intel_cpu/src/nodes/executors/convert.cpp index c8d7ce8addaf22..32141d53b10ee5 100644 --- 
a/src/plugins/intel_cpu/src/nodes/executors/convert.cpp +++ b/src/plugins/intel_cpu/src/nodes/executors/convert.cpp @@ -4,4 +4,5 @@ #include "convert.hpp" -ov::intel_cpu::ConvertExecutor::ConvertExecutor(const ov::intel_cpu::ExecutorContext::CPtr context) : convertContext(context) {} \ No newline at end of file +ov::intel_cpu::ConvertExecutor::ConvertExecutor(const ov::intel_cpu::ExecutorContext::CPtr context) + : convertContext(context) {} \ No newline at end of file diff --git a/src/plugins/intel_cpu/src/nodes/executors/convert.hpp b/src/plugins/intel_cpu/src/nodes/executors/convert.hpp index ce766663a0b653..dcb0bdde2ce219 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/convert.hpp +++ b/src/plugins/intel_cpu/src/nodes/executors/convert.hpp @@ -5,8 +5,8 @@ #pragma once #include "cpu_memory.h" -#include "onednn/iml_type_mapper.h" #include "executor.hpp" +#include "onednn/iml_type_mapper.h" namespace ov { namespace intel_cpu { @@ -24,8 +24,9 @@ class ConvertExecutor : public Executor { virtual bool init(const ConvertParams& convertParams, const MemoryDescPtr& srcDesc, const MemoryDescPtr& dstDesc, - const dnnl::primitive_attr &attr) = 0; + const dnnl::primitive_attr& attr) = 0; virtual ~ConvertExecutor() = default; + protected: ConvertParams convertParams; const ExecutorContext::CPtr convertContext; @@ -45,5 +46,5 @@ class ConvertExecutorBuilder { using ConvertExecutorBuilderPtr = std::shared_ptr; using ConvertExecutorBuilderCPtr = std::shared_ptr; -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/executors/convert_list.cpp b/src/plugins/intel_cpu/src/nodes/executors/convert_list.cpp index 504c310ca15124..5375bd21166cc4 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/convert_list.cpp +++ b/src/plugins/intel_cpu/src/nodes/executors/convert_list.cpp @@ -9,9 +9,8 @@ namespace intel_cpu { const std::vector& getConvertExecutorsList() { static std::vector descs = { - OV_CPU_INSTANCE_ACL(ExecutorType::Acl, std::make_shared()) - OV_CPU_INSTANCE_COMMON(ExecutorType::Common, std::make_shared()) - }; + OV_CPU_INSTANCE_ACL(ExecutorType::Acl, std::make_shared()) + OV_CPU_INSTANCE_COMMON(ExecutorType::Common, std::make_shared())}; return descs; } @@ -45,5 +44,5 @@ ConvertExecutorPtr ConvertExecutorFactory::makeExecutor(const ConvertParams& con OPENVINO_THROW("Supported executor is not found"); } -} // namespace intel_cpu -} // namespace ov \ No newline at end of file +} // namespace intel_cpu +} // namespace ov \ No newline at end of file diff --git a/src/plugins/intel_cpu/src/nodes/executors/convert_list.hpp b/src/plugins/intel_cpu/src/nodes/executors/convert_list.hpp index a7ed05ceb634e4..9ea47f916d859f 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/convert_list.hpp +++ b/src/plugins/intel_cpu/src/nodes/executors/convert_list.hpp @@ -4,17 +4,15 @@ #pragma once -#include "executor.hpp" - #include "convert.hpp" +#include "executor.hpp" #if defined(OV_CPU_WITH_ACL) -#include "acl/acl_convert.hpp" +# include "acl/acl_convert.hpp" #endif +#include "common/primitive_cache.hpp" #include "common/ref_convert.hpp" - #include "onednn/iml_type_mapper.h" -#include "common/primitive_cache.hpp" namespace ov { namespace intel_cpu { @@ -31,7 +29,8 @@ class ConvertExecutorFactory : public ExecutorFactoryLegacy { ConvertExecutorFactory(const ConvertParams& convertParams, const MemoryDescPtr& srcDesc, const MemoryDescPtr& dstDesc, - const ExecutorContext::CPtr context) : ExecutorFactoryLegacy(context) { + const 
ExecutorContext::CPtr context) + : ExecutorFactoryLegacy(context) { for (auto& desc : getConvertExecutorsList()) { if (desc.builder->isSupported(convertParams, srcDesc, dstDesc)) { supportedDescs.push_back(desc); @@ -43,7 +42,7 @@ class ConvertExecutorFactory : public ExecutorFactoryLegacy { virtual ConvertExecutorPtr makeExecutor(const ConvertParams& convertParams, const MemoryDescPtr& srcDesc, const MemoryDescPtr& dstDesc, - const dnnl::primitive_attr &attr); + const dnnl::primitive_attr& attr); private: std::vector supportedDescs; @@ -53,5 +52,5 @@ class ConvertExecutorFactory : public ExecutorFactoryLegacy { using ConvertExecutorFactoryPtr = std::shared_ptr; using ConvertExecutorFactoryCPtr = std::shared_ptr; -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/executors/debug_messages.hpp b/src/plugins/intel_cpu/src/nodes/executors/debug_messages.hpp index 26ae6ace59631b..cba22a25c751d6 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/debug_messages.hpp +++ b/src/plugins/intel_cpu/src/nodes/executors/debug_messages.hpp @@ -4,25 +4,26 @@ #pragma once -#define UNSUPPORTED_SPARSE_WEIGHTS " sparse weights are not supported" -#define UNSUPPORTED_WEIGHTS_DECOMPRESSION " weights decompression is not supported" -#define UNSUPPORTED_POST_OPS " post ops are not supported" -#define UNSUPPORTED_NUMBER_OF_POSTOPS " the number of post ops is not supported" -#define UNSUPPORTED_TYPE_OF_POSTOPS " the type of post ops is not supported" -#define UNSUPPORTED_SRC_PRECISIONS " unsupported src precisions" -#define UNSUPPORTED_WEI_PRECISIONS " unsupported wei precisions" -#define UNSUPPORTED_DST_PRECISIONS " unsupported dst precisions" -#define UNSUPPORTED_ISA " unsupported isa" -#define UNSUPPORTED_SRC_RANK " unsupported src rank" -#define UNSUPPORTED_WEI_RANK " unsupported wei rank" -#define UNSUPPORTED_DST_RANK " unsupported dst rank" -#define UNSUPPORTED_DST_STRIDES " unsupported dst strides" -#define HEURISTICS_MISMATCH " heuristics mismatch" +#define UNSUPPORTED_SPARSE_WEIGHTS " sparse weights are not supported" +#define UNSUPPORTED_WEIGHTS_DECOMPRESSION " weights decompression is not supported" +#define UNSUPPORTED_POST_OPS " post ops are not supported" +#define UNSUPPORTED_NUMBER_OF_POSTOPS " the number of post ops is not supported" +#define UNSUPPORTED_TYPE_OF_POSTOPS " the type of post ops is not supported" +#define UNSUPPORTED_SRC_PRECISIONS " unsupported src precisions" +#define UNSUPPORTED_WEI_PRECISIONS " unsupported wei precisions" +#define UNSUPPORTED_DST_PRECISIONS " unsupported dst precisions" +#define UNSUPPORTED_ISA " unsupported isa" +#define UNSUPPORTED_SRC_RANK " unsupported src rank" +#define UNSUPPORTED_WEI_RANK " unsupported wei rank" +#define UNSUPPORTED_DST_RANK " unsupported dst rank" +#define UNSUPPORTED_DST_STRIDES " unsupported dst strides" +#define HEURISTICS_MISMATCH " heuristics mismatch" +#define UNSUPPORTED_PER_CHANNEL_QUANTIZATION " unsupported per-channel quantization" -#define VERIFY(condition, ...) \ - do { \ - if (!(condition)) { \ +#define VERIFY(condition, ...) 
\ + do { \ + if (!(condition)) { \ DEBUG_LOG(__VA_ARGS__); \ - return false; \ - } \ + return false; \ + } \ } while (0) diff --git a/src/plugins/intel_cpu/src/nodes/executors/deconv.cpp b/src/plugins/intel_cpu/src/nodes/executors/deconv.cpp index 23e0910bd0c82c..e485815e950af4 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/deconv.cpp +++ b/src/plugins/intel_cpu/src/nodes/executors/deconv.cpp @@ -5,8 +5,5 @@ #include "deconv.hpp" namespace ov { -namespace intel_cpu { - - -} // namespace intel_cpu -} // namespace ov +namespace intel_cpu {} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/executors/deconv.hpp b/src/plugins/intel_cpu/src/nodes/executors/deconv.hpp index c632cc0cf99ad1..11920c0ab35b49 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/deconv.hpp +++ b/src/plugins/intel_cpu/src/nodes/executors/deconv.hpp @@ -34,11 +34,11 @@ class DeconvExecutor { virtual bool init(const DeconvAttrs& deconvAttrs, const std::vector& srcDescs, const std::vector& dstDescs, - const dnnl::primitive_attr &attr) = 0; + const dnnl::primitive_attr& attr) = 0; virtual void exec(const std::vector& src, const std::vector& dst, - const void *post_ops_data_) = 0; + const void* post_ops_data_) = 0; virtual ~DeconvExecutor() = default; virtual impl_desc_type getImplType() const = 0; @@ -53,12 +53,14 @@ using DeconvExecutorCPtr = std::shared_ptr; class DeconvExecutorBuilder { public: ~DeconvExecutorBuilder() = default; - virtual bool isSupported(const DeconvAttrs& convAttrs, const std::vector& srcDescs, const std::vector& dstDescs) const = 0; + virtual bool isSupported(const DeconvAttrs& convAttrs, + const std::vector& srcDescs, + const std::vector& dstDescs) const = 0; virtual DeconvExecutorPtr makeExecutor(const ExecutorContext::CPtr context) const = 0; }; using DeconvExecutorBuilderPtr = std::shared_ptr; using DeconvExecutorBuilderCPtr = std::shared_ptr; -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/executors/deconv_list.cpp b/src/plugins/intel_cpu/src/nodes/executors/deconv_list.cpp index f5b897c2d1b6e1..c093057e47413f 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/deconv_list.cpp +++ b/src/plugins/intel_cpu/src/nodes/executors/deconv_list.cpp @@ -9,11 +9,10 @@ namespace intel_cpu { const std::vector& getDeconvExecutorsList() { static std::vector descs = { - OV_CPU_INSTANCE_ACL(ExecutorType::Acl, std::make_shared()) - }; + OV_CPU_INSTANCE_ACL(ExecutorType::Acl, std::make_shared())}; return descs; } -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/executors/deconv_list.hpp b/src/plugins/intel_cpu/src/nodes/executors/deconv_list.hpp index 4c63a565aac2e0..fd114094303808 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/deconv_list.hpp +++ b/src/plugins/intel_cpu/src/nodes/executors/deconv_list.hpp @@ -4,15 +4,14 @@ #pragma once -#include "executor.hpp" - #include "deconv.hpp" +#include "executor.hpp" #if defined(OV_CPU_WITH_ACL) -#include "acl/acl_deconv.hpp" +# include "acl/acl_deconv.hpp" #endif -#include "onednn/iml_type_mapper.h" #include "common/primitive_cache.hpp" +#include "onednn/iml_type_mapper.h" namespace ov { namespace intel_cpu { @@ -29,7 +28,8 @@ class DeconvExecutorFactory : public ExecutorFactoryLegacy { DeconvExecutorFactory(const DeconvAttrs& deconvAttrs, const std::vector& srcDescs, const std::vector& dstDescs, - const ExecutorContext::CPtr context) : 
ExecutorFactoryLegacy(context) { + const ExecutorContext::CPtr context) + : ExecutorFactoryLegacy(context) { for (auto& desc : getDeconvExecutorsList()) { if (desc.builder->isSupported(deconvAttrs, srcDescs, dstDescs)) { supportedDescs.push_back(desc); @@ -41,7 +41,7 @@ class DeconvExecutorFactory : public ExecutorFactoryLegacy { virtual DeconvExecutorPtr makeExecutor(const DeconvAttrs& deconvAttrs, const std::vector& srcDescs, const std::vector& dstDescs, - const dnnl::primitive_attr &attr) { + const dnnl::primitive_attr& attr) { auto build = [&](const DeconvExecutorDesc* desc) { auto executor = desc->builder->makeExecutor(context); if (executor->init(deconvAttrs, srcDescs, dstDescs, attr)) { @@ -75,5 +75,5 @@ class DeconvExecutorFactory : public ExecutorFactoryLegacy { using DeconvExecutorFactoryPtr = std::shared_ptr; using DeconvExecutorFactoryCPtr = std::shared_ptr; -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/executors/dnnl/dnnl_aliases.hpp b/src/plugins/intel_cpu/src/nodes/executors/dnnl/dnnl_aliases.hpp index a611e94f617e44..27fa7dd38d7a99 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/dnnl/dnnl_aliases.hpp +++ b/src/plugins/intel_cpu/src/nodes/executors/dnnl/dnnl_aliases.hpp @@ -4,8 +4,8 @@ #pragma once -#include #include +#include namespace ov { namespace intel_cpu { diff --git a/src/plugins/intel_cpu/src/nodes/executors/dnnl/dnnl_convolution_primitive.cpp b/src/plugins/intel_cpu/src/nodes/executors/dnnl/dnnl_convolution_primitive.cpp index 8f9d7ad0805e41..61aca683a37687 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/dnnl/dnnl_convolution_primitive.cpp +++ b/src/plugins/intel_cpu/src/nodes/executors/dnnl/dnnl_convolution_primitive.cpp @@ -157,8 +157,7 @@ static DnnlPrimitiveAttrs createPrimitiveAttrs(const ConvAttrs& attrs, one_of(srcDesc->getPrecision(), ov::element::u8, ov::element::i8) && weiDesc->getPrecision() == ov::element::i8; auto outputDataType = DnnlExtensionUtils::ElementTypeToDataType(dstDesc->getPrecision()); - DnnlPostOpsComposer - dnnlpoc(postOps, context->getEngine(), dims, 1, isINT8, 1 << 0, {}, attrs.withBias, outputDataType); + DnnlPostOpsComposer dnnlpoc(postOps, context->getEngine(), dims, 1, isINT8, 1 << 0, memory, outputDataType); return dnnlpoc.compose(); } diff --git a/src/plugins/intel_cpu/src/nodes/executors/dnnl/dnnl_fullyconnected.hpp b/src/plugins/intel_cpu/src/nodes/executors/dnnl/dnnl_fullyconnected.hpp index 266e78b3d46c77..db5c8bed2e43e1 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/dnnl/dnnl_fullyconnected.hpp +++ b/src/plugins/intel_cpu/src/nodes/executors/dnnl/dnnl_fullyconnected.hpp @@ -8,12 +8,12 @@ #include #include "cpu_memory.h" -#include "nodes/executors/dnnl/dnnl_fullyconnected_primitive.hpp" -#include "nodes/executors/dnnl/dnnl_convolution_primitive.hpp" +#include "memory_desc/cpu_memory_desc_utils.h" #include "nodes/executors/dnnl/dnnl_aliases.hpp" +#include "nodes/executors/dnnl/dnnl_utils.hpp" #include "nodes/executors/executor.hpp" -#include "memory_desc/cpu_memory_desc_utils.h" #include "nodes/executors/memory_arguments.hpp" +#include "post_ops.hpp" namespace ov { namespace intel_cpu { @@ -73,7 +73,7 @@ class DnnlFCExecutor : public Executor { return; } const auto newPrimMemDesc = m_primitive->scratchPadDesc(); - m_scratchPadMemory = m_context->getScratchPad(numaNodeID)->createScratchPadMem(newPrimMemDesc); + m_scratchPadMemory = m_context->getScratchPad()->createScratchPadMem(newPrimMemDesc); 
m_primArgs[DNNL_ARG_SCRATCHPAD] = m_scratchPadMemory->getPrimitive(); if (m_primArgs.count(DNNL_ARG_WEIGHTS)) { @@ -123,7 +123,8 @@ class DnnlFCExecutor : public Executor { if (currentPrimitive && currentPrimitive->weightsDesc()->isCompatible(*newPrimMemDesc)) return; - originalMemDesc = Primitive::makeTransposedWeightDescriptor(originalMemDesc, newPrimMemDesc, m_attrs.weightsNonTransposed); + originalMemDesc = + Primitive::makeTransposedWeightDescriptor(originalMemDesc, newPrimMemDesc, m_attrs.weightsNonTransposed); const auto weiMemory = utils::prepareWeightsMemory(originalMemDesc, newPrimMemDesc, memory, m_context, true); m_primArgs[DNNL_ARG_WEIGHTS] = weiMemory->getPrimitive(); @@ -139,13 +140,11 @@ class DnnlFCExecutor : public Executor { if (currentPrimitive && currentPrimitive->scratchPadDesc()->isCompatible(*newPrimMemDesc)) return; - m_scratchPadMemory = m_context->getScratchPad(curNumaNode)->createScratchPadMem(newPrimMemDesc); + m_scratchPadMemory = m_context->getScratchPad()->createScratchPadMem(newPrimMemDesc); m_primArgs[DNNL_ARG_SCRATCHPAD] = m_scratchPadMemory->getPrimitive(); } - void updateMemory(const PrimitivePtr currentPrimitive, - const PrimitivePtr newPrimitive, - const MemoryArgs& memory) { + void updateMemory(const PrimitivePtr currentPrimitive, const PrimitivePtr newPrimitive, const MemoryArgs& memory) { const auto& srcDesc = MemoryDescUtils::convertToDnnlMemoryDesc(memory.at(ARG_SRC)->getDescPtr()); const auto& weiDesc = MemoryDescUtils::convertToDnnlMemoryDesc(memory.at(ARG_WEI)->getDescPtr()); const auto& dstDesc = MemoryDescUtils::convertToDnnlMemoryDesc(memory.at(ARG_DST)->getDescPtr()); diff --git a/src/plugins/intel_cpu/src/nodes/executors/dnnl/dnnl_fullyconnected_primitive.cpp b/src/plugins/intel_cpu/src/nodes/executors/dnnl/dnnl_fullyconnected_primitive.cpp index fcb70d4753b2ce..52434a1eeb8461 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/dnnl/dnnl_fullyconnected_primitive.cpp +++ b/src/plugins/intel_cpu/src/nodes/executors/dnnl/dnnl_fullyconnected_primitive.cpp @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include @@ -27,6 +28,7 @@ #include "nodes/executors/executor.hpp" #include "nodes/executors/fullyconnected_config.hpp" #include "nodes/executors/memory_arguments.hpp" +#include "utils/cpu_utils.hpp" #include "utils/debug_capabilities.h" namespace ov { @@ -72,9 +74,8 @@ bool DnnlFCPrimitive::Key::operator==(const Key& rhs) const { result = result && dst && rhs.dst && dst->getDnnlDesc() == rhs.dst->getDnnlDesc(); } - result = result && *attr.get() == *rhs.attr.get() && - sparseWeights == rhs.sparseWeights && - modelType == rhs.modelType; + result = + result && *attr.get() == *rhs.attr.get() && sparseWeights == rhs.sparseWeights && modelType == rhs.modelType; return result; } @@ -115,9 +116,10 @@ DnnlMemoryDescPtr DnnlFCPrimitive::makeTransposedWeightDescriptor(const DnnlMemo return srcDesc; const auto& weiDesc = srcDesc->getDnnlDesc(); - const auto reorderedWeiDesc = - dnnl::memory::desc{weiDesc.get_dims(), weiDesc.get_data_type(), dnnl::memory::format_tag::ba}; - const auto transposedWeiDesc = reorderedWeiDesc.reshape(dstDesc->getDnnlDesc().get_dims()); + auto wDims = weiDesc.get_dims(); + dnnl::memory::dims wDims2D = reshapeDownToRank<2>(wDims); + + const auto transposedWeiDesc = dnnl::memory::desc{wDims2D, weiDesc.get_data_type(), dnnl::memory::format_tag::ba}; return DnnlExtensionUtils::makeDescriptor(transposedWeiDesc); } @@ -140,12 +142,11 @@ bool DnnlFCPrimitive::useWeightsDecompressionImpl(const 
ov::element::Type inputT return false; } -bool DnnlFCPrimitive::useDynamicQuantizationImpl(size_t dqGroupSize, - const MemoryDescPtr srcDesc, - const MemoryDescPtr weightsDesc, - MemoryCPtr scalesPtr, - MemoryCPtr zpPtr, - bool needTranspose) { +static bool useDynamicQuantizationImpl(size_t dqGroupSize, + const MemoryDescPtr srcDesc, + const MemoryDescPtr weightsDesc, + const MemoryArgs& memory, + bool needTranspose) { if (dqGroupSize == 0) return false; @@ -155,6 +156,9 @@ bool DnnlFCPrimitive::useDynamicQuantizationImpl(size_t dqGroupSize, if (srcDesc->getPrecision() != ov::element::f32) return false; + + MemoryCPtr zpPtr = + memory.count(ARG_WEI | ARG_ATTR_ZERO_POINTS) ? memory.at(ARG_WEI | ARG_ATTR_ZERO_POINTS) : nullptr; // For dynamic quantization, VNNI accumulation requires weight to be unsigned. // To support dynamic quantization with weights symmetrically quantized as i8/i4 // w/o zero-point, we will transform weight to u8/u4 weight with zp 128/8. @@ -177,11 +181,15 @@ bool DnnlFCPrimitive::useDynamicQuantizationImpl(size_t dqGroupSize, if (weightsDesc->getPrecision() == ov::element::u4) { int ic = weightsDesc->getShape().getStaticDims()[1]; int minGroupSize = INT_MAX; + + MemoryCPtr scalesPtr = memory.count(ARG_WEI | ARG_ATTR_SCALES) ? memory.at(ARG_WEI | ARG_ATTR_SCALES) : nullptr; + if (scalesPtr && scalesPtr->getShape().getRank() == 3) { auto scalesDims = scalesPtr->getShape().getStaticDims(); auto groupsNum = needTranspose ? scalesDims[1] : scalesDims[0]; minGroupSize = ic / groupsNum; } + if (zpPtr && zpPtr->getShape().getRank() == 3) { auto zpDims = zpPtr->getShape().getStaticDims(); int groupsNum = needTranspose ? zpDims[1] : zpDims[0]; @@ -196,11 +204,6 @@ bool DnnlFCPrimitive::useDynamicQuantizationImpl(size_t dqGroupSize, return true; } -template -static std::vector normalizeDimsTo2D(const std::vector& dims) { - return {std::accumulate(dims.begin(), dims.end() - 1, (T)1, std::multiplies()), dims[dims.size() - 1]}; -} - static DnnlPrimitiveAttrs createPrimitiveAttrs(const FCAttrs& attrs, const PostOps& postOps, const MemoryArgs& memory, @@ -211,33 +214,30 @@ static DnnlPrimitiveAttrs createPrimitiveAttrs(const FCAttrs& attrs, const auto& dstDesc = memory.at(ARG_DST)->getDescPtr(); const auto& originalDims = dstDesc->getShape().getMinDims(); - const auto& dims = normalizeDimsTo2D(originalDims); + const auto& dims = reshapeDownToRank<2>(originalDims); auto isINT8 = one_of(srcDesc->getPrecision(), ov::element::u8, ov::element::i8) && weiDesc->getPrecision() == ov::element::i8; auto outputDataType = DnnlExtensionUtils::ElementTypeToDataType(dstDesc->getPrecision()); - DnnlPostOpsComposer dnnlpoc(postOps, - context->getEngine(), - dims, - dims.size() - 1, - isINT8, - 1 << 0, - attrs.dequantizationScales, - !memory.at(ARG_BIAS)->getDesc().empty(), - outputDataType); - - if (attrs.decompressionMultiplyPtr) { - auto dstPrc = attrs.decompressionMultiplyPtr->getPrecision(); + DnnlPostOpsComposer + dnnlpoc(postOps, context->getEngine(), dims, dims.size() - 1, isINT8, 1 << 0, memory, outputDataType); + + if (memory.count(ARG_WEI | ARG_ATTR_SCALES)) { + auto dstPrc = memory.at(ARG_WEI | ARG_ATTR_SCALES)->getPrecision(); if (dstPrc != f8e8m0 || useDynamicQuantization) dstPrc = ov::element::f32; - dnnlpoc.appendDecompressionScales(attrs.decompressionMultiplyPtr, !attrs.weightsNonTransposed, dstPrc); + dnnlpoc.appendDecompressionScales(memory.at(ARG_WEI | ARG_ATTR_SCALES), !attrs.weightsNonTransposed, dstPrc); } - if (attrs.decompressionSubtractPtr) { + + if (memory.count(ARG_WEI | 
ARG_ATTR_ZERO_POINTS)) { auto dstPrc = useDynamicQuantization ? ov::element::u8 : ov::element::f32; - dnnlpoc.appendDecompressionZeroPoints(attrs.decompressionSubtractPtr, !attrs.weightsNonTransposed, dstPrc); + dnnlpoc.appendDecompressionZeroPoints(memory.at(ARG_WEI | ARG_ATTR_ZERO_POINTS), + !attrs.weightsNonTransposed, + dstPrc); } + if (useDynamicQuantization) { auto wei_precision = weiDesc->getPrecision(); bool is_symmetric_weights = (wei_precision == ov::element::i8) || (wei_precision == ov::element::i4); @@ -261,7 +261,7 @@ static dnnl::memory::desc normalizeDescriptor(const dnnl::memory::desc& desc) { const auto& dims = desc.get_dims(); if (dims.size() > 2) - return desc.reshape(normalizeDimsTo2D(dims)); + return desc.reshape(reshapeDownToRank<2>(dims)); return desc; } @@ -276,12 +276,13 @@ static dnnl::inner_product_forward::primitive_desc createDescriptorInternal(cons const bool useWeightsDecompression) { const auto normalizedInputDesc = normalizeDescriptor(inputDesc); const auto normalizedOutputDesc = normalizeDescriptor(outputDesc); + const auto normalizedWeightDesc = normalizeDescriptor(weightDesc); const auto indt = normalizedInputDesc.get_data_type(); auto wdt = indt; if (useWeightsDecompression) { - wdt = weightDesc.get_data_type(); + wdt = normalizedWeightDesc.get_data_type(); // dynamic quantization with symmetric quantized weights needs unsigned weights uint64_t dynQuantGroupSize = 0; @@ -297,8 +298,8 @@ static dnnl::inner_product_forward::primitive_desc createDescriptorInternal(cons } const dnnl::memory::desc weightsDesc = - useSparseWeights ? dnnl::memory::desc().sparse_desc(weightDesc.get_dims(), wdt) - : dnnl::memory::desc(weightDesc.get_dims(), wdt, memory::format_tag::any); + useSparseWeights ? dnnl::memory::desc().sparse_desc(normalizedWeightDesc.get_dims(), wdt) + : dnnl::memory::desc(normalizedWeightDesc.get_dims(), wdt, memory::format_tag::any); return dnnl::inner_product_forward::primitive_desc(engine, dnnl::prop_kind::forward_inference, @@ -387,8 +388,7 @@ DnnlShapeAgnosticDataPtr DnnlFCPrimitive::createShapeAgnosticData(const FCAttrs& useWeightsDecompression && useDynamicQuantizationImpl(attrs.dynamicQuantizationGroupSize, srcDesc, weiDesc, - attrs.decompressionMultiplyPtr, - attrs.decompressionSubtractPtr, + memory, !attrs.weightsNonTransposed); const auto postOpData = createPrimitiveAttrs(attrs, postOps, memory, context, useDynamicQuantization); diff --git a/src/plugins/intel_cpu/src/nodes/executors/dnnl/dnnl_fullyconnected_primitive.hpp b/src/plugins/intel_cpu/src/nodes/executors/dnnl/dnnl_fullyconnected_primitive.hpp index 5295b9655066cc..21247f149ca69f 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/dnnl/dnnl_fullyconnected_primitive.hpp +++ b/src/plugins/intel_cpu/src/nodes/executors/dnnl/dnnl_fullyconnected_primitive.hpp @@ -75,13 +75,6 @@ class DnnlFCPrimitive { const DnnlShapeAgnosticDataPtr& shapeAgnosticData); private: - static bool useDynamicQuantizationImpl(size_t dqGroupSize, - const MemoryDescPtr srcDesc, - const MemoryDescPtr weightsDesc, - MemoryCPtr scalesPtr, - MemoryCPtr zpPtr, - bool needTranspose); - dnnl::stream m_stream; dnnl::primitive_desc m_primDesc; impl_desc_type m_implType; diff --git a/src/plugins/intel_cpu/src/nodes/executors/dnnl/dnnl_matmul_primitive.cpp b/src/plugins/intel_cpu/src/nodes/executors/dnnl/dnnl_matmul_primitive.cpp index 1b8646c858e532..86b22607111833 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/dnnl/dnnl_matmul_primitive.cpp +++ 
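The change above moves the weight decompression scales and zero points out of FCAttrs: the primitive now looks them up in the MemoryArgs map under composite ids (ARG_WEI | ARG_ATTR_SCALES, ARG_WEI | ARG_ATTR_ZERO_POINTS) and tolerates missing entries. A minimal stand-alone sketch of that lookup pattern follows; the Memory/MemoryArgs stand-ins and the numeric ARG_* values are illustrative, not the plugin's real definitions.

#include <iostream>
#include <memory>
#include <unordered_map>

// Stand-ins for the plugin types; the ARG_* values are illustrative only.
struct Memory {};
using MemoryCPtr = std::shared_ptr<const Memory>;
using MemoryArgs = std::unordered_map<int, MemoryCPtr>;

constexpr int ARG_WEI = 1 << 3;
constexpr int ARG_ATTR_SCALES = 1 << 8;
constexpr int ARG_ATTR_ZERO_POINTS = 1 << 9;

// Optional attribute memory is present only if the node attached it, so the
// lookup has to tolerate a missing key (same shape as the code in the diff).
MemoryCPtr getOptionalInput(const MemoryArgs& memory, int key) {
    auto it = memory.find(key);
    return it == memory.end() ? nullptr : it->second;
}

int main() {
    MemoryArgs memory;
    memory[ARG_WEI | ARG_ATTR_SCALES] = std::make_shared<const Memory>();

    const auto scales = getOptionalInput(memory, ARG_WEI | ARG_ATTR_SCALES);
    const auto zeroPoints = getOptionalInput(memory, ARG_WEI | ARG_ATTR_ZERO_POINTS);

    std::cout << "scales attached: " << (scales != nullptr)
              << ", zero points attached: " << (zeroPoints != nullptr) << "\n";
}
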
b/src/plugins/intel_cpu/src/nodes/executors/dnnl/dnnl_matmul_primitive.cpp @@ -27,6 +27,7 @@ #include "nodes/executors/fullyconnected_config.hpp" #include "nodes/executors/matmul_config.hpp" #include "nodes/executors/memory_arguments.hpp" +#include "utils/cpu_utils.hpp" #include "utils/debug_capabilities.h" namespace ov { @@ -104,10 +105,10 @@ DnnlMemoryDescPtr DnnlMatMulPrimitive::makeTransposedWeightDescriptor(const Dnnl const auto& weiDesc = srcDesc->getDnnlDesc(); auto wDims = weiDesc.get_dims(); auto wDataType = weiDesc.get_data_type(); - std::swap(wDims[wDims.size() - 1], wDims[wDims.size() - 2]); + dnnl::memory::dims wDims2D = reshapeDownToRank<2>(wDims); const auto format = weightsNonTransposed ? dnnl::memory::format_tag::ab : dnnl::memory::format_tag::ba; - const auto transposedWeiDesc = dnnl::memory::desc{wDims, wDataType, format}; + const auto transposedWeiDesc = dnnl::memory::desc{wDims2D, wDataType, format}; return DnnlExtensionUtils::makeDescriptor(transposedWeiDesc); } @@ -128,15 +129,8 @@ static DnnlPrimitiveAttrs createPrimitiveAttrs(const MatMulAttrs& attrs, one_of(srcDesc->getPrecision(), ov::element::u8, ov::element::i8) && weiDesc->getPrecision() == ov::element::i8; auto outputDataType = DnnlExtensionUtils::ElementTypeToDataType(dstDesc->getPrecision()); - DnnlPostOpsComposer dnnlpoc(postOps, - context->getEngine(), - dims, - dims.size() - 1, - isINT8, - 1 << 0, - attrs.dequantizationScales, - !memory.at(ARG_BIAS)->getDesc().empty(), - outputDataType); + DnnlPostOpsComposer + dnnlpoc(postOps, context->getEngine(), dims, dims.size() - 1, isINT8, 1 << 0, memory, outputDataType); return dnnlpoc.compose(); } @@ -185,8 +179,7 @@ static dnnl::matmul::primitive_desc createDescriptorInternal(const dnnl::memory: wdt = memory::data_type::s8; } - const dnnl::memory::desc weightsDesc = - dnnl::memory::desc(weiDims, wdt, memory::format_tag::any); + const dnnl::memory::desc weightsDesc = dnnl::memory::desc(weiDims, wdt, memory::format_tag::any); return dnnl::matmul::primitive_desc(engine, inputsDesc, weightsDesc, newBiasDesc, outputsDesc, attr); } @@ -262,7 +255,7 @@ DnnlShapeAgnosticDataPtr DnnlMatMulPrimitive::createShapeAgnosticData(const FCAt const auto& weiDesc = memory.at(ARG_WEI)->getDescPtr(); const auto& biasDesc = memory.at(ARG_BIAS)->getDescPtr(); auto dstDesc = memory.at(ARG_DST)->getDescPtr(); - MatMulAttrs mmAttrs{false, false, attrs.dequantizationScales}; + MatMulAttrs mmAttrs{false, false}; const auto postOpData = createPrimitiveAttrs(mmAttrs, postOps, memory, context, false); @@ -335,7 +328,8 @@ DnnlMatMulPrimitive::DnnlMatMulPrimitive(const Key& key, m_prim(primitive(m_primDesc)) {} void DnnlMatMulPrimitive::execute(const dnnl_primitive_args& primArgs) const { - std::cout << "Executing MM primitive" << "\n"; + std::cout << "Executing MM primitive" + << "\n"; m_prim.execute(m_stream, primArgs); } diff --git a/src/plugins/intel_cpu/src/nodes/executors/dnnl/dnnl_shape_agnostic_data.hpp b/src/plugins/intel_cpu/src/nodes/executors/dnnl/dnnl_shape_agnostic_data.hpp index d76e1984bd87d9..6a1b128be307ce 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/dnnl/dnnl_shape_agnostic_data.hpp +++ b/src/plugins/intel_cpu/src/nodes/executors/dnnl/dnnl_shape_agnostic_data.hpp @@ -12,8 +12,7 @@ namespace ov { namespace intel_cpu { struct DnnlShapeAgnosticData { - DnnlShapeAgnosticData(DnnlPrimitiveAttrs primAttrs) - : primAttrs(std::move(primAttrs)) {} + DnnlShapeAgnosticData(DnnlPrimitiveAttrs primAttrs) : primAttrs(std::move(primAttrs)) {} DnnlPrimitiveAttrs primAttrs; }; diff 
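Both the fullyconnected and matmul primitives now collapse multi-dimensional shapes with reshapeDownToRank<2> from utils/cpu_utils.hpp instead of the removed normalizeDimsTo2D helper. A small sketch of the collapsing behaviour, under the assumption that reshapeDownToRank<2> yields the same {product of leading dims, last dim} result as the helper it replaces:

#include <cstdint>
#include <functional>
#include <iostream>
#include <numeric>
#include <vector>

// Modelled on the removed normalizeDimsTo2D helper; the real
// reshapeDownToRank<2> lives in utils/cpu_utils.hpp and is assumed here to
// produce the same {prod(leading dims), last dim} result.
std::vector<int64_t> reshapeDownToRank2(const std::vector<int64_t>& dims) {
    if (dims.size() <= 2)
        return dims;
    const int64_t collapsed =
        std::accumulate(dims.begin(), dims.end() - 1, int64_t{1}, std::multiplies<int64_t>());
    return {collapsed, dims.back()};
}

int main() {
    // A batched FC output of shape [2, 3, 8] is treated as a 2D [6, 8] problem.
    for (int64_t d : reshapeDownToRank2({2, 3, 8}))
        std::cout << d << ' ';
    std::cout << '\n';  // prints: 6 8
}
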
--git a/src/plugins/intel_cpu/src/nodes/executors/dnnl/dnnl_utils.cpp b/src/plugins/intel_cpu/src/nodes/executors/dnnl/dnnl_utils.cpp index fa273ac3d6c3ff..f23fd317d3546d 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/dnnl/dnnl_utils.cpp +++ b/src/plugins/intel_cpu/src/nodes/executors/dnnl/dnnl_utils.cpp @@ -8,8 +8,8 @@ #include #include "cpu_memory.h" -#include "memory_desc/dnnl_memory_desc.h" #include "memory_desc/cpu_memory_desc_utils.h" +#include "memory_desc/dnnl_memory_desc.h" #include "nodes/executors/executor.hpp" #include "nodes/reorder.h" #include "utils/cpu_utils.hpp" @@ -79,9 +79,9 @@ MemoryPtr prepareWeightsMemory(const DnnlMemoryDescPtr srcWeightDesc, auto globalWeightCache = context->getWeightsCache(); MemoryPtr ptr; - if (globalWeightCache && - dnnl::memory::format_kind::blocked == dstWeightDesc->getDnnlDesc().get_format_kind()) { - ptr = *globalWeightCache->findOrCreate(DnnlExtensionUtils::computeWeightsStringHash(weightsMem, dstWeightDesc), create); + if (globalWeightCache && dnnl::memory::format_kind::blocked == dstWeightDesc->getDnnlDesc().get_format_kind()) { + ptr = *globalWeightCache->findOrCreate(DnnlExtensionUtils::computeWeightsStringHash(weightsMem, dstWeightDesc), + create); } else { ptr = create(); } diff --git a/src/plugins/intel_cpu/src/nodes/executors/eltwise.cpp b/src/plugins/intel_cpu/src/nodes/executors/eltwise.cpp index 12bce382424e5c..8e7c470984b4f2 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/eltwise.cpp +++ b/src/plugins/intel_cpu/src/nodes/executors/eltwise.cpp @@ -9,5 +9,5 @@ namespace intel_cpu { EltwiseExecutor::EltwiseExecutor(const ExecutorContext::CPtr context) : context(context) {} -} // namespace intel_cpu -} // namespace ov \ No newline at end of file +} // namespace intel_cpu +} // namespace ov \ No newline at end of file diff --git a/src/plugins/intel_cpu/src/nodes/executors/eltwise.hpp b/src/plugins/intel_cpu/src/nodes/executors/eltwise.hpp index 4b1271c49d5df0..b33c0eca10dae7 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/eltwise.hpp +++ b/src/plugins/intel_cpu/src/nodes/executors/eltwise.hpp @@ -5,8 +5,8 @@ #pragma once #include "cpu_memory.h" -#include "onednn/iml_type_mapper.h" #include "executor.hpp" +#include "onednn/iml_type_mapper.h" namespace ov { namespace intel_cpu { @@ -19,10 +19,7 @@ struct EltwiseData { float gamma; bool operator==(const EltwiseData& rhs) const noexcept { - return algo == rhs.algo && - onednnAlgorithm == rhs.onednnAlgorithm && - alpha == rhs.alpha && - beta == rhs.beta && + return algo == rhs.algo && onednnAlgorithm == rhs.onednnAlgorithm && alpha == rhs.alpha && beta == rhs.beta && gamma == rhs.gamma; } }; @@ -34,24 +31,21 @@ struct EltwiseAttrs { float gamma; EltwiseAttrs() : algorithm(Algorithm::Default), alpha(0), beta(0), gamma(0) {} - EltwiseAttrs(Algorithm algorithm, float alpha, float beta, float gamma) : algorithm(algorithm), alpha(alpha), beta(beta), gamma(gamma) {} + EltwiseAttrs(Algorithm algorithm, float alpha, float beta, float gamma) + : algorithm(algorithm), + alpha(alpha), + beta(beta), + gamma(gamma) {} bool operator==(const EltwiseAttrs& rhs) const { bool retVal = true; - retVal = algorithm == rhs.algorithm && - alpha == rhs.alpha && - beta == rhs.beta && - gamma == rhs.gamma; + retVal = algorithm == rhs.algorithm && alpha == rhs.alpha && beta == rhs.beta && gamma == rhs.gamma; return retVal; } }; -enum class EltwisePostOpType { - Undefined, - Eltwise, - Dnnl -}; +enum class EltwisePostOpType { Undefined, Eltwise, Dnnl }; class EltwisePostOp { public: @@ -72,17 
+66,20 @@ class EltwisePostOp { EltwisePostOpType type = EltwisePostOpType::Undefined; - bool operator==(const EltwisePostOp &rhs) const { - if (type != rhs.type) { return false; } + bool operator==(const EltwisePostOp& rhs) const { + if (type != rhs.type) { + return false; + } bool ret = true; switch (type) { - case EltwisePostOpType::Eltwise: - ret = eltwise == rhs.eltwise; - break; - case EltwisePostOpType::Dnnl: - ret = dnnlPostOps == rhs.dnnlPostOps; - break; - default: assert(!"unsupported eltwise post operation type"); + case EltwisePostOpType::Eltwise: + ret = eltwise == rhs.eltwise; + break; + case EltwisePostOpType::Dnnl: + ret = dnnlPostOps == rhs.dnnlPostOps; + break; + default: + assert(!"unsupported eltwise post operation type"); } return ret; } @@ -96,7 +93,9 @@ class EltwiseExecutor { const std::vector& dstDescs, const std::vector& postOps) = 0; - virtual void exec(const std::vector& src, const std::vector& dst, const void *post_ops_data_) = 0; + virtual void exec(const std::vector& src, + const std::vector& dst, + const void* post_ops_data_) = 0; virtual ~EltwiseExecutor() = default; virtual impl_desc_type getImplType() const = 0; @@ -121,5 +120,5 @@ class EltwiseExecutorBuilder { using EltwiseExecutorBuilderPtr = std::shared_ptr; using EltwiseExecutorBuilderCPtr = std::shared_ptr; -} // namespace intel_cpu -} // namespace ov \ No newline at end of file +} // namespace intel_cpu +} // namespace ov \ No newline at end of file diff --git a/src/plugins/intel_cpu/src/nodes/executors/eltwise_list.cpp b/src/plugins/intel_cpu/src/nodes/executors/eltwise_list.cpp index 1bd6647310d387..5b9479bdf502b6 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/eltwise_list.cpp +++ b/src/plugins/intel_cpu/src/nodes/executors/eltwise_list.cpp @@ -10,11 +10,10 @@ namespace intel_cpu { const std::vector& getEltwiseExecutorsList() { static std::vector descs = { OV_CPU_INSTANCE_ACL(ExecutorType::Acl, std::make_shared()) - OV_CPU_INSTANCE_SHL(ExecutorType::Shl, std::make_shared()) - }; + OV_CPU_INSTANCE_SHL(ExecutorType::Shl, std::make_shared())}; return descs; } -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/executors/eltwise_list.hpp b/src/plugins/intel_cpu/src/nodes/executors/eltwise_list.hpp index 618e3499dc10a7..ac5c27c0ad36dc 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/eltwise_list.hpp +++ b/src/plugins/intel_cpu/src/nodes/executors/eltwise_list.hpp @@ -4,19 +4,18 @@ #pragma once -#include "executor.hpp" - #include "eltwise.hpp" +#include "executor.hpp" #if defined(OV_CPU_WITH_ACL) -#include "aarch64/jit_eltwise.hpp" -#include "acl/acl_eltwise.hpp" +# include "aarch64/jit_eltwise.hpp" +# include "acl/acl_eltwise.hpp" #endif #if defined(OV_CPU_WITH_SHL) -#include "shl/shl_eltwise.hpp" +# include "shl/shl_eltwise.hpp" #endif -#include "onednn/iml_type_mapper.h" #include "common/primitive_cache.hpp" +#include "onednn/iml_type_mapper.h" namespace ov { namespace intel_cpu { @@ -31,9 +30,10 @@ const std::vector& getEltwiseExecutorsList(); class EltwiseExecutorFactory : public ExecutorFactoryLegacy { public: EltwiseExecutorFactory(const EltwiseAttrs& eltwiseAttrs, - const std::vector& srcDescs, - const std::vector& dstDescs, - const ExecutorContext::CPtr context) : ExecutorFactoryLegacy(context) { + const std::vector& srcDescs, + const std::vector& dstDescs, + const ExecutorContext::CPtr context) + : ExecutorFactoryLegacy(context) { for (auto& desc : getEltwiseExecutorsList()) { if 
(desc.builder->isSupported(eltwiseAttrs, srcDescs, dstDescs)) { supportedDescs.push_back(desc); @@ -43,9 +43,9 @@ class EltwiseExecutorFactory : public ExecutorFactoryLegacy { ~EltwiseExecutorFactory() = default; virtual EltwiseExecutorPtr makeExecutor(const EltwiseAttrs& eltwiseAttrs, - const std::vector& srcDescs, - const std::vector& dstDescs, - const std::vector& postOps) { + const std::vector& srcDescs, + const std::vector& dstDescs, + const std::vector& postOps) { auto build = [&](const EltwiseExecutorDesc* desc) { auto executor = desc->builder->makeExecutor(context); if (executor->init(eltwiseAttrs, srcDescs, dstDescs, postOps)) { @@ -84,5 +84,5 @@ class EltwiseExecutorFactory : public ExecutorFactoryLegacy { using EltwiseExecutorFactoryPtr = std::shared_ptr; using EltwiseExecutorFactoryCPtr = std::shared_ptr; -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/executors/executor.cpp b/src/plugins/intel_cpu/src/nodes/executors/executor.cpp index 236f51c6d16149..399dab3d5499b9 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/executor.cpp +++ b/src/plugins/intel_cpu/src/nodes/executors/executor.cpp @@ -2,15 +2,17 @@ // SPDX-License-Identifier: Apache-2.0 // -#include - #include "executor.hpp" +#include + namespace ov { namespace intel_cpu { std::string ExecutorTypeToString(const ExecutorType type) { -#define CASE(_type) case ExecutorType::_type: return #_type; +#define CASE(_type) \ + case ExecutorType::_type: \ + return #_type; switch (type) { CASE(Undefined); CASE(Graph); @@ -27,7 +29,10 @@ std::string ExecutorTypeToString(const ExecutorType type) { } ExecutorType ExecutorTypeFromString(const std::string& typeStr) { -#define CASE(_type) if (typeStr == #_type) { return ExecutorType::_type; } +#define CASE(_type) \ + if (typeStr == #_type) { \ + return ExecutorType::_type; \ + } CASE(Undefined); CASE(Graph); CASE(Common); @@ -41,5 +46,5 @@ ExecutorType ExecutorTypeFromString(const std::string& typeStr) { return ExecutorType::Undefined; } -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/executors/executor.hpp b/src/plugins/intel_cpu/src/nodes/executors/executor.hpp index 5b9df5a6e77a55..16a419c95d5efc 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/executor.hpp +++ b/src/plugins/intel_cpu/src/nodes/executors/executor.hpp @@ -4,15 +4,15 @@ #pragma once -#include "openvino/core/except.hpp" -#include "openvino/core/visibility.hpp" #include #include "cache/multi_cache.h" #include "cpu_memory.h" #include "graph_context.h" -#include "onednn/iml_type_mapper.h" #include "memory_arguments.hpp" +#include "onednn/iml_type_mapper.h" +#include "openvino/core/except.hpp" +#include "openvino/core/visibility.hpp" namespace ov { namespace intel_cpu { @@ -24,25 +24,25 @@ namespace intel_cpu { #endif #if defined(OV_CPU_WITH_ACL) -# if defined(OPENVINO_ARCH_ARM) -# define OV_CPU_INSTANCE_ACL32(...) {__VA_ARGS__}, -# else -# define OV_CPU_INSTANCE_ACL32(...) -# endif -# if defined(OPENVINO_ARCH_ARM64) -# define OV_CPU_INSTANCE_ACL64(...) {__VA_ARGS__}, -# else -# define OV_CPU_INSTANCE_ACL64(...) -# endif -# if defined(OPENVINO_ARCH_ARM) || defined(OPENVINO_ARCH_ARM64) -# define OV_CPU_INSTANCE_ACL(...) {__VA_ARGS__}, -# else -# define OV_CPU_INSTANCE_ACL(...) -# endif +# if defined(OPENVINO_ARCH_ARM) +# define OV_CPU_INSTANCE_ACL32(...) {__VA_ARGS__}, +# else +# define OV_CPU_INSTANCE_ACL32(...) 
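The OV_CPU_INSTANCE_* macros being re-indented here expand either to a braced initializer followed by a comma or to nothing, so descriptor lists such as getEltwiseExecutorsList() simply lose the entries for backends that are not compiled in. A self-contained sketch of that pattern with illustrative DEMO_* names:

#include <iostream>
#include <string>
#include <vector>

// Simplified model of the OV_CPU_INSTANCE_* pattern from executor.hpp:
// enabled backend -> the macro emits "{args...}," -> the entry exists;
// disabled backend -> the macro emits nothing -> the entry vanishes.
#define DEMO_WITH_ACL 1

#if DEMO_WITH_ACL
#    define DEMO_INSTANCE_ACL(...) {__VA_ARGS__},
#else
#    define DEMO_INSTANCE_ACL(...)
#endif

// SHL deliberately disabled in this sketch.
#define DEMO_INSTANCE_SHL(...)

struct ExecutorDesc {
    std::string type;
    std::string name;
};

int main() {
    // Mirrors how getEltwiseExecutorsList() builds its static descriptor vector.
    static const std::vector<ExecutorDesc> descs = {
        DEMO_INSTANCE_ACL("Acl", "acl_eltwise")
        DEMO_INSTANCE_SHL("Shl", "shl_eltwise")
    };
    for (const auto& d : descs)
        std::cout << d.type << ": " << d.name << '\n';  // only the ACL entry prints
}
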
+# endif +# if defined(OPENVINO_ARCH_ARM64) +# define OV_CPU_INSTANCE_ACL64(...) {__VA_ARGS__}, +# else +# define OV_CPU_INSTANCE_ACL64(...) +# endif +# if defined(OPENVINO_ARCH_ARM) || defined(OPENVINO_ARCH_ARM64) +# define OV_CPU_INSTANCE_ACL(...) {__VA_ARGS__}, +# else +# define OV_CPU_INSTANCE_ACL(...) +# endif #else -# define OV_CPU_INSTANCE_ACL32(...) -# define OV_CPU_INSTANCE_ACL64(...) -# define OV_CPU_INSTANCE_ACL(...) +# define OV_CPU_INSTANCE_ACL32(...) +# define OV_CPU_INSTANCE_ACL64(...) +# define OV_CPU_INSTANCE_ACL(...) #endif #if defined(OV_CPU_WITH_DNNL) @@ -72,28 +72,11 @@ namespace intel_cpu { #define OV_CPU_INSTANCE_COMMON(...) {__VA_ARGS__}, // @todo another option is to determine shape relation by executor type -enum class ShapeTolerance { - Agnostic, - Dependant -}; +enum class ShapeTolerance { Agnostic, Dependant }; -enum class ExecutorType { - Undefined, - Graph, - Common, - jit_x64, - Dnnl, - Acl, - Mlas, - jit_aarch64, - Shl -}; +enum class ExecutorType { Undefined, Graph, Common, jit_x64, Dnnl, Acl, Mlas, jit_aarch64, Shl }; -enum class OperationType { - FullyConnected, - MatMul, - Convolution -}; +enum class OperationType { FullyConnected, MatMul, Convolution }; std::string ExecutorTypeToString(const ExecutorType type); ExecutorType ExecutorTypeFromString(const std::string& typeStr); @@ -112,8 +95,10 @@ class ExecutorContext { engine(graphContext->getEngine()), implPriorities(implPriorities), privateWeighCache(std::move(privateWeighCache)), - numNumaNodes(graphContext->getNumNumaNodes()) - {} + numNumaNodes(graphContext->getNumNumaNodes()) { + auto cpuStreamsExecutor = graphContext->getCPUStreamExecutor(); + curNumaNodeId = std::max(0, cpuStreamsExecutor ? cpuStreamsExecutor->get_numa_node_id() : curNumaNodeId); + } MultiCachePtr getRuntimeCache() const { auto runtimeCachePtr = runtimeCache.lock(); @@ -121,12 +106,8 @@ class ExecutorContext { return runtimeCachePtr; } - DnnlScratchPadPtr getScratchPad(int subStreamID = 0) const { - if (subStreamID < 0) - subStreamID = 0; - if (subStreamID >= numNumaNodes - 1) - subStreamID = numNumaNodes - 1; - return scratchPads[subStreamID]; + DnnlScratchPadPtr getScratchPad() const { + return scratchPads[curNumaNodeId]; } std::shared_ptr> getPrivateWeighCache() const { @@ -156,6 +137,7 @@ class ExecutorContext { // @todo remove after global cache is used exclusevly std::shared_ptr> privateWeighCache; int numNumaNodes; + int curNumaNodeId = -1; }; class ExecutorFactoryLegacy { diff --git a/src/plugins/intel_cpu/src/nodes/executors/executor_config.hpp b/src/plugins/intel_cpu/src/nodes/executors/executor_config.hpp index 09b3b33cfe6b2f..cd9bcaf7a119f7 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/executor_config.hpp +++ b/src/plugins/intel_cpu/src/nodes/executors/executor_config.hpp @@ -4,9 +4,8 @@ #pragma once -#include "post_ops.hpp" #include "memory_arguments.hpp" -#include "printers.hpp" +#include "post_ops.hpp" namespace ov { namespace intel_cpu { diff --git a/src/plugins/intel_cpu/src/nodes/executors/executor_factory.hpp b/src/plugins/intel_cpu/src/nodes/executors/executor_factory.hpp index 419ab4abf52cd7..dd05cc58d43c32 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/executor_factory.hpp +++ b/src/plugins/intel_cpu/src/nodes/executors/executor_factory.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018-2022 Intel Corporation +// Copyright (C) 2018-2024 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -6,50 +6,21 @@ #include #include -#include #include "executor.hpp" -#include 
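ExecutorContext now resolves the current NUMA node once, in its constructor, from the CPU streams executor, and getScratchPad() indexes the per-node scratchpad list with it instead of taking a sub-stream id from the caller. A minimal stand-alone model of that selection logic, assuming one scratchpad per NUMA node as before; the names here are illustrative:

#include <algorithm>
#include <iostream>
#include <memory>
#include <vector>

// Stand-ins for the plugin types; only the selection logic matters here.
struct ScratchPad {
    int numaNode;
};
using ScratchPadPtr = std::shared_ptr<ScratchPad>;

class ContextSketch {
public:
    // streamNumaNode == -1 models "no CPU streams executor available";
    // like the diff, the id is clamped to a valid index with std::max(0, ...).
    ContextSketch(int numNumaNodes, int streamNumaNode)
        : m_curNumaNodeId(std::max(0, streamNumaNode)) {
        for (int node = 0; node < numNumaNodes; ++node)
            m_scratchPads.push_back(std::make_shared<ScratchPad>(ScratchPad{node}));
    }

    ScratchPadPtr getScratchPad() const {
        return m_scratchPads[m_curNumaNodeId];
    }

private:
    std::vector<ScratchPadPtr> m_scratchPads;
    int m_curNumaNodeId;
};

int main() {
    ContextSketch onNode1(/*numNumaNodes=*/2, /*streamNumaNode=*/1);
    ContextSketch noExecutor(/*numNumaNodes=*/2, /*streamNumaNode=*/-1);
    std::cout << onNode1.getScratchPad()->numaNode << '\n';     // 1
    std::cout << noExecutor.getScratchPad()->numaNode << '\n';  // 0 (clamped)
}
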
"nodes/executors/implementations.hpp" #include "nodes/executors/executor_config.hpp" #include "nodes/executors/executor_implementation.hpp" #include "nodes/executors/graph_emitter.hpp" +#include "nodes/executors/implementations.hpp" #include "nodes/executors/memory_arguments.hpp" #include "nodes/executors/printers.hpp" -#include "openvino/core/except.hpp" +#include "nodes/executors/variable_executor.hpp" #include "post_ops.hpp" namespace ov { namespace intel_cpu { -using namespace executor; - -template -static ExecutorPtr fallback(const executor::Config& config, - const executor::Config& fallbackConfig, - const MemoryArgs& memory, - const ExecutorContext::CPtr context, - const std::string& name) { - DEBUG_LOG("Falling back to graph executor for ", - name, - ". Original config: ", - config, - " new config:", - fallbackConfig); - - GraphEmitter graphEmitter(config.descs, config.attrs, config.postOps, memory, context, name); - - const auto& graphExecutor = - graphEmitter.createGraph(fallbackConfig.descs, fallbackConfig.attrs, fallbackConfig.postOps, context) - .ensureAttrsMatch() - .ensureSrcDescsMatch() - .ensureDstDescsMatch() - .ensurePostOpsMatch() - .emit(); - (void)graphExecutor; - OPENVINO_THROW("Fallback logic is not implemented yet"); // return graphExecutor; -} - -template +template class ExecutorFactory { public: using ExecutorImplementationRef = std::reference_wrapper>; @@ -62,9 +33,7 @@ class ExecutorFactory { : m_attrs(attrs), m_postOps(postOps), m_context(context), - m_suitableImplementations(filter(m_attrs, m_postOps, descriptors, implementationPriority)), - m_implementationRequiresFallback(m_suitableImplementations.size(), true), - m_executors(m_suitableImplementations.size()) {} + m_suitableImplementations(filter(m_attrs, m_postOps, descriptors, implementationPriority)) {} /** * @brief Retrieves the proper memory descriptors based on the provided memory descriptors. @@ -95,104 +64,42 @@ class ExecutorFactory { } /** - * @brief Preconfigures an executor based on the provided memory arguments. - * - * Preconfigures an executor by selecting an appropriate implementation based on the provided - * memory arguments and by creating an executor using the implementation. - * - * @param memory The memory parameters used for selecting the appropriate executor implementation. - * - * @note The main use case is to offload executor data preparation (i.e. weights packing) - * From the make() call - * @todo Currently supports creating a single executor. - * For some nodes it can be worth to preconfigure all the executors. - */ - void preconfigure(const MemoryArgs& memory) { - executor::Config config{memoryDescsFromMemory(memory), m_attrs, m_postOps}; - - cacheFallbackStatus(config); - - const size_t implId = select(memory, 0); - const auto& impl = m_suitableImplementations[implId].get(); - DEBUG_LOG("Preconfiguring executor: ", impl.name()); - - if (m_implementationRequiresFallback[implId]) { - if (auto fallbackConfig = impl.requiresFallback(config)) { - fallback(config, *fallbackConfig, memory, m_context, impl.name()); - } - } - - (void)create(implId, memory, m_context); - } - - /** - * @brief Creates an Executor instance based on provided memory arguments. + * @brief Creates an Executor instance based on the provided memory arguments. * - * Creates an Executor instance using the provided MemoryArgs, selecting an appropriate implementation - * based on the characteristics of the memory. 
It handles fallback scenarios if necessary and updates the executor - * with the given memory information. + * Depending on the number of available implementations, returns: + * - VariableExecutor, if the number of implementations is two or more + * - Simple Executor, if there is only one available implementation * * @param memory memory arguments. * * @return A shared pointer to the created Executor. - * - * The function follows the steps below: - * - Selects an implementation based on the provided memory using the select() function. - * - Retrieves the selected implementation and checks if fallback is required. - * - If fallback is required, it creates a fallback configuration and returns a fallback executor. - * - Otherwise creates the executor using the selected implementation. - * - Updates the executor with the given memory information. - * */ - ExecutorPtr make(MemoryArgs& memory) { - auto createExec = [this](MemoryArgs& memory, size_t implId) -> ExecutorPtr { - const auto& impl = m_suitableImplementations[implId].get(); - if (m_implementationRequiresFallback[implId]) { - executor::Config config{memoryDescsFromMemory(memory), m_attrs, m_postOps}; - if (auto fallbackConfig = impl.requiresFallback(config)) { - return fallback(config, *fallbackConfig, memory, m_context, impl.name()); - } - } - const auto executor = create(implId, memory, m_context); - if (!executor->update(memory)) { - return nullptr; + ExecutorPtr make(const MemoryArgs& memory) { + // only single executor is available + if (m_suitableImplementations.size() == 1) { + auto config = GraphEmitter::createConfig(memory, m_attrs, m_postOps); + + const auto& theOnlyImplementation = m_suitableImplementations.front().get(); + + if (const auto fallbackConfig = theOnlyImplementation.requiresFallback(config)) { + return GraphEmitter::fallback(config, + *fallbackConfig, + memory, + m_context, + theOnlyImplementation.name()); } - return executor; - }; - - auto implId = select(memory, 0); - auto executor = createExec(memory, implId); - while (!executor) { - implId = select(memory, ++implId); - executor = createExec(memory, implId); - } - return executor; - } -private: - static MemoryDescArgs memoryDescsFromMemory(const MemoryArgs& memory) { - MemoryDescArgs memoryDescs; - memoryDescs.reserve(memory.size()); - - for (const auto& mem : memory) { - memoryDescs[mem.first] = mem.second->getDescPtr(); + return theOnlyImplementation.create(m_attrs, m_postOps, memory, m_context); } - return memoryDescs; - } - - /** - * @brief Caches the fallback status for each suitable implementation. - */ - void cacheFallbackStatus(const executor::Config& config) { - std::transform(m_suitableImplementations.begin(), - m_suitableImplementations.end(), - m_implementationRequiresFallback.begin(), - [&config](const ExecutorImplementationRef& impl) { - return impl.get().requiresFallback(config); - }); + return std::make_shared>(memory, + m_attrs, + m_postOps, + m_context, + m_suitableImplementations); } +private: /** * @brief Filters and retrieves suitable implementations based on the provided executor configuration. * @@ -205,11 +112,10 @@ class ExecutorFactory { * @note If an implementation is shape agnostic, no further implementations with lower * priority are considered. 
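The reworked make() reduces to two paths: with a single suitable implementation it creates that executor directly, routing through the graph fallback when the config requires it, otherwise it wraps all candidates in a VariableExecutor that selects at runtime. A simplified stand-alone sketch of that flow; the types and names below are illustrative, not the plugin's API:

#include <functional>
#include <iostream>
#include <memory>
#include <string>
#include <vector>

struct Executor {
    virtual ~Executor() = default;
    virtual std::string name() const = 0;
};
using ExecutorPtr = std::shared_ptr<Executor>;

struct Implementation {
    std::string name;
    bool requiresFallback;
    std::function<ExecutorPtr()> create;
};

struct SingleExecutor : Executor {
    explicit SingleExecutor(std::string n) : m_name(std::move(n)) {}
    std::string name() const override { return m_name; }
    std::string m_name;
};

struct VariableExecutorSketch : Executor {
    explicit VariableExecutorSketch(std::vector<Implementation> impls) : m_impls(std::move(impls)) {}
    std::string name() const override { return "variable(" + std::to_string(m_impls.size()) + " impls)"; }
    std::vector<Implementation> m_impls;
};

ExecutorPtr make(const std::vector<Implementation>& suitable) {
    if (suitable.size() == 1) {
        const auto& only = suitable.front();
        if (only.requiresFallback)
            // The real code calls GraphEmitter::fallback here (which currently throws).
            return std::make_shared<SingleExecutor>("graph_fallback_for_" + only.name);
        return only.create();
    }
    // Two or more candidates: defer the choice to a runtime-switching wrapper.
    return std::make_shared<VariableExecutorSketch>(suitable);
}

int main() {
    std::vector<Implementation> one = {
        {"dnnl_fc", false, [] { return std::make_shared<SingleExecutor>("dnnl_fc"); }}};
    std::vector<Implementation> two = one;
    two.push_back({"acl_fc", false, [] { return std::make_shared<SingleExecutor>("acl_fc"); }});

    std::cout << make(one)->name() << '\n';  // dnnl_fc
    std::cout << make(two)->name() << '\n';  // variable(2 impls)
}
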
*/ - static std::vector filter( - const Attrs& attrs, - const PostOps& postOps, - const MemoryDescArgs& descs, - const std::string& implementationPriority = {}) { + static std::vector filter(const Attrs& attrs, + const PostOps& postOps, + const MemoryDescArgs& descs, + const std::string& implementationPriority = {}) { const auto& implementations = getImplementations(); std::vector suitableImplementations; const executor::Config config{descs, attrs, postOps}; @@ -244,51 +150,17 @@ class ExecutorFactory { return suitableImplementations; } - size_t select(const MemoryArgs& memory, const size_t startIdx) const { - OPENVINO_ASSERT(startIdx < m_suitableImplementations.size(), - "Failed to find an implementation since start indx: ", startIdx, - " is out of range of the suitable implementations array: ", m_suitableImplementations.size()); - auto startIt = m_suitableImplementations.begin(); - std::advance(startIt, startIdx); - const auto selectedImplementation = - std::find_if(startIt, - m_suitableImplementations.end(), - [&memory](const ExecutorImplementationRef& implementation) { - return implementation.get().shapeAgnostic() || implementation.get().acceptsShapes(memory); - }); - OPENVINO_ASSERT(selectedImplementation != m_suitableImplementations.end(), "Failed to select an implemetation"); - - return std::distance(m_suitableImplementations.begin(), selectedImplementation); - } - - ExecutorPtr create(const size_t implId, - const MemoryArgs& memory, - const ExecutorContext::CPtr context) { - assert(implId < m_executors.size() && implId < m_suitableImplementations.size()); - - if (!m_executors[implId]) { - const auto& impl = m_suitableImplementations[implId].get(); - m_executors[implId] = impl.create(m_attrs, m_postOps, memory, context); - } - - return m_executors[implId]; - } - const Attrs& m_attrs; const PostOps& m_postOps; const ExecutorContext::CPtr m_context; std::vector m_suitableImplementations; - // stores fallback status to avoid performing the check for every make() call - std::vector m_implementationRequiresFallback; - // executors cache - std::vector m_executors; }; -template -using ExecutorFactoryPtr = std::shared_ptr>; +template +using ExecutorFactoryPtr = std::shared_ptr>; -template -using ExecutorFactoryCPtr = std::shared_ptr>; +template +using ExecutorFactoryCPtr = std::shared_ptr>; } // namespace intel_cpu } // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/executors/executor_implementation.hpp b/src/plugins/intel_cpu/src/nodes/executors/executor_implementation.hpp index 3459d1fe35e19e..07a58b0fa6cfa7 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/executor_implementation.hpp +++ b/src/plugins/intel_cpu/src/nodes/executors/executor_implementation.hpp @@ -19,22 +19,22 @@ template class ExecutorImplementation { public: using SupportsPredicate = std::function&)>; - using RequiresFallbackPredicate = std::function>(const executor::Config&)>; + using RequiresFallbackPredicate = + std::function>(const executor::Config&)>; using AcceptsShapePredicate = std::function; using CreateFunction = std::function; - ExecutorImplementation( - const char* name, - const ExecutorType type, - const OperationType operationType, - const ShapeTolerance shapeRelation, - SupportsPredicate supports, - RequiresFallbackPredicate requiresFallback, - AcceptsShapePredicate acceptsShape, - CreateFunction create) + ExecutorImplementation(const char* name, + const ExecutorType type, + const OperationType operationType, + const ShapeTolerance shapeRelation, + SupportsPredicate supports, + 
RequiresFallbackPredicate requiresFallback, + AcceptsShapePredicate acceptsShape, + CreateFunction create) : m_name(name), m_type(type), m_operationType(operationType), diff --git a/src/plugins/intel_cpu/src/nodes/executors/fullyconnected_config.hpp b/src/plugins/intel_cpu/src/nodes/executors/fullyconnected_config.hpp index ad6479597c6971..1699a845a3314b 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/fullyconnected_config.hpp +++ b/src/plugins/intel_cpu/src/nodes/executors/fullyconnected_config.hpp @@ -19,13 +19,8 @@ struct FCAttrs { bool withBias = false; bool weightsNonTransposed = false; bool sparseWeights = false; - // @todo only memory descriptors should be a part of attributes - // actual memory should be passed into "execute" or "prepareMemory" calls - std::vector dequantizationScales; - // @todo should be passed as an additional memory input? - MemoryCPtr decompressionSubtractPtr; - MemoryCPtr decompressionMultiplyPtr; uint64_t dynamicQuantizationGroupSize; + ov::intel_cpu::Config::ModelType modelType = ov::intel_cpu::Config::ModelType::Unknown; }; diff --git a/src/plugins/intel_cpu/src/nodes/executors/fullyconnected_implementations.cpp b/src/plugins/intel_cpu/src/nodes/executors/fullyconnected_implementations.cpp index 5834c3dda4b262..bc55af8cfbb0e2 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/fullyconnected_implementations.cpp +++ b/src/plugins/intel_cpu/src/nodes/executors/fullyconnected_implementations.cpp @@ -9,15 +9,17 @@ #include "debug_messages.hpp" #include "implementation_utils.hpp" #include "memory_desc/cpu_memory_desc.h" +#include "nodes/executors/common/common_utils.hpp" #include "nodes/executors/convolution_config.hpp" #include "nodes/executors/dnnl/dnnl_convolution_primitive.hpp" #include "nodes/executors/dnnl/dnnl_fullyconnected.hpp" +#include "nodes/executors/dnnl/dnnl_fullyconnected_primitive.hpp" #include "nodes/executors/dnnl/dnnl_matmul_primitive.hpp" #include "nodes/executors/dnnl/dnnl_shape_agnostic_data.hpp" #include "nodes/executors/executor.hpp" #include "nodes/executors/executor_implementation.hpp" -#include "nodes/executors/implementations.hpp" #include "nodes/executors/fullyconnected_config.hpp" +#include "nodes/executors/implementations.hpp" #include "nodes/executors/memory_arguments.hpp" #include "nodes/executors/mlas/mlas_gemm.hpp" #include "nodes/executors/precision_matcher.hpp" @@ -29,7 +31,8 @@ #include "utils/debug_capabilities.h" #if defined(OV_CPU_WITH_ACL) -#include "nodes/executors/acl/acl_fullyconnected.hpp" +# include "nodes/executors/acl/acl_fullyconnected.hpp" +# include "nodes/executors/acl/acl_lowp_fullyconnected.hpp" #endif #if defined(OV_CPU_WITH_SHL) @@ -49,7 +52,7 @@ using LayoutConfig = std::vector; static const LayoutConfig dnnlFCLayoutConfig{LayoutType::ncsp, LayoutType::ncsp, LayoutType::ncsp, LayoutType::ncsp}; static const LayoutConfig aclFCLayoutConfig{LayoutType::ncsp, LayoutType::ncsp, LayoutType::ncsp, LayoutType::ncsp}; -template +template struct Require { bool operator()() { return dnnl::impl::cpu::x64::mayiuse(ISA); @@ -88,6 +91,11 @@ static const TypeMapping aclFCTypeMapping { {{_any, _any, _any, _any}, pt(just(), just(), just(), just())} }; +static const TypeMapping aclLowpFCTypeMapping { + // {src, wei, bia, dst} pt + {{_i8, _i8, _any, _f32}, pt(bypass(), bypass(), use<3>(), bypass())} +}; + static const MappingNotation dnnlConvolutionMappingNotation { ARG_SRC, ARG_WEI, ARG_BIAS, ARG_DST }; @@ -143,10 +151,10 @@ static bool fullyMatchConfiguration(const MemoryDescArgs& currentDescriptors, continue; 
if (desc->getPrecision() != type) - return false; // type mismatch + return false; // type mismatch if (!desc->hasLayoutType(layoutConfig[i])) - return false; // layout mismatch + return false; // layout mismatch } return true; @@ -206,6 +214,8 @@ OV_CPU_MAYBE_UNUSED_FUNCTION static inline bool noPostOps(const FCConfig& config return config.postOps.empty(); } +// to keep OV_CPU_INSTANCE macros aligned +// clang-format off template <> const std::vector>& getImplementations() { static const std::vector> fullyconnectedImplementations { @@ -371,6 +381,38 @@ const std::vector>& getImplementations() { const ExecutorContext::CPtr context) { return std::make_shared(attrs, postOps, memory, context); }) + OV_CPU_INSTANCE_ACL( + "fullyconnected_acl_lowp", + ExecutorType::Acl, + OperationType::FullyConnected, + ShapeTolerance::Agnostic, + // supports + [](const FCConfig& config) -> bool { + VERIFY(noSparseDecompression(config), UNSUPPORTED_SPARSE_WEIGHTS); + VERIFY(noWeightsDecompression(config), UNSUPPORTED_WEIGHTS_DECOMPRESSION); + return ACLLowpFullyConnectedExecutor::supports(config); + }, + // requiresFallback + [](const FCConfig& config) -> ov::optional> { + return requiresFallbackCommon(config, + aclLowpFCTypeMapping, + aclFCLayoutConfig, + aclFullyConnectedMappingNotation); + }, + // acceptsShapes + [](const MemoryArgs& memory) -> bool { + const auto dequantizationScales = getDeQuantizedScales(memory); + bool isPerChannelQuantization = dequantizationScales.size() > 1; + // per-channel quantization is not unsupported by ACL + return !isPerChannelQuantization; + }, + // create + [](const FCAttrs& attrs, + const PostOps& postOps, + const MemoryArgs& memory, + const ExecutorContext::CPtr context) { + return std::make_shared(attrs, postOps, memory, context); + }) OV_CPU_INSTANCE_SHL( "fullyconnected_shl", ExecutorType::Shl, @@ -440,8 +482,7 @@ const std::vector>& getImplementations() { const ExecutorContext::CPtr context, std::shared_ptr shareAgnosticData) const { MatMulAttrs matMulAttrs{false, - false, - attrs.dequantizationScales}; + false}; auto primitive = DefaultInstantiator{}( memory, @@ -492,5 +533,7 @@ const std::vector>& getImplementations() { return fullyconnectedImplementations; } +// clang-format on + } // namespace intel_cpu } // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/executors/graph_emitter.hpp b/src/plugins/intel_cpu/src/nodes/executors/graph_emitter.hpp index 6aad18c793c8cf..347ac4c981f4f1 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/graph_emitter.hpp +++ b/src/plugins/intel_cpu/src/nodes/executors/graph_emitter.hpp @@ -5,12 +5,11 @@ #pragma once #include -#include #include "graph.h" -#include "memory_desc/cpu_memory_desc.h" #include "node.h" #include "nodes/executors/executor.hpp" +#include "nodes/executors/executor_config.hpp" #include "post_ops.hpp" namespace ov { @@ -72,6 +71,47 @@ class GraphEmitter { return graph; } + static MemoryDescArgs memoryDescsFromMemory(const MemoryArgs& memory) { + MemoryDescArgs memoryDescs; + memoryDescs.reserve(memory.size()); + + for (const auto& mem : memory) { + memoryDescs[mem.first] = mem.second->getDescPtr(); + } + + return memoryDescs; + } + + static executor::Config createConfig(const MemoryArgs& memory, const Attrs& attrs, const PostOps& postOps) { + return executor::Config{memoryDescsFromMemory(memory), attrs, postOps}; + } + + static ExecutorPtr fallback(const executor::Config& config, + const executor::Config& fallbackConfig, + const MemoryArgs& memory, + const ExecutorContext::CPtr context, + const 
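The acceptsShapes predicate added for the ACL low-precision fullyconnected executor rejects per-channel quantization, which ACL does not support: it extracts the dequantization scales from the memory arguments (getDeQuantizedScales in the real code) and accepts only the per-tensor case, i.e. a single scale. A minimal sketch of that check:

#include <iostream>
#include <vector>

// One dequantization scale -> per-tensor quantization -> accepted.
// Several scales -> per-channel quantization -> the ACL lowp impl is skipped.
bool acceptsShapes(const std::vector<float>& dequantizationScales) {
    const bool isPerChannelQuantization = dequantizationScales.size() > 1;
    return !isPerChannelQuantization;
}

int main() {
    std::cout << acceptsShapes({0.02f}) << '\n';                // 1: per-tensor, accepted
    std::cout << acceptsShapes({0.02f, 0.04f, 0.01f}) << '\n';  // 0: per-channel, rejected
}
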
std::string& name) { + DEBUG_LOG("Falling back to graph executor for ", + name, + ". Original config: ", + config, + " new config:", + fallbackConfig); + + GraphEmitter graphEmitter(config.descs, config.attrs, config.postOps, memory, context, name); + + const auto& graphExecutor = + graphEmitter.createGraph(fallbackConfig.descs, fallbackConfig.attrs, fallbackConfig.postOps, context) + .ensureAttrsMatch() + .ensureSrcDescsMatch() + .ensureDstDescsMatch() + .ensurePostOpsMatch() + .emit(); + (void)graphExecutor; + + OPENVINO_THROW("Fallback logic is not implemented yet"); // return graphExecutor; + } + private: const MemoryDescArgs& descs; const Attrs& attrs; diff --git a/src/plugins/intel_cpu/src/nodes/executors/implementation_utils.hpp b/src/plugins/intel_cpu/src/nodes/executors/implementation_utils.hpp index cd029283a09c50..bee82af305c9d2 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/implementation_utils.hpp +++ b/src/plugins/intel_cpu/src/nodes/executors/implementation_utils.hpp @@ -5,6 +5,7 @@ #pragma once #include + #include "cpu_types.h" #include "memory_desc/cpu_memory_desc.h" #include "nodes/executors/memory_arguments.hpp" @@ -13,80 +14,80 @@ namespace ov { namespace intel_cpu { -template +template ov::element::Type memoryDescType(const Config& config) { return config.descs.at(idx)->getPrecision(); } -template +template ov::element::Type srcType(const Config& config) { return memoryDescType(config); } -template +template ov::element::Type weiType(const Config& config) { return memoryDescType(config); } -template +template ov::element::Type biaType(const Config& config) { return memoryDescType(config); } -template +template ov::element::Type dstType(const Config& config) { return memoryDescType(config); } -template +template ov::element::Type dims(const Config& config) { return config.descs.at(idx)->getShape().getDims(); } -template +template const VectorDims& srcDims(const Config& config) { return dims(config); } -template +template const VectorDims& weiDims(const Config& config) { return dims(config); } -template +template size_t rank(const Config& config) { return config.descs.at(idx)->getShape().getRank(); } -template +template size_t srcRank(const Config& config) { return rank(config); } -template +template size_t weiRank(const Config& config) { return rank(config); } -template +template size_t memSize(const Config& config) { return config.descs.at(idx)->getCurrentMemSize(); } -template +template size_t srcMemSize(const Config& config) { return memSize(config); } -template +template size_t weiMemSize(const Config& config) { return memSize(config); } -template +template size_t postOpsNumbers(const Config& config) { return config.postOps.size(); } -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/executors/interpolate.cpp b/src/plugins/intel_cpu/src/nodes/executors/interpolate.cpp index d0a006b1bea0fa..cb830a36f03cb1 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/interpolate.cpp +++ b/src/plugins/intel_cpu/src/nodes/executors/interpolate.cpp @@ -3,18 +3,19 @@ // #include "interpolate.hpp" -#include "openvino/core/parallel.hpp" -#include "nodes/common/cpu_memcpy.h" + #include "emitters/plugin/x64/jit_load_store_emitters.hpp" +#include "nodes/common/cpu_memcpy.h" +#include "openvino/core/parallel.hpp" using namespace ov::intel_cpu; bool ov::intel_cpu::InterpolateExecutor::init(const InterpolateAttrs& interpolateAttrs, - const std::vector& srcDescs, - const std::vector& dstDescs, - 
const dnnl::primitive_attr &attr) { - const auto &srcDims = srcDescs[0]->getShape().getStaticDims(); - const auto &dstDims = dstDescs[0]->getShape().getStaticDims(); + const std::vector& srcDescs, + const std::vector& dstDescs, + const dnnl::primitive_attr& attr) { + const auto& srcDims = srcDescs[0]->getShape().getStaticDims(); + const auto& dstDims = dstDescs[0]->getShape().getStaticDims(); interpAttrs = interpolateAttrs; srcDimPad5d = to5Dim(getPaddedInputShape(srcDims, interpolateAttrs.padBegin, interpolateAttrs.padEnd)); dstDim5d = to5Dim(dstDims); @@ -24,38 +25,49 @@ bool ov::intel_cpu::InterpolateExecutor::init(const InterpolateAttrs& interpolat spatialDimSize = getSpatialDimsNum(dataRank); switch (interpAttrs.mode) { - case InterpolateMode::nearest: { - buildTblNN(srcDimPad5d, dstDim5d, interpAttrs.dataScales, interpolateAttrs.layout, interpolateAttrs.nearestMode); - break; - } - case InterpolateMode::linear_onnx: { - buildTblLinearOnnx(srcDimPad5d, dstDim5d, interpAttrs.dataScales, interpolateAttrs.layout); - break; - } - case InterpolateMode::linear: { - static constexpr int LINEAR_KERNEL = 2; - buildTblLinear(srcDimPad5d, dstDim5d, interpAttrs.dataScales, LINEAR_KERNEL, interpolateAttrs.antialias); - break; - } - case InterpolateMode::cubic: { - buildTblCubic(srcDimPad5d, dstDim5d, interpAttrs.dataScales, interpolateAttrs.cubeCoeff, interpolateAttrs.layout); - break; - } - default: { - OPENVINO_THROW("Interpolate executor does not support interpolate mode: ", interpAttrs.mode); - break; - } + case InterpolateMode::nearest: { + buildTblNN(srcDimPad5d, + dstDim5d, + interpAttrs.dataScales, + interpolateAttrs.layout, + interpolateAttrs.nearestMode); + break; + } + case InterpolateMode::linear_onnx: { + buildTblLinearOnnx(srcDimPad5d, dstDim5d, interpAttrs.dataScales, interpolateAttrs.layout); + break; + } + case InterpolateMode::linear: { + static constexpr int LINEAR_KERNEL = 2; + buildTblLinear(srcDimPad5d, dstDim5d, interpAttrs.dataScales, LINEAR_KERNEL, interpolateAttrs.antialias); + break; + } + case InterpolateMode::cubic: { + buildTblCubic(srcDimPad5d, + dstDim5d, + interpAttrs.dataScales, + interpolateAttrs.cubeCoeff, + interpolateAttrs.layout); + break; + } + default: { + OPENVINO_THROW("Interpolate executor does not support interpolate mode: ", interpAttrs.mode); + break; + } } return true; } // ===================================================================================================================== // index layout: // d_0............d_OD-1, h_0..............h_OH-1, w_0................w_OW-1 -void ov::intel_cpu::InterpolateExecutor::buildTblNN(const VectorDims& srcDimPad5d, const VectorDims& dstDim5d, - const std::vector& dataScales, InterpolateLayoutType layout, InterpolateNearestMode nearestMode) { +void ov::intel_cpu::InterpolateExecutor::buildTblNN(const VectorDims& srcDimPad5d, + const VectorDims& dstDim5d, + const std::vector& dataScales, + InterpolateLayoutType layout, + InterpolateNearestMode nearestMode) { const int dimSize = dataRank; float fz = (dimSize == 5) ? 
dataScales[dimSize - 3] : 1.f; - float fy = dataScales[dimSize - 2]; + float fy = dataScales[dimSize - 2]; float fx = dataScales[dimSize - 1]; size_t ID = srcDimPad5d[2], IH = srcDimPad5d[3], IW = srcDimPad5d[4]; size_t OD = dstDim5d[2], OH = dstDim5d[3], OW = dstDim5d[4]; @@ -84,80 +96,91 @@ void ov::intel_cpu::InterpolateExecutor::buildTblNN(const VectorDims& srcDimPad5 // scale is float(outShape) / float(inShape) // strictly consistent with onnx calc manner(div scale, not multiply inverse), given this is done offline // the slight precison diff can produce obvious wrong value due to "nearest round" behavior for NN mode -float ov::intel_cpu::InterpolateExecutor::coordTransToInput(int outCoord, float scale, int inShape, int outShape) const { +float ov::intel_cpu::InterpolateExecutor::coordTransToInput(int outCoord, + float scale, + int inShape, + int outShape) const { if (scale == 1.0f || (inShape == outShape)) { return outCoord; } switch (interpAttrs.coordTransMode) { - case InterpolateCoordTransMode::half_pixel: { + case InterpolateCoordTransMode::half_pixel: { + return (outCoord + 0.5f) / scale - 0.5f; + break; + } + case InterpolateCoordTransMode::pytorch_half_pixel: { + if (outShape > 1) return (outCoord + 0.5f) / scale - 0.5f; - break; - } - case InterpolateCoordTransMode::pytorch_half_pixel: { - if (outShape > 1) - return (outCoord + 0.5f) / scale - 0.5f; - else - return 0; - break; - } - case InterpolateCoordTransMode::asymmetric: { - return static_cast(outCoord) / scale; - break; - } - case InterpolateCoordTransMode::tf_half_pixel_for_nn: { - return (outCoord + 0.5f) / scale; - break; - } - case InterpolateCoordTransMode::align_corners: { - if (outShape > 1) - return outCoord * (static_cast(inShape - 1) / static_cast(outShape - 1)); - else - return 0; - break; - } - default: { - OPENVINO_THROW("errorPrefix", " does not support specified coordinate transformation mode"); - break; - } + else + return 0; + break; + } + case InterpolateCoordTransMode::asymmetric: { + return static_cast(outCoord) / scale; + break; + } + case InterpolateCoordTransMode::tf_half_pixel_for_nn: { + return (outCoord + 0.5f) / scale; + break; + } + case InterpolateCoordTransMode::align_corners: { + if (outShape > 1) + return outCoord * (static_cast(inShape - 1) / static_cast(outShape - 1)); + else + return 0; + break; + } + default: { + OPENVINO_THROW("errorPrefix", " does not support specified coordinate transformation mode"); + break; + } } } -int ov::intel_cpu::InterpolateExecutor::nearestRound(float originCoord, bool isDownsample, InterpolateNearestMode nearestMode) const { +int ov::intel_cpu::InterpolateExecutor::nearestRound(float originCoord, + bool isDownsample, + InterpolateNearestMode nearestMode) const { switch (nearestMode) { - case InterpolateNearestMode::round_prefer_floor: { - if (originCoord == (static_cast(originCoord) + 0.5f)) - return static_cast(std::floor(originCoord)); - else - return static_cast(std::round(originCoord)); - break; - } - case InterpolateNearestMode::round_prefer_ceil: { - return static_cast(std::round(originCoord)); - break; - } - case InterpolateNearestMode::floor: { + case InterpolateNearestMode::round_prefer_floor: { + if (originCoord == (static_cast(originCoord) + 0.5f)) return static_cast(std::floor(originCoord)); - break; - } - case InterpolateNearestMode::ceil: { + else + return static_cast(std::round(originCoord)); + break; + } + case InterpolateNearestMode::round_prefer_ceil: { + return static_cast(std::round(originCoord)); + break; + } + case 
InterpolateNearestMode::floor: { + return static_cast(std::floor(originCoord)); + break; + } + case InterpolateNearestMode::ceil: { + return static_cast(std::ceil(originCoord)); + break; + } + case InterpolateNearestMode::simple: { + if (isDownsample) return static_cast(std::ceil(originCoord)); - break; - } - case InterpolateNearestMode::simple: { - if (isDownsample) - return static_cast(std::ceil(originCoord)); - else - return static_cast(originCoord); - } - default: { - OPENVINO_THROW("errorPrefix", " does not support specified nearest round mode"); - break; - } + else + return static_cast(originCoord); + } + default: { + OPENVINO_THROW("errorPrefix", " does not support specified nearest round mode"); + break; + } } } -void ov::intel_cpu::InterpolateExecutor::linearOnnxCF(int outCoord, float scale, int inShape, int outShape, - int& index0, int& index1, float& weight0, float& weight1) { +void ov::intel_cpu::InterpolateExecutor::linearOnnxCF(int outCoord, + float scale, + int inShape, + int outShape, + int& index0, + int& index1, + float& weight0, + float& weight1) { float inCoord = coordTransToInput(outCoord, scale, inShape, outShape); inCoord = std::max(0.0f, std::min(inCoord, static_cast(inShape - 1))); index0 = std::min(static_cast(inCoord), inShape - 1); @@ -171,8 +194,10 @@ void ov::intel_cpu::InterpolateExecutor::linearOnnxCF(int outCoord, float scale, } } -void ov::intel_cpu::InterpolateExecutor::buildTblLinearOnnx(const VectorDims& srcDimPad5d, const VectorDims& dstDim5d, - const std::vector& dataScales, InterpolateLayoutType layout) { +void ov::intel_cpu::InterpolateExecutor::buildTblLinearOnnx(const VectorDims& srcDimPad5d, + const VectorDims& dstDim5d, + const std::vector& dataScales, + InterpolateLayoutType layout) { int dimSize = dataRank; float fz = (spatialDimSize > 2) ? dataScales[dimSize - 3] : 1.f; float fy = (spatialDimSize > 1) ? dataScales[dimSize - 2] : 1.f; @@ -231,7 +256,7 @@ void ov::intel_cpu::InterpolateExecutor::buildTblLinearOnnx(const VectorDims& sr indexPtr[1][idxOzOyOx] = (izF * IH * IW + iyT * IW + ixR) * scale; weightPtr[0][idxOzOyOx] = weightL; weightPtr[1][idxOzOyOx] = weightR; - if (spatialDimSize > 1) { + if (spatialDimSize > 1) { indexPtr[2][idxOzOyOx] = (izF * IH * IW + iyB * IW + ixL) * scale; indexPtr[3][idxOzOyOx] = (izF * IH * IW + iyB * IW + ixR) * scale; weightPtr[2][idxOzOyOx] = weightT; @@ -284,8 +309,11 @@ void ov::intel_cpu::InterpolateExecutor::buildTblLinearOnnx(const VectorDims& sr // wd .........wd, wh............wh, ww.............ww, id...........id, ih............ih, iw..............iw // | | // wh0.....wh_diameter ih0.....ih_diameter -void ov::intel_cpu::InterpolateExecutor::buildTblLinear(const VectorDims& srcDimPad5d, const VectorDims& dstDim5d, - const std::vector& dataScales, int kernel_width, bool antialias) { +void ov::intel_cpu::InterpolateExecutor::buildTblLinear(const VectorDims& srcDimPad5d, + const VectorDims& dstDim5d, + const std::vector& dataScales, + int kernel_width, + bool antialias) { int dimSize = dataRank; float fz = (dimSize == 5) ? 
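For reference, a worked example of the nearest-neighbour index computation reformatted above, combining the half_pixel coordinate transform with round_prefer_floor rounding for a 2x width upscale; this only restates the formulas from the diff in stand-alone form (scale = float(outShape) / float(inShape)):

#include <algorithm>
#include <cmath>
#include <iostream>

float coordTransToInputHalfPixel(int outCoord, float scale) {
    return (outCoord + 0.5f) / scale - 0.5f;
}

int nearestRoundPreferFloor(float originCoord) {
    // Exact .5 fractions round down; everything else rounds to nearest.
    if (originCoord == (static_cast<int>(originCoord) + 0.5f))
        return static_cast<int>(std::floor(originCoord));
    return static_cast<int>(std::round(originCoord));
}

int main() {
    const int IW = 4, OW = 8;  // 2x upscale along width
    const float fx = static_cast<float>(OW) / IW;
    for (int ox = 0; ox < OW; ++ox) {
        const float ix = coordTransToInputHalfPixel(ox, fx);
        const int idx = std::min(std::max(nearestRoundPreferFloor(ix), 0), IW - 1);
        std::cout << "ox=" << ox << " -> ix=" << ix << " -> src index " << idx << '\n';
    }
    // Produces the source index sequence 0 0 1 1 2 2 3 3 for the 8 outputs.
}
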
dataScales[dimSize - 3] : 1.f; float fy = dataScales[dimSize - 2]; @@ -309,15 +337,15 @@ void ov::intel_cpu::InterpolateExecutor::buildTblLinear(const VectorDims& srcDim int sizeOH = OH * diaOH; int sizeOW = OW * diaOW; indexTable.resize((sizeOD + sizeOH + sizeOW) * 2); - float *weightTable = reinterpret_cast(&indexTable[0]); - float *weightOD = static_cast(&weightTable[0]); - float *weightOH = static_cast(&weightTable[sizeOD]); - float *weightOW = static_cast(&weightTable[sizeOD + sizeOH]); + float* weightTable = reinterpret_cast(&indexTable[0]); + float* weightOD = static_cast(&weightTable[0]); + float* weightOH = static_cast(&weightTable[sizeOD]); + float* weightOW = static_cast(&weightTable[sizeOD + sizeOH]); - int *idxTable = static_cast(&indexTable[sizeOD + sizeOH + sizeOW]); - int *idxOD = static_cast(&idxTable[0]); - int *idxOH = static_cast(&idxTable[sizeOD]); - int *idxOW = static_cast(&idxTable[sizeOD + sizeOH]); + int* idxTable = static_cast(&indexTable[sizeOD + sizeOH + sizeOW]); + int* idxOD = static_cast(&idxTable[0]); + int* idxOH = static_cast(&idxTable[sizeOD]); + int* idxOW = static_cast(&idxTable[sizeOD + sizeOH]); for (int oz = 0; oz < static_cast(OD); oz++) { float iz = coordTransToInput(oz, fz, ID, OD); @@ -375,8 +403,11 @@ std::vector ov::intel_cpu::InterpolateExecutor::getCubicCoeffs(float mant // table layout: // OW OW OW OW OW OH OH OH OH OH // x_idx x_weight0 x_weight1 x_weight2 x_weight3 y_idx y_weight0 y_weight1 y_weight2 y_weight3 -void ov::intel_cpu::InterpolateExecutor::buildTblCubic(const VectorDims& srcDimPad5d, const VectorDims& dstDim5d, const std::vector& dataScales, - float cubicCoeff, InterpolateLayoutType layout) { +void ov::intel_cpu::InterpolateExecutor::buildTblCubic(const VectorDims& srcDimPad5d, + const VectorDims& dstDim5d, + const std::vector& dataScales, + float cubicCoeff, + InterpolateLayoutType layout) { int dimSize = dataRank; float fy = dataScales[dimSize - 2]; float fx = dataScales[dimSize - 1]; @@ -394,9 +425,9 @@ void ov::intel_cpu::InterpolateExecutor::buildTblCubic(const VectorDims& srcDimP } int tblAdvance = 0; - int *xOrigin = static_cast(&indexTable[tblAdvance]); + int* xOrigin = static_cast(&indexTable[tblAdvance]); tblAdvance += OW; - float *xFactor = reinterpret_cast(&indexTable[tblAdvance]); + float* xFactor = reinterpret_cast(&indexTable[tblAdvance]); for (int ox = 0; ox < OW; ox++) { float ix = coordTransToInput(ox, fx, IW, OW); int ix_r = static_cast(std::floor(ix)); @@ -410,9 +441,9 @@ void ov::intel_cpu::InterpolateExecutor::buildTblCubic(const VectorDims& srcDimP } tblAdvance += CUBIC_GRID_LEN * OW; - int *yOrigin = static_cast(&indexTable[tblAdvance]); + int* yOrigin = static_cast(&indexTable[tblAdvance]); tblAdvance += OH; - float *yFactor = reinterpret_cast(&indexTable[tblAdvance]); + float* yFactor = reinterpret_cast(&indexTable[tblAdvance]); for (int oy = 0; oy < OH; oy++) { float iy = coordTransToInput(oy, fy, IH, OH); int iy_r = static_cast(std::floor(iy)); @@ -427,9 +458,9 @@ void ov::intel_cpu::InterpolateExecutor::buildTblCubic(const VectorDims& srcDimP if (layout == InterpolateLayoutType::planar) { tblAdvance += CUBIC_GRID_LEN * OH; - int *sequenceOH = static_cast(&indexTable[tblAdvance]); + int* sequenceOH = static_cast(&indexTable[tblAdvance]); tblAdvance += OH * OW; - int *sequenceOW = static_cast(&indexTable[tblAdvance]); + int* sequenceOW = static_cast(&indexTable[tblAdvance]); for (int h = 0; h < OH; ++h) { int offset = h * OW; for (int w = 0; w < OW; ++w) { @@ -447,16 +478,17 @@ inline VectorDims 
getBlockND(const VectorDims& shape) { int shapeRank = shape.size(); VectorDims blockND(shapeRank + 1, 1); for (int i = shapeRank - 1; i >= 0; i--) { - blockND[i] = shape[i] * blockND[i+1]; + blockND[i] = shape[i] * blockND[i + 1]; } return blockND; } -const uint8_t* ov::intel_cpu::InterpolateExecutor::padPreprocess(const std::vector& src, const std::vector& dst) { - const uint8_t *src_data_origin = src[0]->getDataAs(); +const uint8_t* ov::intel_cpu::InterpolateExecutor::padPreprocess(const std::vector& src, + const std::vector& dst) { + const uint8_t* src_data_origin = src[0]->getDataAs(); - const auto &srcDim = src[0]->getStaticDims(); - const auto &dstDim = dst[0]->getStaticDims(); + const auto& srcDim = src[0]->getStaticDims(); + const auto& dstDim = dst[0]->getStaticDims(); size_t dimSize = srcDim.size(); auto srcDimPad = getSrcDimPad5d(); @@ -465,7 +497,7 @@ const uint8_t* ov::intel_cpu::InterpolateExecutor::padPreprocess(const std::vect const auto dstDim5d = to5Dim(dstDim); const auto srcDataSize = src[0]->getDesc().getPrecision().size(); - const uint8_t *src_data = nullptr; + const uint8_t* src_data = nullptr; std::vector srcPadded; if (interpAttrs.hasPad) { int padB0 = (dimSize > 2) ? interpAttrs.padBegin[0] : 0; @@ -479,23 +511,32 @@ const uint8_t* ov::intel_cpu::InterpolateExecutor::padPreprocess(const std::vect if (interpAttrs.layout == InterpolateLayoutType::planar) { srcPadded.resize(inShapePadBlock[0] * srcDataSize, 0); - uint8_t *src_data_pad = static_cast(&srcPadded[0]); + uint8_t* src_data_pad = static_cast(&srcPadded[0]); parallel_for4d(srcDim5d[0], srcDim5d[1], srcDim5d[2], srcDim5d[3], [&](int n, int c, int d, int h) { - const uint8_t *src = src_data_origin + (inShapeBlock[1] * n + inShapeBlock[2] * c + inShapeBlock[3] * d + inShapeBlock[4] * h) * srcDataSize; - uint8_t *srcPad = src_data_pad + (inShapePadBlock[1] * (n + padB0) + inShapePadBlock[2] * (c + padB1) + - inShapePadBlock[3] * (d + padB2) + inShapePadBlock[4] * (h + padB3) + padB4) * srcDataSize; + const uint8_t* src = src_data_origin + (inShapeBlock[1] * n + inShapeBlock[2] * c + + inShapeBlock[3] * d + inShapeBlock[4] * h) * + srcDataSize; + uint8_t* srcPad = + src_data_pad + (inShapePadBlock[1] * (n + padB0) + inShapePadBlock[2] * (c + padB1) + + inShapePadBlock[3] * (d + padB2) + inShapePadBlock[4] * (h + padB3) + padB4) * + srcDataSize; cpu_memcpy(srcPad, src, srcDim5d[4] * srcDataSize); }); src_data = src_data_pad; } else if (interpAttrs.layout == InterpolateLayoutType::by_channel) { srcPadded.resize(inShapePadBlock[0] * srcDataSize, 0); - uint8_t *src_data_pad = static_cast(&srcPadded[0]); + uint8_t* src_data_pad = static_cast(&srcPadded[0]); parallel_for4d(srcDim5d[0], srcDim5d[2], srcDim5d[3], srcDim5d[4], [&](int n, int d, int h, int w) { - const uint8_t *src = src_data_origin + (inShapeBlock[1] * n + - (inShapeBlock[3] * d + inShapeBlock[4] * h + inShapeBlock[5] * w) * srcDim5d[1]) * srcDataSize; - uint8_t *srcPad = src_data_pad + (inShapePadBlock[1] * (n + padB0) + (inShapePadBlock[3] * (d + padB2) + - inShapePadBlock[4] * (h + padB3) + - inShapePadBlock[5] * (w + padB4)) * srcDimPad5d[1] + padB1) * srcDataSize; + const uint8_t* src = src_data_origin + + (inShapeBlock[1] * n + + (inShapeBlock[3] * d + inShapeBlock[4] * h + inShapeBlock[5] * w) * srcDim5d[1]) * + srcDataSize; + uint8_t* srcPad = src_data_pad + (inShapePadBlock[1] * (n + padB0) + + (inShapePadBlock[3] * (d + padB2) + inShapePadBlock[4] * (h + padB3) + + inShapePadBlock[5] * (w + padB4)) * + srcDimPad5d[1] + + padB1) * + srcDataSize; 
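getBlockND above computes suffix products of a shape: blockND[i] is the number of elements spanned by dimensions i..rank-1, with a trailing 1, and the padding code multiplies these products by the element size to obtain byte offsets. A tiny stand-alone restatement of the same helper with an example:

#include <iostream>
#include <vector>

// Suffix products of the shape, e.g. {2, 3, 4} -> {24, 12, 4, 1}.
std::vector<size_t> getBlockNDSketch(const std::vector<size_t>& shape) {
    std::vector<size_t> blockND(shape.size() + 1, 1);
    for (int i = static_cast<int>(shape.size()) - 1; i >= 0; i--)
        blockND[i] = shape[i] * blockND[i + 1];
    return blockND;
}

int main() {
    for (size_t v : getBlockNDSketch({2, 3, 4}))
        std::cout << v << ' ';
    std::cout << '\n';  // prints: 24 12 4 1
}
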
cpu_memcpy(srcPad, src, srcDim5d[1] * srcDataSize); }); src_data = src_data_pad; @@ -504,23 +545,30 @@ const uint8_t* ov::intel_cpu::InterpolateExecutor::padPreprocess(const std::vect size_t CB = div_up(srcDimPad5d[1], blkSize); size_t eltsTotal = srcDimPad5d[0] * CB * srcDimPad5d[2] * srcDimPad5d[3] * srcDimPad5d[4] * blkSize; srcPadded.resize(eltsTotal * srcDataSize, 0x0); - uint8_t *src_data_pad = static_cast(&srcPadded[0]); + uint8_t* src_data_pad = static_cast(&srcPadded[0]); if ((srcDim5d[0] != srcDimPad5d[0]) || (srcDim5d[1] != srcDimPad5d[1])) { OPENVINO_THROW("Interpolate layer with name does not support padding on batch and channel dimensions"); } - parallel_for5d(srcDim5d[0], CB, srcDim5d[2], srcDim5d[3], srcDim5d[4], [&](int n, int cb, int d, int h, int w) { - const uint8_t *src = src_data_origin + (n * CB * srcDim5d[2] * srcDim5d[3] * srcDim5d[4] * blkSize) * srcDataSize - + (cb * srcDim5d[2] * srcDim5d[3] * srcDim5d[4] * blkSize) * srcDataSize - + (d * srcDim5d[3] * srcDim5d[4] * blkSize) * srcDataSize - + (h * srcDim5d[4] * blkSize) * srcDataSize - + (w * blkSize) * srcDataSize; - uint8_t *srcPad = src_data_pad + (n * CB * srcDimPad5d[2] * srcDimPad5d[3] * srcDimPad5d[4] * blkSize) * srcDataSize - + (cb * srcDimPad5d[2] * srcDimPad5d[3] * srcDimPad5d[4] * blkSize) * srcDataSize - + ((d + padB2) * srcDimPad5d[3] * srcDimPad5d[4] * blkSize) * srcDataSize - + ((h + padB3) * srcDimPad5d[4] * blkSize) * srcDataSize - + ((w + padB4) * blkSize) * srcDataSize; - cpu_memcpy(srcPad, src, blkSize * srcDataSize); - }); + parallel_for5d( + srcDim5d[0], + CB, + srcDim5d[2], + srcDim5d[3], + srcDim5d[4], + [&](int n, int cb, int d, int h, int w) { + const uint8_t* src = src_data_origin + + (n * CB * srcDim5d[2] * srcDim5d[3] * srcDim5d[4] * blkSize) * srcDataSize + + (cb * srcDim5d[2] * srcDim5d[3] * srcDim5d[4] * blkSize) * srcDataSize + + (d * srcDim5d[3] * srcDim5d[4] * blkSize) * srcDataSize + + (h * srcDim5d[4] * blkSize) * srcDataSize + (w * blkSize) * srcDataSize; + uint8_t* srcPad = + src_data_pad + + (n * CB * srcDimPad5d[2] * srcDimPad5d[3] * srcDimPad5d[4] * blkSize) * srcDataSize + + (cb * srcDimPad5d[2] * srcDimPad5d[3] * srcDimPad5d[4] * blkSize) * srcDataSize + + ((d + padB2) * srcDimPad5d[3] * srcDimPad5d[4] * blkSize) * srcDataSize + + ((h + padB3) * srcDimPad5d[4] * blkSize) * srcDataSize + ((w + padB4) * blkSize) * srcDataSize; + cpu_memcpy(srcPad, src, blkSize * srcDataSize); + }); src_data = src_data_pad; } } else { diff --git a/src/plugins/intel_cpu/src/nodes/executors/interpolate.hpp b/src/plugins/intel_cpu/src/nodes/executors/interpolate.hpp index 15df4eed5f0471..041589c0ab9f6a 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/interpolate.hpp +++ b/src/plugins/intel_cpu/src/nodes/executors/interpolate.hpp @@ -11,41 +11,15 @@ namespace ov { namespace intel_cpu { -enum InterpolateLayoutType { - planar, - block, - by_channel -}; +enum InterpolateLayoutType { planar, block, by_channel }; -enum InterpolateMode { - nearest, - linear, - linear_onnx, - cubic, - bilinear_pillow, - bicubic_pillow -}; +enum InterpolateMode { nearest, linear, linear_onnx, cubic, bilinear_pillow, bicubic_pillow }; -enum InterpolateCoordTransMode { - half_pixel, - pytorch_half_pixel, - asymmetric, - tf_half_pixel_for_nn, - align_corners -}; +enum InterpolateCoordTransMode { half_pixel, pytorch_half_pixel, asymmetric, tf_half_pixel_for_nn, align_corners }; -enum class InterpolateNearestMode { - round_prefer_floor, - round_prefer_ceil, - floor, - ceil, - simple -}; +enum class 
InterpolateNearestMode { round_prefer_floor, round_prefer_ceil, floor, ceil, simple }; -enum class InterpolateShapeCalcMode { - sizes, - scales -}; +enum class InterpolateShapeCalcMode { sizes, scales }; struct InterpolateAttrs { InterpolateShapeCalcMode shapeCalcMode = InterpolateShapeCalcMode::sizes; @@ -63,9 +37,9 @@ struct InterpolateAttrs { bool hasPad = false; }; -inline VectorDims getPaddedInputShape(const VectorDims &srcDims, - const std::vector &padBegin, - const std::vector &padEnd) { +inline VectorDims getPaddedInputShape(const VectorDims& srcDims, + const std::vector& padBegin, + const std::vector& padEnd) { VectorDims paddedShape; int dataRank = srcDims.size(); for (int i = 0; i < dataRank; i++) { @@ -80,16 +54,16 @@ inline int clipCoord(int pos, int length) { inline size_t getSpatialDimsNum(const Dim rank) { switch (rank) { - case 1: - case 3: - return 1; - case 2: - case 4: - return 2; - case 5: - return 3; - default: - OPENVINO_THROW("Can't define number spatial"); + case 1: + case 3: + return 1; + case 2: + case 4: + return 2; + case 5: + return 3; + default: + OPENVINO_THROW("Can't define number spatial"); } } @@ -133,27 +107,49 @@ class InterpolateExecutor { virtual bool init(const InterpolateAttrs& interpolateAttrs, const std::vector& srcDescs, const std::vector& dstDescs, - const dnnl::primitive_attr &attr); - virtual void exec(const std::vector& src, const std::vector& dst, const void *post_ops_data_) = 0; + const dnnl::primitive_attr& attr); + virtual void exec(const std::vector& src, + const std::vector& dst, + const void* post_ops_data_) = 0; virtual impl_desc_type getImplType() const = 0; virtual ~InterpolateExecutor() = default; - VectorDims getSrcDimPad5d() const { return srcDimPad5d; } + VectorDims getSrcDimPad5d() const { + return srcDimPad5d; + } const uint8_t* padPreprocess(const std::vector& src, const std::vector& dst); private: - void buildTblNN(const VectorDims& srcDimPad5d, const VectorDims& dstDim5d, const std::vector& dataScales, - InterpolateLayoutType layout, InterpolateNearestMode nearestMode); - void buildTblLinearOnnx(const VectorDims& srcDimPad5d, const VectorDims& dstDim5d, const std::vector& dataScales, + void buildTblNN(const VectorDims& srcDimPad5d, + const VectorDims& dstDim5d, + const std::vector& dataScales, + InterpolateLayoutType layout, + InterpolateNearestMode nearestMode); + void buildTblLinearOnnx(const VectorDims& srcDimPad5d, + const VectorDims& dstDim5d, + const std::vector& dataScales, InterpolateLayoutType layout); - void buildTblLinear(const VectorDims& srcDimPad5d, const VectorDims& dstDim5d, const std::vector& dataScales, int kernel_width, + void buildTblLinear(const VectorDims& srcDimPad5d, + const VectorDims& dstDim5d, + const std::vector& dataScales, + int kernel_width, bool antialias); - void buildTblCubic(const VectorDims& srcDimPad5d, const VectorDims& dstDim5d, const std::vector& dataScales, float cubicCoeff, + void buildTblCubic(const VectorDims& srcDimPad5d, + const VectorDims& dstDim5d, + const std::vector& dataScales, + float cubicCoeff, InterpolateLayoutType layout); float coordTransToInput(int outCoord, float scale, int inShape, int outShape) const; int nearestRound(float origin, bool isDownsample, InterpolateNearestMode nearestMode) const; - void linearOnnxCF(int outCoord, float scale, int inShape, int outShape, int& index0, int& index1, float& weight0, float& weight1); + void linearOnnxCF(int outCoord, + float scale, + int inShape, + int outShape, + int& index0, + int& index1, + float& weight0, + float& 
weight1); std::vector getCubicCoeffs(float mantissa, float a); protected: @@ -180,5 +176,5 @@ class InterpolateExecutorBuilder { using InterpolateExecutorBuilderPtr = std::shared_ptr; using InterpolateExecutorBuilderCPtr = std::shared_ptr; -} // namespace intel_cpu -} // namespace ov \ No newline at end of file +} // namespace intel_cpu +} // namespace ov \ No newline at end of file diff --git a/src/plugins/intel_cpu/src/nodes/executors/interpolate_list.cpp b/src/plugins/intel_cpu/src/nodes/executors/interpolate_list.cpp index 2362b644583763..21ae249757bf9c 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/interpolate_list.cpp +++ b/src/plugins/intel_cpu/src/nodes/executors/interpolate_list.cpp @@ -9,11 +9,10 @@ namespace intel_cpu { const std::vector& getInterpolateExecutorsList() { static std::vector descs = { - OV_CPU_INSTANCE_ACL(ExecutorType::Acl, std::make_shared()) - }; + OV_CPU_INSTANCE_ACL(ExecutorType::Acl, std::make_shared())}; return descs; } -} // namespace intel_cpu -} // namespace ov \ No newline at end of file +} // namespace intel_cpu +} // namespace ov \ No newline at end of file diff --git a/src/plugins/intel_cpu/src/nodes/executors/interpolate_list.hpp b/src/plugins/intel_cpu/src/nodes/executors/interpolate_list.hpp index 2ed16ea04b1852..a0c1fc240731fb 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/interpolate_list.hpp +++ b/src/plugins/intel_cpu/src/nodes/executors/interpolate_list.hpp @@ -5,14 +5,13 @@ #pragma once #include "executor.hpp" - #include "interpolate.hpp" #if defined(OV_CPU_WITH_ACL) -#include "acl/acl_interpolate.hpp" +# include "acl/acl_interpolate.hpp" #endif -#include "onednn/iml_type_mapper.h" #include "common/primitive_cache.hpp" +#include "onednn/iml_type_mapper.h" namespace ov { namespace intel_cpu { @@ -27,9 +26,10 @@ const std::vector& getInterpolateExecutorsList(); class InterpolateExecutorFactory : public ExecutorFactoryLegacy { public: InterpolateExecutorFactory(const InterpolateAttrs& InterpolateAttrs, - const std::vector& srcDescs, - const std::vector& dstDescs, - const ExecutorContext::CPtr context) : ExecutorFactoryLegacy(context) { + const std::vector& srcDescs, + const std::vector& dstDescs, + const ExecutorContext::CPtr context) + : ExecutorFactoryLegacy(context) { for (auto& desc : getInterpolateExecutorsList()) { if (desc.builder->isSupported(InterpolateAttrs, srcDescs, dstDescs)) { supportedDescs.push_back(desc); @@ -39,9 +39,9 @@ class InterpolateExecutorFactory : public ExecutorFactoryLegacy { ~InterpolateExecutorFactory() = default; virtual InterpolateExecutorPtr makeExecutor(const InterpolateAttrs& interpolateAttrs, - const std::vector& srcDescs, - const std::vector& dstDescs, - const dnnl::primitive_attr &attr) { + const std::vector& srcDescs, + const std::vector& dstDescs, + const dnnl::primitive_attr& attr) { auto build = [&](const InterpolateExecutorDesc* desc) { auto executor = desc->builder->makeExecutor(context); if (executor->init(interpolateAttrs, srcDescs, dstDescs, attr)) { @@ -52,7 +52,6 @@ class InterpolateExecutorFactory : public ExecutorFactoryLegacy { return ptr; }; - if (chosenDesc) { if (auto executor = build(chosenDesc)) { return executor; @@ -81,5 +80,5 @@ class InterpolateExecutorFactory : public ExecutorFactoryLegacy { using InterpolateExecutorFactoryPtr = std::shared_ptr; using InterpolateExecutorFactoryCPtr = std::shared_ptr; -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/executors/matmul_config.hpp 
b/src/plugins/intel_cpu/src/nodes/executors/matmul_config.hpp index 9e484b24a2940e..e42bf3138bce91 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/matmul_config.hpp +++ b/src/plugins/intel_cpu/src/nodes/executors/matmul_config.hpp @@ -12,7 +12,6 @@ namespace intel_cpu { struct MatMulAttrs { bool transposeA; bool transposeB; - std::vector dequantizationScales; }; using MatMulConfig = executor::Config; diff --git a/src/plugins/intel_cpu/src/nodes/executors/memory_arguments.hpp b/src/plugins/intel_cpu/src/nodes/executors/memory_arguments.hpp index c04ca39e845ee1..05c3cf0d5df259 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/memory_arguments.hpp +++ b/src/plugins/intel_cpu/src/nodes/executors/memory_arguments.hpp @@ -12,9 +12,9 @@ namespace ov { namespace intel_cpu { using MemoryDescArgs = std::unordered_map; -using MemoryArgs = std::unordered_map; +using MemoryArgs = std::unordered_map; -// @todo add more options +// basic inputs #define ARG_SRC_0 1 #define ARG_SRC ARG_SRC_0 #define ARG_SRC_1 2 @@ -24,6 +24,12 @@ using MemoryArgs = std::unordered_map; #define ARG_WEI_0 33 #define ARG_WEI ARG_WEI_0 #define ARG_BIAS 41 +// legacy dequantization scale +#define ARG_DST_DEQ_SCALE 53 +// scaling factors provided at execution time +#define ARG_ATTR_SCALES 4096 +// zero points provided at execution time +#define ARG_ATTR_ZERO_POINTS 8192 } // namespace intel_cpu } // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/executors/mlas/mlas_gemm.cpp b/src/plugins/intel_cpu/src/nodes/executors/mlas/mlas_gemm.cpp index a03bfe2649413a..7e50c8086789a0 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/mlas/mlas_gemm.cpp +++ b/src/plugins/intel_cpu/src/nodes/executors/mlas/mlas_gemm.cpp @@ -23,6 +23,10 @@ using namespace executor; using namespace dnnl; using namespace ov::element; +static Dim batchDim(const VectorDims& dims) { + return std::accumulate(dims.begin(), dims.end() - 1, 1, std::multiplies()); +} + static MemoryPtr prepareWeightMemory(const MemoryPtr weightsMemory, const ExecutorContext::CPtr context, const bool weightsTransposed) { @@ -31,14 +35,15 @@ static MemoryPtr prepareWeightMemory(const MemoryPtr weightsMemory, // Weights are transposed by MatMulConstTransposesExtraction // K is the IC of weight // the weight is reshaped to [-1, K] in ConvertMatMulToFC - const auto K = wgtDims[1]; - const auto N = wgtDims[0]; + Dim K = wgtDims.back(); + Dim N = batchDim(wgtDims); auto packedBsize = mlas_sgemm_pack_get_size(N, K); auto create = [&]() { float* weightPtr = weightsMemory->getDataAs(); size_t ldb = weightsTransposed ? K : N; + MemoryPtr _ptr = std::make_shared(context->getEngine(), intel_cpu::CpuBlockedMemoryDesc(i8, intel_cpu::Shape{packedBsize})); float* prepackedDst = _ptr->getDataAs(); @@ -66,21 +71,10 @@ bool MlasGemmExecutor::supports(const FCConfig& config) { DEBUG_LOG("MlasGemmExecutor: PostOps are not supported"); return false; } - const auto& weiDesc = config.descs.at(ARG_WEI); - const auto& dstDesc = config.descs.at(ARG_DST); - // MLAS cannot support weight dims > 2, e.g. 
[1,64,9,9] * [10,64,9,9] - const auto& weightsDims = weiDesc->getShape().getStaticDims(); - if (weightsDims.size() > 2) { - if (!std::all_of(weightsDims.begin() + 2, weightsDims.end(), [](const Dim dim) { - return dim == 1; - })) { - DEBUG_LOG("MlasGemmExecutor: weights dims > 2 are not supported"); - return false; - } - } + const auto& dstDesc = config.descs.at(ARG_DST); - if (config.attrs.withBias) { + if (!config.descs.at(ARG_BIAS)->empty()) { const auto& biaDesc = config.descs.at(ARG_BIAS); const auto& biasDims = biaDesc->getShape().getStaticDims(); const auto& outDims = dstDesc->getShape().getDims(); @@ -108,24 +102,16 @@ MlasGemmExecutor::MlasGemmExecutor(const FCAttrs& attrs, const ExecutorContext::CPtr context) : m_attrs(attrs), m_memoryArgs(memory), - packedWeights(prepareWeightMemory(memory.at(ARG_WEI), context, !attrs.weightsNonTransposed)) {} + packedWeights(prepareWeightMemory(memory.at(ARG_WEI), context, !attrs.weightsNonTransposed)), + N(batchDim(memory.at(ARG_WEI)->getStaticDims())), + K(memory.at(ARG_WEI)->getStaticDims().back()) {} bool MlasGemmExecutor::update(const MemoryArgs& memory) { - const auto& weiDesc = memory.at(ARG_WEI)->getDescPtr(); const auto& dstDesc = memory.at(ARG_DST)->getDescPtr(); - const auto& wgtDims = weiDesc->getShape().getStaticDims(); - // Weights are transposed by MatMulConstTransposesExtraction - // K is the IC of weight - // the weight is reshaped to [-1, K] in ConvertMatMulToFC - K = wgtDims[1]; - N = wgtDims[0]; const auto& outDims = dstDesc->getShape().getStaticDims(); - if (outDims.size() > 2) { - M = std::accumulate(outDims.begin(), outDims.end() - 1, 1, std::multiplies()); - } else { - M = outDims[0]; - } + M = outDims.size() > 2 ? batchDim(outDims) : outDims[0]; + return true; } diff --git a/src/plugins/intel_cpu/src/nodes/executors/mlas/mlas_transpose.cpp b/src/plugins/intel_cpu/src/nodes/executors/mlas/mlas_transpose.cpp index 678fe5a5c22176..2b8b71bfbced0b 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/mlas/mlas_transpose.cpp +++ b/src/plugins/intel_cpu/src/nodes/executors/mlas/mlas_transpose.cpp @@ -3,9 +3,10 @@ // #include "mlas_transpose.hpp" -#include "openvino/core/parallel.hpp" -#include "nodes/common/cpu_memcpy.h" + #include "mlas.h" +#include "nodes/common/cpu_memcpy.h" +#include "openvino/core/parallel.hpp" namespace ov { namespace intel_cpu { @@ -24,7 +25,12 @@ struct has_mlas_transpose : std::true_type {}; template typename std::enable_if::value, void>::type SimpleTransposeSingleAxisOutwards( - const T* input_data, T* output_data, int64_t num_loops, int64_t num_writers, int64_t writes_per_loop, int64_t writes_per_writer_per_loop) { + const T* input_data, + T* output_data, + int64_t num_loops, + int64_t num_writers, + int64_t writes_per_loop, + int64_t writes_per_writer_per_loop) { const T* end; for (int64_t l = 0; l < num_loops; ++l) { T* output_for_first_writer = output_data; @@ -44,9 +50,17 @@ typename std::enable_if::value, void>::type SimpleTranspo template typename std::enable_if::value, void>::type SimpleTransposeSingleAxisOutwards( - const T* input_data, T* output_data, int64_t num_loops, int64_t num_writers, int64_t writes_per_loop, int64_t writes_per_writer_per_loop) { + const T* input_data, + T* output_data, + int64_t num_loops, + int64_t num_writers, + int64_t writes_per_loop, + int64_t writes_per_writer_per_loop) { for (int64_t l = 0; l < num_loops; ++l) { - MlasTranspose(input_data, output_data, static_cast(writes_per_writer_per_loop), static_cast(num_writers)); + MlasTranspose(input_data, + 
output_data, + static_cast(writes_per_writer_per_loop), + static_cast(num_writers)); input_data += writes_per_loop; output_data += writes_per_loop; } @@ -54,7 +68,12 @@ typename std::enable_if::value, void>::type SimpleTranspos template typename std::enable_if::value, void>::type SimpleTransposeSingleAxisInwards( - const T* input_data, T* output_data, int64_t num_loops, int64_t num_readers, int64_t reads_per_loop, int64_t reads_per_reader_per_loop) { + const T* input_data, + T* output_data, + int64_t num_loops, + int64_t num_readers, + int64_t reads_per_loop, + int64_t reads_per_reader_per_loop) { T* end; for (int64_t l = 0; l < num_loops; ++l) { const T* input_for_first_reader = input_data; @@ -74,9 +93,17 @@ typename std::enable_if::value, void>::type SimpleTranspo template typename std::enable_if::value, void>::type SimpleTransposeSingleAxisInwards( - const T* input_data, T* output_data, int64_t num_loops, int64_t num_readers, int64_t reads_per_loop, int64_t reads_per_reader_per_loop) { + const T* input_data, + T* output_data, + int64_t num_loops, + int64_t num_readers, + int64_t reads_per_loop, + int64_t reads_per_reader_per_loop) { for (int64_t l = 0; l < num_loops; ++l) { - MlasTranspose(input_data, output_data, static_cast(num_readers), static_cast(reads_per_reader_per_loop)); + MlasTranspose(input_data, + output_data, + static_cast(num_readers), + static_cast(reads_per_reader_per_loop)); input_data += reads_per_loop; output_data += reads_per_loop; } @@ -148,7 +175,10 @@ bool MlasTransposeExecutor::IsTransposeMovingSingleAxis(VectorDims permutations, return single_axis_moved; } -void MlasTransposeExecutor::TransposeSingleAxisOutwards(const MemoryCPtr& input, const MemoryPtr& output, size_t from, size_t to) { +void MlasTransposeExecutor::TransposeSingleAxisOutwards(const MemoryCPtr& input, + const MemoryPtr& output, + size_t from, + size_t to) { const auto& input_shape = input->getShape(); const auto& input_dims = input_shape.getDims(); const auto element_size = input->getDesc().getPrecision().size(); @@ -165,52 +195,68 @@ void MlasTransposeExecutor::TransposeSingleAxisOutwards(const MemoryCPtr& input, const size_t bytes_per_write = static_cast(block_size) * element_size; switch (bytes_per_write) { - case (sizeof(uint8_t)): { - SimpleTransposeSingleAxisOutwards(input_data, output_data, num_loops, num_writers, writes_per_loop, - writes_per_writer_per_loop); - break; - } - case (sizeof(uint16_t)): { - SimpleTransposeSingleAxisOutwards(reinterpret_cast(input_data), - reinterpret_cast(output_data), num_loops, num_writers, - writes_per_loop, writes_per_writer_per_loop); - break; - } - case (sizeof(uint32_t)): { - SimpleTransposeSingleAxisOutwards(reinterpret_cast(input_data), - reinterpret_cast(output_data), num_loops, num_writers, - writes_per_loop, writes_per_writer_per_loop); - break; - } - case (sizeof(uint64_t)): { - SimpleTransposeSingleAxisOutwards(reinterpret_cast(input_data), - reinterpret_cast(output_data), num_loops, num_writers, - writes_per_loop, writes_per_writer_per_loop); - break; - } - default: { - // we need to use memcpy for each block - for (int64_t l = 0; l < num_loops; ++l) { - uint8_t* output_for_first_writer = output_data; + case (sizeof(uint8_t)): { + SimpleTransposeSingleAxisOutwards(input_data, + output_data, + num_loops, + num_writers, + writes_per_loop, + writes_per_writer_per_loop); + break; + } + case (sizeof(uint16_t)): { + SimpleTransposeSingleAxisOutwards(reinterpret_cast(input_data), + reinterpret_cast(output_data), + num_loops, + num_writers, + 
writes_per_loop, + writes_per_writer_per_loop); + break; + } + case (sizeof(uint32_t)): { + SimpleTransposeSingleAxisOutwards(reinterpret_cast(input_data), + reinterpret_cast(output_data), + num_loops, + num_writers, + writes_per_loop, + writes_per_writer_per_loop); + break; + } + case (sizeof(uint64_t)): { + SimpleTransposeSingleAxisOutwards(reinterpret_cast(input_data), + reinterpret_cast(output_data), + num_loops, + num_writers, + writes_per_loop, + writes_per_writer_per_loop); + break; + } + default: { + // we need to use memcpy for each block + for (int64_t l = 0; l < num_loops; ++l) { + uint8_t* output_for_first_writer = output_data; - for (auto wwpl = 0; wwpl < writes_per_writer_per_loop; ++wwpl) { - uint8_t* output_for_current_writer = output_for_first_writer; + for (auto wwpl = 0; wwpl < writes_per_writer_per_loop; ++wwpl) { + uint8_t* output_for_current_writer = output_for_first_writer; - for (uint64_t w = 0; w < num_writers; ++w) { - memcpy(output_for_current_writer, input_data, bytes_per_write); - // skip to output position for next writer - output_for_current_writer += (writes_per_writer_per_loop * bytes_per_write); - input_data += bytes_per_write; - } - output_for_first_writer += bytes_per_write; + for (uint64_t w = 0; w < num_writers; ++w) { + memcpy(output_for_current_writer, input_data, bytes_per_write); + // skip to output position for next writer + output_for_current_writer += (writes_per_writer_per_loop * bytes_per_write); + input_data += bytes_per_write; } - output_data += writes_per_loop * bytes_per_write; + output_for_first_writer += bytes_per_write; } + output_data += writes_per_loop * bytes_per_write; } } + } } -void MlasTransposeExecutor::TransposeSingleAxisInwards(const MemoryCPtr& input, const MemoryPtr& output, size_t from, size_t to) { +void MlasTransposeExecutor::TransposeSingleAxisInwards(const MemoryCPtr& input, + const MemoryPtr& output, + size_t from, + size_t to) { const auto& input_shape = input->getShape(); const auto& input_dims = input_shape.getDims(); @@ -227,61 +273,74 @@ void MlasTransposeExecutor::TransposeSingleAxisInwards(const MemoryCPtr& input, const size_t bytes_per_read = static_cast(block_size) * element_size; switch (bytes_per_read) { - case (sizeof(uint8_t)): { - SimpleTransposeSingleAxisInwards(input_data, output_data, num_loops, num_readers, reads_per_loop, - reads_per_reader_per_loop); - break; - } - case (sizeof(uint16_t)): { - SimpleTransposeSingleAxisInwards(reinterpret_cast(input_data), - reinterpret_cast(output_data), num_loops, num_readers, reads_per_loop, - reads_per_reader_per_loop); - break; - } - case (sizeof(uint32_t)): { - SimpleTransposeSingleAxisInwards(reinterpret_cast(input_data), - reinterpret_cast(output_data), num_loops, num_readers, reads_per_loop, - reads_per_reader_per_loop); - break; - } - case (sizeof(uint64_t)): { - SimpleTransposeSingleAxisInwards(reinterpret_cast(input_data), - reinterpret_cast(output_data), num_loops, num_readers, reads_per_loop, - reads_per_reader_per_loop); - break; - } - default: { - // we need to use memcpy for each block - for (int64_t l = 0; l < num_loops; ++l) { - const uint8_t* input_for_first_reader = input_data; - for (auto rrpl = 0; rrpl < reads_per_reader_per_loop; ++rrpl) { - const uint8_t* input_for_current_reader = input_for_first_reader; - for (uint64_t r = 0; r < num_readers; ++r) { - memcpy(output_data, input_for_current_reader, bytes_per_read); - output_data += bytes_per_read; - // skip to input position for next reader - input_for_current_reader += 
(reads_per_reader_per_loop * bytes_per_read); - } - input_for_first_reader += bytes_per_read; + case (sizeof(uint8_t)): { + SimpleTransposeSingleAxisInwards(input_data, + output_data, + num_loops, + num_readers, + reads_per_loop, + reads_per_reader_per_loop); + break; + } + case (sizeof(uint16_t)): { + SimpleTransposeSingleAxisInwards(reinterpret_cast(input_data), + reinterpret_cast(output_data), + num_loops, + num_readers, + reads_per_loop, + reads_per_reader_per_loop); + break; + } + case (sizeof(uint32_t)): { + SimpleTransposeSingleAxisInwards(reinterpret_cast(input_data), + reinterpret_cast(output_data), + num_loops, + num_readers, + reads_per_loop, + reads_per_reader_per_loop); + break; + } + case (sizeof(uint64_t)): { + SimpleTransposeSingleAxisInwards(reinterpret_cast(input_data), + reinterpret_cast(output_data), + num_loops, + num_readers, + reads_per_loop, + reads_per_reader_per_loop); + break; + } + default: { + // we need to use memcpy for each block + for (int64_t l = 0; l < num_loops; ++l) { + const uint8_t* input_for_first_reader = input_data; + for (auto rrpl = 0; rrpl < reads_per_reader_per_loop; ++rrpl) { + const uint8_t* input_for_current_reader = input_for_first_reader; + for (uint64_t r = 0; r < num_readers; ++r) { + memcpy(output_data, input_for_current_reader, bytes_per_read); + output_data += bytes_per_read; + // skip to input position for next reader + input_for_current_reader += (reads_per_reader_per_loop * bytes_per_read); } - input_data += reads_per_loop * bytes_per_read; + input_for_first_reader += bytes_per_read; } + input_data += reads_per_loop * bytes_per_read; } } + } } void MlasTransposeExecutor::exec(const std::vector& src, const std::vector& dst) { if (from > to) { - TransposeSingleAxisOutwards(src[0], dst[0], from, to); + TransposeSingleAxisOutwards(src[0], dst[0], from, to); } else { - TransposeSingleAxisInwards(src[0], dst[0], from, to); + TransposeSingleAxisInwards(src[0], dst[0], from, to); } } -bool MlasTransposeExecutor::init(const TransposeParams &transposeParams, - const std::vector &srcDescs, - const std::vector &dstDescs, - const dnnl::primitive_attr &attr) { +bool MlasTransposeExecutor::init(const TransposeParams& transposeParams, + const std::vector& srcDescs, + const std::vector& dstDescs, + const dnnl::primitive_attr& attr) { if (!IsTransposeMovingSingleAxis(transposeParams.permuteParams.order, from, to)) { DEBUG_LOG("MLAS Transpose executor supports moving single axis only"); return false; @@ -292,8 +351,7 @@ bool MlasTransposeExecutor::init(const TransposeParams &transposeParams, bool MlasTransposeExecutorBuilder::isSupported(const TransposeParams& transposeParams, const std::vector& srcDescs, const std::vector& dstDescs) const { - if (!srcDescs[0]->hasLayoutType(LayoutType::ncsp) || - !dstDescs[0]->hasLayoutType(LayoutType::ncsp)) { + if (!srcDescs[0]->hasLayoutType(LayoutType::ncsp) || !dstDescs[0]->hasLayoutType(LayoutType::ncsp)) { DEBUG_LOG("MLAS Transpose executor supports NCHW layout only"); return false; } @@ -308,5 +366,5 @@ TransposeExecutorPtr MlasTransposeExecutorBuilder::makeExecutor(const ExecutorCo return std::make_shared(context); } -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/executors/mlas/mlas_transpose.hpp b/src/plugins/intel_cpu/src/nodes/executors/mlas/mlas_transpose.hpp index d7e0307414aac9..8f7cd1bf8c22bd 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/mlas/mlas_transpose.hpp +++ 
b/src/plugins/intel_cpu/src/nodes/executors/mlas/mlas_transpose.hpp @@ -11,13 +11,16 @@ namespace intel_cpu { class MlasTransposeExecutor : public TransposeExecutor { public: using TransposeExecutor::TransposeExecutor; - bool init(const TransposeParams &transposeParams, - const std::vector &srcDescs, - const std::vector &dstDescs, - const dnnl::primitive_attr &attr) override; - void exec(const std::vector &src, const std::vector &dst) override; + bool init(const TransposeParams& transposeParams, + const std::vector& srcDescs, + const std::vector& dstDescs, + const dnnl::primitive_attr& attr) override; + void exec(const std::vector& src, const std::vector& dst) override; + + impl_desc_type implType() const override { + return impl_desc_type::mlas; + } - impl_desc_type implType() const override { return impl_desc_type::mlas; } private: static int64_t calcShapeSize(const Shape& shape, size_t start, size_t end); static bool IsTransposeMovingSingleAxis(VectorDims permutations, size_t& from, size_t& to); @@ -37,5 +40,5 @@ class MlasTransposeExecutorBuilder : public TransposeExecutorBuilder { TransposeExecutorPtr makeExecutor(const ExecutorContext::CPtr context) const override; }; -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/executors/mvn.cpp b/src/plugins/intel_cpu/src/nodes/executors/mvn.cpp index 9b522ed9887344..eec9d2a8947975 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/mvn.cpp +++ b/src/plugins/intel_cpu/src/nodes/executors/mvn.cpp @@ -11,26 +11,34 @@ MVNExecutor::MVNExecutor(const ExecutorContext::CPtr context) : context(context) VectorDims MVNExecutor::transformTo5DCase(const VectorDims& shape, bool initAcrossChannels) { switch (shape.size()) { - // for 1 and 2 rank, if initAcrossChannels_ is true, adjust shape to fully vectorize under unified 5d procedure. - // otherwise there are not enough data in spatial dimension to process in one kernel. - case 1 : // C - if (initAcrossChannels) { - return VectorDims({1, 1, 1, 1, shape[0]}); - } else { - return VectorDims({1, shape[0], 1, 1, 1}); - } - case 2 : // NC - if (initAcrossChannels) { - return VectorDims({1, shape[0], 1, shape[1], 1}); - } else { - return VectorDims({shape[0], shape[1], 1, 1, 1}); - } - case 3 : { return VectorDims({shape[0], shape[1], 1, shape[2], 1}); } - case 4 : { return VectorDims({shape[0], shape[1], 1, shape[2], shape[3]}); } - case 5 : { return VectorDims({shape[0], shape[1], shape[2], shape[3], shape[4]}); } - default : { OPENVINO_THROW("MVN executor doesn't support planar layout with rank: ", shape.size()); } + // for 1 and 2 rank, if initAcrossChannels_ is true, adjust shape to fully vectorize under unified 5d procedure. + // otherwise there are not enough data in spatial dimension to process in one kernel. 
+ case 1: // C + if (initAcrossChannels) { + return VectorDims({1, 1, 1, 1, shape[0]}); + } else { + return VectorDims({1, shape[0], 1, 1, 1}); + } + case 2: // NC + if (initAcrossChannels) { + return VectorDims({1, shape[0], 1, shape[1], 1}); + } else { + return VectorDims({shape[0], shape[1], 1, 1, 1}); + } + case 3: { + return VectorDims({shape[0], shape[1], 1, shape[2], 1}); + } + case 4: { + return VectorDims({shape[0], shape[1], 1, shape[2], shape[3]}); + } + case 5: { + return VectorDims({shape[0], shape[1], shape[2], shape[3], shape[4]}); + } + default: { + OPENVINO_THROW("MVN executor doesn't support planar layout with rank: ", shape.size()); + } } } -} // namespace intel_cpu -} // namespace ov \ No newline at end of file +} // namespace intel_cpu +} // namespace ov \ No newline at end of file diff --git a/src/plugins/intel_cpu/src/nodes/executors/mvn.hpp b/src/plugins/intel_cpu/src/nodes/executors/mvn.hpp index 759115a4b4b794..da51b5d1ef67e9 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/mvn.hpp +++ b/src/plugins/intel_cpu/src/nodes/executors/mvn.hpp @@ -5,29 +5,22 @@ #pragma once #include "cpu_memory.h" -#include "onednn/iml_type_mapper.h" #include "executor.hpp" +#include "onednn/iml_type_mapper.h" namespace ov { namespace intel_cpu { -enum MVNLayoutType { - mvn_planar, - mvn_block, - mvn_by_channel -}; +enum MVNLayoutType { mvn_planar, mvn_block, mvn_by_channel }; // Defines way to add epsilon: inside sqrt or outside. -enum MVNEpsMode { - INSIDE_SQRT, - OUTSIDE_SQRT -}; +enum MVNEpsMode { INSIDE_SQRT, OUTSIDE_SQRT }; struct MVNAttrs { MVNLayoutType layout = mvn_planar; bool initAcrossChannels_ = false; bool execAcrossChannels_ = false; - bool normalizeVariance_ = false; + bool normalizeVariance_ = false; float epsValue_ = 0.0f; MVNEpsMode epsMode_ = INSIDE_SQRT; ov::element::Type src_prc; @@ -40,9 +33,11 @@ class MVNExecutor { virtual bool init(const MVNAttrs& mvnAttrs, const std::vector& srcDescs, const std::vector& dstDescs, - const dnnl::primitive_attr &attr) = 0; + const dnnl::primitive_attr& attr) = 0; - virtual void exec(const std::vector& src, const std::vector& dst, const void *post_ops_data_) = 0; + virtual void exec(const std::vector& src, + const std::vector& dst, + const void* post_ops_data_) = 0; virtual ~MVNExecutor() = default; virtual impl_desc_type getImplType() const = 0; @@ -60,12 +55,14 @@ using MVNExecutorCPtr = std::shared_ptr; class MVNExecutorBuilder { public: ~MVNExecutorBuilder() = default; - virtual bool isSupported(const MVNAttrs& mvnAttrs, const std::vector& srcDescs, const std::vector& dstDescs) const = 0; + virtual bool isSupported(const MVNAttrs& mvnAttrs, + const std::vector& srcDescs, + const std::vector& dstDescs) const = 0; virtual MVNExecutorPtr makeExecutor(const ExecutorContext::CPtr context) const = 0; }; using MVNExecutorBuilderPtr = std::shared_ptr; using MVNExecutorBuilderCPtr = std::shared_ptr; -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/executors/mvn_list.cpp b/src/plugins/intel_cpu/src/nodes/executors/mvn_list.cpp index c27751b7a2d2b4..99a55d79f58177 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/mvn_list.cpp +++ b/src/plugins/intel_cpu/src/nodes/executors/mvn_list.cpp @@ -9,11 +9,10 @@ namespace intel_cpu { const std::vector& getMVNExecutorsList() { static std::vector descs = { - OV_CPU_INSTANCE_ACL(ExecutorType::Acl, std::make_shared()) - }; + OV_CPU_INSTANCE_ACL(ExecutorType::Acl, std::make_shared())}; return descs; } -} // 
namespace intel_cpu -} // namespace ov \ No newline at end of file +} // namespace intel_cpu +} // namespace ov \ No newline at end of file diff --git a/src/plugins/intel_cpu/src/nodes/executors/mvn_list.hpp b/src/plugins/intel_cpu/src/nodes/executors/mvn_list.hpp index 3a8d3cc61fe585..82f8e868ac2d81 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/mvn_list.hpp +++ b/src/plugins/intel_cpu/src/nodes/executors/mvn_list.hpp @@ -5,14 +5,13 @@ #pragma once #include "executor.hpp" - #include "mvn.hpp" #if defined(OV_CPU_WITH_ACL) -#include "acl/acl_mvn.hpp" +# include "acl/acl_mvn.hpp" #endif -#include "onednn/iml_type_mapper.h" #include "common/primitive_cache.hpp" +#include "onednn/iml_type_mapper.h" namespace ov { namespace intel_cpu { @@ -29,7 +28,8 @@ class MVNExecutorFactory : public ExecutorFactoryLegacy { MVNExecutorFactory(const MVNAttrs& mvnAttrs, const std::vector& srcDescs, const std::vector& dstDescs, - const ExecutorContext::CPtr context) : ExecutorFactoryLegacy(context) { + const ExecutorContext::CPtr context) + : ExecutorFactoryLegacy(context) { for (auto& desc : getMVNExecutorsList()) { if (desc.builder->isSupported(mvnAttrs, srcDescs, dstDescs)) { supportedDescs.push_back(desc); @@ -41,7 +41,7 @@ class MVNExecutorFactory : public ExecutorFactoryLegacy { virtual MVNExecutorPtr makeExecutor(const MVNAttrs& mvnAttrs, const std::vector& srcDescs, const std::vector& dstDescs, - const dnnl::primitive_attr &attr) { + const dnnl::primitive_attr& attr) { auto build = [&](const MVNExecutorDesc* desc) { auto executor = desc->builder->makeExecutor(context); if (executor->init(mvnAttrs, srcDescs, dstDescs, attr)) { @@ -80,5 +80,5 @@ class MVNExecutorFactory : public ExecutorFactoryLegacy { using MVNExecutorFactoryPtr = std::shared_ptr; using MVNExecutorFactoryCPtr = std::shared_ptr; -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/executors/pooling.cpp b/src/plugins/intel_cpu/src/nodes/executors/pooling.cpp index e15d1a4ef15b8d..95448640e3b125 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/pooling.cpp +++ b/src/plugins/intel_cpu/src/nodes/executors/pooling.cpp @@ -9,5 +9,5 @@ namespace intel_cpu { PoolingExecutor::PoolingExecutor(const ExecutorContext::CPtr context) : context(context) {} -} // namespace intel_cpu -} // namespace ov \ No newline at end of file +} // namespace intel_cpu +} // namespace ov \ No newline at end of file diff --git a/src/plugins/intel_cpu/src/nodes/executors/pooling.hpp b/src/plugins/intel_cpu/src/nodes/executors/pooling.hpp index 5ea358c68afc8e..e826d3a37250db 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/pooling.hpp +++ b/src/plugins/intel_cpu/src/nodes/executors/pooling.hpp @@ -5,8 +5,8 @@ #pragma once #include "cpu_memory.h" -#include "onednn/iml_type_mapper.h" #include "executor.hpp" +#include "onednn/iml_type_mapper.h" namespace ov { namespace intel_cpu { @@ -44,9 +44,11 @@ class PoolingExecutor { virtual bool init(const PoolingAttrs& poolingAttrs, const std::vector& srcDescs, const std::vector& dstDescs, - const dnnl::primitive_attr &attr) = 0; + const dnnl::primitive_attr& attr) = 0; - virtual void exec(const std::vector& src, const std::vector& dst, std::unordered_map postOpsArgs) = 0; + virtual void exec(const std::vector& src, + const std::vector& dst, + std::unordered_map postOpsArgs) = 0; virtual ~PoolingExecutor() = default; virtual impl_desc_type getImplType() const = 0; @@ -71,5 +73,5 @@ class PoolingExecutorBuilder { using 
PoolingExecutorBuilderPtr = std::shared_ptr; using PoolingExecutorBuilderCPtr = std::shared_ptr; -} // namespace intel_cpu -} // namespace ov \ No newline at end of file +} // namespace intel_cpu +} // namespace ov \ No newline at end of file diff --git a/src/plugins/intel_cpu/src/nodes/executors/pooling_list.cpp b/src/plugins/intel_cpu/src/nodes/executors/pooling_list.cpp index 4b130f37bfff57..d0ee9f7da574c6 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/pooling_list.cpp +++ b/src/plugins/intel_cpu/src/nodes/executors/pooling_list.cpp @@ -9,11 +9,10 @@ namespace intel_cpu { const std::vector& getPoolingExecutorsList() { static std::vector descs = { - OV_CPU_INSTANCE_ACL(ExecutorType::Acl, std::make_shared()) - }; + OV_CPU_INSTANCE_ACL(ExecutorType::Acl, std::make_shared())}; return descs; } -} // namespace intel_cpu -} // namespace ov \ No newline at end of file +} // namespace intel_cpu +} // namespace ov \ No newline at end of file diff --git a/src/plugins/intel_cpu/src/nodes/executors/pooling_list.hpp b/src/plugins/intel_cpu/src/nodes/executors/pooling_list.hpp index d6ce5489105b19..1c051ae7d2959d 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/pooling_list.hpp +++ b/src/plugins/intel_cpu/src/nodes/executors/pooling_list.hpp @@ -5,10 +5,9 @@ #pragma once #include "executor.hpp" - #include "pooling.hpp" #if defined(OV_CPU_WITH_ACL) -#include "acl/acl_pooling.hpp" +# include "acl/acl_pooling.hpp" #endif namespace ov { @@ -24,9 +23,10 @@ const std::vector& getPoolingExecutorsList(); class PoolingExecutorFactory : public ExecutorFactoryLegacy { public: PoolingExecutorFactory(const PoolingAttrs& poolingAttrs, - const std::vector& srcDescs, - const std::vector& dstDescs, - const ExecutorContext::CPtr context) : ExecutorFactoryLegacy(context) { + const std::vector& srcDescs, + const std::vector& dstDescs, + const ExecutorContext::CPtr context) + : ExecutorFactoryLegacy(context) { for (auto& desc : getPoolingExecutorsList()) { if (desc.builder->isSupported(poolingAttrs, srcDescs, dstDescs)) { supportedDescs.push_back(desc); @@ -36,9 +36,9 @@ class PoolingExecutorFactory : public ExecutorFactoryLegacy { ~PoolingExecutorFactory() = default; virtual PoolingExecutorPtr makeExecutor(const PoolingAttrs& poolingAttrs, - const std::vector& srcDescs, - const std::vector& dstDescs, - const dnnl::primitive_attr &attr) { + const std::vector& srcDescs, + const std::vector& dstDescs, + const dnnl::primitive_attr& attr) { auto build = [&](const PoolingExecutorDesc* desc) { auto executor = desc->builder->makeExecutor(context); if (executor->init(poolingAttrs, srcDescs, dstDescs, attr)) { @@ -49,7 +49,6 @@ class PoolingExecutorFactory : public ExecutorFactoryLegacy { return ptr; }; - if (chosenDesc) { if (auto executor = build(chosenDesc)) { return executor; @@ -74,5 +73,5 @@ class PoolingExecutorFactory : public ExecutorFactoryLegacy { using PoolingExecutorFactoryPtr = std::shared_ptr; using PoolingExecutorFactoryCPtr = std::shared_ptr; -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/executors/precision_matcher.cpp b/src/plugins/intel_cpu/src/nodes/executors/precision_matcher.cpp index 95044a9e205595..ced50dd2ec3dd5 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/precision_matcher.cpp +++ b/src/plugins/intel_cpu/src/nodes/executors/precision_matcher.cpp @@ -15,9 +15,12 @@ namespace intel_cpu { bool match(const InOutTypeMask& patterns, const InOutTypes& values) { assert(patterns.size() == 
values.size()); - return std::equal(values.begin(), values.end(), patterns.begin(), [](const ov::element::Type value, const TypeMask pattern) { - return pattern & value; - }); + return std::equal(values.begin(), + values.end(), + patterns.begin(), + [](const ov::element::Type value, const TypeMask pattern) { + return pattern & value; + }); return true; } diff --git a/src/plugins/intel_cpu/src/nodes/executors/precision_translation.cpp b/src/plugins/intel_cpu/src/nodes/executors/precision_translation.cpp index 73aac151843b08..36aab4f8fddc77 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/precision_translation.cpp +++ b/src/plugins/intel_cpu/src/nodes/executors/precision_translation.cpp @@ -14,7 +14,9 @@ namespace ov { namespace intel_cpu { -InOutTypes getTypeConfiguration(const MemoryDescArgs& descriptors, const TypeMapping& mapping, const MappingNotation& notation) { +InOutTypes getTypeConfiguration(const MemoryDescArgs& descriptors, + const TypeMapping& mapping, + const MappingNotation& notation) { InOutTypes types; std::transform(notation.begin(), notation.end(), std::back_inserter(types), [&descriptors](int id) { return descriptors.at(id)->getPrecision(); diff --git a/src/plugins/intel_cpu/src/nodes/executors/precision_translation.hpp b/src/plugins/intel_cpu/src/nodes/executors/precision_translation.hpp index 374b584dd0ffb5..20e613eea2c236 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/precision_translation.hpp +++ b/src/plugins/intel_cpu/src/nodes/executors/precision_translation.hpp @@ -18,24 +18,21 @@ namespace intel_cpu { template struct use { - ov::element::Type operator()(const std::vector& types, - size_t idx) const { + ov::element::Type operator()(const std::vector& types, size_t idx) const { assert(bypassId < types.size()); return types[bypassId]; } }; struct bypass { - ov::element::Type operator()(const std::vector& types, - size_t idx) const { + ov::element::Type operator()(const std::vector& types, size_t idx) const { return types[idx]; } }; template struct just { - ov::element::Type operator()(const std::vector& types, - size_t idx) const { + ov::element::Type operator()(const std::vector& types, size_t idx) const { // ignore everything (void)types; (void)idx; @@ -45,8 +42,7 @@ struct just { template <> struct just { - ov::element::Type operator()(const std::vector& types, - size_t idx) const { + ov::element::Type operator()(const std::vector& types, size_t idx) const { // ignore everything (void)types; (void)idx; @@ -58,11 +54,9 @@ using policy = std::function - PortsTranslation(Policies... policies) : - m_policies{policies...} {} + PortsTranslation(Policies... 
policies) : m_policies{policies...} {} - std::vector operator()( - const std::vector& types) const { + std::vector operator()(const std::vector& types) const { assert(types.size() == m_policies.size()); std::vector result; @@ -73,6 +67,7 @@ struct PortsTranslation { return result; } + private: std::vector m_policies; }; @@ -88,9 +83,7 @@ class TypeMappingEntry { public: using EnabledPredicate = std::function; - TypeMappingEntry(InOutTypeMask mask, - TypeTranslationFunction translation, - EnabledPredicate enabled = {}) + TypeMappingEntry(InOutTypeMask mask, TypeTranslationFunction translation, EnabledPredicate enabled = {}) : m_mask(std::move(mask)), m_translation(std::move(translation)), m_enabled(std::move(enabled)) {} @@ -121,7 +114,9 @@ using TypeMapping = std::vector; using MappingNotation = std::vector; using pt = PortsTranslation; -InOutTypes getTypeConfiguration(const MemoryDescArgs& descriptors, const TypeMapping& mapping, const MappingNotation& notation); +InOutTypes getTypeConfiguration(const MemoryDescArgs& descriptors, + const TypeMapping& mapping, + const MappingNotation& notation); } // namespace intel_cpu } // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/executors/printers.cpp b/src/plugins/intel_cpu/src/nodes/executors/printers.cpp index ac52b25a069541..1bce932225827d 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/printers.cpp +++ b/src/plugins/intel_cpu/src/nodes/executors/printers.cpp @@ -4,25 +4,27 @@ #ifdef CPU_DEBUG_CAPS -#include -#include "printers.hpp" -#include "post_ops.hpp" -#include "fullyconnected_config.hpp" +# include "printers.hpp" + +# include + +# include "fullyconnected_config.hpp" +# include "post_ops.hpp" namespace ov { namespace intel_cpu { -std::ostream & operator<<(std::ostream & os, const FCAttrs& attrs) { +std::ostream& operator<<(std::ostream& os, const FCAttrs& attrs) { // @todo print Attrs return os; } -std::ostream & operator<<(std::ostream & os, const PostOps& postOps) { +std::ostream& operator<<(std::ostream& os, const PostOps& postOps) { // @todo print PostOps return os; } -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov -#endif // CPU_DEBUG_CAPS +#endif // CPU_DEBUG_CAPS diff --git a/src/plugins/intel_cpu/src/nodes/executors/printers.hpp b/src/plugins/intel_cpu/src/nodes/executors/printers.hpp index d37ab633ba8036..7a96550b3f225c 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/printers.hpp +++ b/src/plugins/intel_cpu/src/nodes/executors/printers.hpp @@ -3,25 +3,27 @@ // #ifdef CPU_DEBUG_CAPS -#pragma once +# pragma once -#include -#include "executor_config.hpp" +# include + +# include "executor_config.hpp" namespace ov { namespace intel_cpu { namespace executor { -template struct Config; +template +struct Config; } struct FCAttrs; -std::ostream & operator<<(std::ostream & os, const FCAttrs& attrs); -std::ostream & operator<<(std::ostream & os, const PostOps& postOps); +std::ostream& operator<<(std::ostream& os, const FCAttrs& attrs); +std::ostream& operator<<(std::ostream& os, const PostOps& postOps); -template -std::ostream & operator<<(std::ostream & os, const executor::Config& config) { +template +std::ostream& operator<<(std::ostream& os, const executor::Config& config) { for (const auto& desc : config.descs) { const auto id = desc.first; const auto descPtr = desc.second; @@ -34,7 +36,7 @@ std::ostream & operator<<(std::ostream & os, const executor::Config& conf return os; } -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov 
-#endif // CPU_DEBUG_CAPS +#endif // CPU_DEBUG_CAPS diff --git a/src/plugins/intel_cpu/src/nodes/executors/reduce.cpp b/src/plugins/intel_cpu/src/nodes/executors/reduce.cpp index 8e091f0282eb5d..6039813d8fdd28 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/reduce.cpp +++ b/src/plugins/intel_cpu/src/nodes/executors/reduce.cpp @@ -9,5 +9,5 @@ namespace intel_cpu { ReduceExecutor::ReduceExecutor(const ExecutorContext::CPtr context) : context(context) {} -} // namespace intel_cpu -} // namespace ov \ No newline at end of file +} // namespace intel_cpu +} // namespace ov \ No newline at end of file diff --git a/src/plugins/intel_cpu/src/nodes/executors/reduce.hpp b/src/plugins/intel_cpu/src/nodes/executors/reduce.hpp index 8aa6e8f0aaa4ac..21b730a197df3a 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/reduce.hpp +++ b/src/plugins/intel_cpu/src/nodes/executors/reduce.hpp @@ -5,9 +5,9 @@ #pragma once #include "cpu_memory.h" -#include "onednn/iml_type_mapper.h" #include "dnnl_scratch_pad.h" #include "executor.hpp" +#include "onednn/iml_type_mapper.h" namespace ov { namespace intel_cpu { @@ -24,9 +24,11 @@ class ReduceExecutor { virtual bool init(const ReduceAttrs& reduceAttrs, const std::vector& srcDescs, const std::vector& dstDescs, - const dnnl::primitive_attr &attr) = 0; + const dnnl::primitive_attr& attr) = 0; - virtual void exec(const std::vector& src, const std::vector& dst, const void *post_ops_data_) = 0; + virtual void exec(const std::vector& src, + const std::vector& dst, + const void* post_ops_data_) = 0; virtual ~ReduceExecutor() = default; virtual impl_desc_type getImplType() const = 0; @@ -51,5 +53,5 @@ class ReduceExecutorBuilder { using ReduceExecutorBuilderPtr = std::shared_ptr; using ReduceExecutorBuilderCPtr = std::shared_ptr; -} // namespace intel_cpu -} // namespace ov \ No newline at end of file +} // namespace intel_cpu +} // namespace ov \ No newline at end of file diff --git a/src/plugins/intel_cpu/src/nodes/executors/reduce_list.cpp b/src/plugins/intel_cpu/src/nodes/executors/reduce_list.cpp index aec5c7eb905865..e6f035879a2cc6 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/reduce_list.cpp +++ b/src/plugins/intel_cpu/src/nodes/executors/reduce_list.cpp @@ -9,11 +9,10 @@ namespace intel_cpu { const std::vector& getReduceExecutorsList() { static std::vector descs = { - OV_CPU_INSTANCE_ACL(ExecutorType::Acl, std::make_shared()) - }; + OV_CPU_INSTANCE_ACL(ExecutorType::Acl, std::make_shared())}; return descs; } -} // namespace intel_cpu -} // namespace ov \ No newline at end of file +} // namespace intel_cpu +} // namespace ov \ No newline at end of file diff --git a/src/plugins/intel_cpu/src/nodes/executors/reduce_list.hpp b/src/plugins/intel_cpu/src/nodes/executors/reduce_list.hpp index ea2543a495e64c..faffdebc947c02 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/reduce_list.hpp +++ b/src/plugins/intel_cpu/src/nodes/executors/reduce_list.hpp @@ -5,14 +5,13 @@ #pragma once #include "executor.hpp" - #include "reduce.hpp" #if defined(OV_CPU_WITH_ACL) -#include "acl/acl_reduce.hpp" +# include "acl/acl_reduce.hpp" #endif -#include "onednn/iml_type_mapper.h" #include "common/primitive_cache.hpp" +#include "onednn/iml_type_mapper.h" namespace ov { namespace intel_cpu { @@ -27,9 +26,10 @@ const std::vector& getReduceExecutorsList(); class ReduceExecutorFactory : public ExecutorFactoryLegacy { public: ReduceExecutorFactory(const ReduceAttrs& reduceAttrs, - const std::vector& srcDescs, - const std::vector& dstDescs, - const ExecutorContext::CPtr context) 
: ExecutorFactoryLegacy(context) { + const std::vector& srcDescs, + const std::vector& dstDescs, + const ExecutorContext::CPtr context) + : ExecutorFactoryLegacy(context) { for (auto& desc : getReduceExecutorsList()) { if (desc.builder->isSupported(reduceAttrs, srcDescs, dstDescs)) { supportedDescs.push_back(desc); @@ -39,9 +39,9 @@ class ReduceExecutorFactory : public ExecutorFactoryLegacy { ~ReduceExecutorFactory() = default; virtual ReduceExecutorPtr makeExecutor(const ReduceAttrs& reduceAttrs, - const std::vector& srcDescs, - const std::vector& dstDescs, - const dnnl::primitive_attr &attr) { + const std::vector& srcDescs, + const std::vector& dstDescs, + const dnnl::primitive_attr& attr) { auto build = [&](const ReduceExecutorDesc* desc) { auto executor = desc->builder->makeExecutor(context); if (executor->init(reduceAttrs, srcDescs, dstDescs, attr)) { @@ -52,7 +52,6 @@ class ReduceExecutorFactory : public ExecutorFactoryLegacy { return ptr; }; - if (chosenDesc) { if (auto executor = build(chosenDesc)) { return executor; @@ -81,5 +80,5 @@ class ReduceExecutorFactory : public ExecutorFactoryLegacy { using ReduceExecutorFactoryPtr = std::shared_ptr; using ReduceExecutorFactoryCPtr = std::shared_ptr; -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/executors/transpose.cpp b/src/plugins/intel_cpu/src/nodes/executors/transpose.cpp index 57e2e028827a62..b63e32e39ebf8d 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/transpose.cpp +++ b/src/plugins/intel_cpu/src/nodes/executors/transpose.cpp @@ -2,9 +2,11 @@ // SPDX-License-Identifier: Apache-2.0 // +#include "transpose.hpp" + #include + #include "openvino/core/parallel.hpp" -#include "transpose.hpp" namespace ov { namespace intel_cpu { @@ -33,27 +35,27 @@ jit_permute_config_params TransposeExecutor::prepareParams(const PermuteParams& } for (int i = tmp_order.size() - 1; i >= 0; i--) { - int pos = std::distance(std::find( - src_block_order.rbegin(), src_block_order.rend(), tmp_order[i]), src_block_order.rend() - 1); + int pos = std::distance(std::find(src_block_order.rbegin(), src_block_order.rend(), tmp_order[i]), + src_block_order.rend() - 1); if (pos != -1) { new_src_block_strides[i] = src_block_strides[pos]; src_block_order.erase(src_block_order.begin() + pos); src_block_strides.erase(src_block_strides.begin() + pos); mask[i] = 0; } else { - new_src_block_strides[i] = new_src_block_strides[tmp_order.size() - 1] * params.dst_block_dims[tmp_order.size() - 1]; + new_src_block_strides[i] = + new_src_block_strides[tmp_order.size() - 1] * params.dst_block_dims[tmp_order.size() - 1]; mask[i] = 1; mask[tmp_order.size() - 1] = 1; } } if (!src_block_order.empty()) { int pos = std::distance(tmp_order.begin(), std::find(tmp_order.begin(), tmp_order.end(), src_block_order[0])); - new_src_block_strides.insert(new_src_block_strides.begin() + pos, - src_block_strides[0]); - new_dst_block_strides.insert(new_dst_block_strides.begin() + pos, - new_dst_block_strides[pos] * params.src_block_dims[params.src_block_dims.size() - 1]); - new_dst_block_order.insert(new_dst_block_order.begin() + pos, - new_dst_block_order[pos]); + new_src_block_strides.insert(new_src_block_strides.begin() + pos, src_block_strides[0]); + new_dst_block_strides.insert( + new_dst_block_strides.begin() + pos, + new_dst_block_strides[pos] * params.src_block_dims[params.src_block_dims.size() - 1]); + new_dst_block_order.insert(new_dst_block_order.begin() + pos, new_dst_block_order[pos]); 
new_dst_block_dims.insert(new_dst_block_dims.begin() + pos + 1, params.src_block_dims[params.src_block_dims.size() - 1]); new_dst_block_dims[pos] = div_up(new_dst_block_dims[pos], new_dst_block_dims[pos + 1]); @@ -107,12 +109,12 @@ jit_permute_config_params TransposeExecutor::prepareParams(const PermuteParams& } int max_threads = parallel_get_max_threads(); - const int n_max = 3; // max count dims for parallel + const int n_max = 3; // max count dims for parallel int n = 0; int work_amount = sorted_dst_dims[0]; for (size_t i = 1; i < sorted_dst_dims.size() && n < n_max; i++) { n++; - if (work_amount >= 4 * max_threads) { // 4 * max_threads is a specially selected value for best performance + if (work_amount >= 4 * max_threads) { // 4 * max_threads is a specially selected value for best performance break; } work_amount *= sorted_dst_dims[i]; @@ -128,5 +130,5 @@ jit_permute_config_params TransposeExecutor::prepareParams(const PermuteParams& return jcp; } -} // namespace intel_cpu -} // namespace ov \ No newline at end of file +} // namespace intel_cpu +} // namespace ov \ No newline at end of file diff --git a/src/plugins/intel_cpu/src/nodes/executors/transpose.hpp b/src/plugins/intel_cpu/src/nodes/executors/transpose.hpp index 15f2d5085cd5ad..99e0b0a2742a78 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/transpose.hpp +++ b/src/plugins/intel_cpu/src/nodes/executors/transpose.hpp @@ -5,9 +5,9 @@ #pragma once #include "cpu_memory.h" -#include "onednn/iml_type_mapper.h" #include "executor.hpp" #include "nodes/common/permute_kernel.h" +#include "onednn/iml_type_mapper.h" namespace ov { namespace intel_cpu { @@ -23,8 +23,9 @@ class TransposeExecutor : public Executor { virtual bool init(const TransposeParams& transposeParams, const std::vector& srcDescs, const std::vector& dstDescs, - const dnnl::primitive_attr &attr) = 0; + const dnnl::primitive_attr& attr) = 0; virtual ~TransposeExecutor() = default; + protected: PermuteParams permuteParams; const ExecutorContext::CPtr context; @@ -44,5 +45,5 @@ class TransposeExecutorBuilder { using TransposeExecutorBuilderPtr = std::shared_ptr; using TransposeExecutorBuilderCPtr = std::shared_ptr; -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/executors/transpose_list.cpp b/src/plugins/intel_cpu/src/nodes/executors/transpose_list.cpp index 31db070d04ffe3..f0e72f4bec1ae2 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/transpose_list.cpp +++ b/src/plugins/intel_cpu/src/nodes/executors/transpose_list.cpp @@ -9,20 +9,19 @@ namespace intel_cpu { const std::vector& getTransposeExecutorsList() { static const std::vector descs = { - OV_CPU_INSTANCE_COMMON(ExecutorType::Common, std::make_shared()) + OV_CPU_INSTANCE_COMMON(ExecutorType::Common, std::make_shared()) OV_CPU_INSTANCE_ACL(ExecutorType::Acl, std::make_shared()) - OV_CPU_INSTANCE_MLAS_ARM64(ExecutorType::Mlas, std::make_shared()) - OV_CPU_INSTANCE_X64(ExecutorType::jit_x64, std::make_shared()) - OV_CPU_INSTANCE_COMMON(ExecutorType::Common, std::make_shared()) - }; + OV_CPU_INSTANCE_MLAS_ARM64(ExecutorType::Mlas, std::make_shared()) + OV_CPU_INSTANCE_X64(ExecutorType::jit_x64, std::make_shared()) + OV_CPU_INSTANCE_COMMON(ExecutorType::Common, std::make_shared())}; return descs; } TransposeExecutorPtr TransposeExecutorFactory::makeExecutor(const TransposeParams& transposeParams, - const std::vector& srcDescs, - const std::vector& dstDescs, - const dnnl::primitive_attr &attr) { + const std::vector& srcDescs, + 
const std::vector& dstDescs, + const dnnl::primitive_attr& attr) { auto build = [&](const TransposeExecutorDesc* desc) { auto executor = desc->builder->makeExecutor(context); if (executor->init(transposeParams, srcDescs, dstDescs, attr)) { @@ -48,5 +47,5 @@ TransposeExecutorPtr TransposeExecutorFactory::makeExecutor(const TransposeParam OPENVINO_THROW("Supported executor is not found"); } -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/executors/transpose_list.hpp b/src/plugins/intel_cpu/src/nodes/executors/transpose_list.hpp index 90141a6194592e..c81769fd1d0539 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/transpose_list.hpp +++ b/src/plugins/intel_cpu/src/nodes/executors/transpose_list.hpp @@ -5,19 +5,17 @@ #pragma once #include "executor.hpp" - #include "transpose.hpp" #if defined(OV_CPU_WITH_ACL) -#include "acl/acl_transpose.hpp" +# include "acl/acl_transpose.hpp" #endif +#include "common/primitive_cache.hpp" #include "common/ref_opt_transpose.hpp" #include "common/ref_transpose.hpp" #include "mlas/mlas_transpose.hpp" -#include "x64/jit_transpose.hpp" - #include "onednn/iml_type_mapper.h" -#include "common/primitive_cache.hpp" +#include "x64/jit_transpose.hpp" namespace ov { namespace intel_cpu { @@ -31,22 +29,23 @@ const std::vector& getTransposeExecutorsList(); class TransposeExecutorFactory : public ExecutorFactoryLegacy { public: -TransposeExecutorFactory(const TransposeParams& transposeParams, - const std::vector& srcDescs, - const std::vector& dstDescs, - const ExecutorContext::CPtr context) : ExecutorFactoryLegacy(context) { - for (auto& desc : getTransposeExecutorsList()) { - if (desc.builder->isSupported(transposeParams, srcDescs, dstDescs)) { - supportedDescs.push_back(desc); + TransposeExecutorFactory(const TransposeParams& transposeParams, + const std::vector& srcDescs, + const std::vector& dstDescs, + const ExecutorContext::CPtr context) + : ExecutorFactoryLegacy(context) { + for (auto& desc : getTransposeExecutorsList()) { + if (desc.builder->isSupported(transposeParams, srcDescs, dstDescs)) { + supportedDescs.push_back(desc); + } } } -} -~TransposeExecutorFactory() = default; -virtual TransposeExecutorPtr makeExecutor(const TransposeParams& transposeParams, - const std::vector& srcDescs, - const std::vector& dstDescs, - const dnnl::primitive_attr &attr); + ~TransposeExecutorFactory() = default; + virtual TransposeExecutorPtr makeExecutor(const TransposeParams& transposeParams, + const std::vector& srcDescs, + const std::vector& dstDescs, + const dnnl::primitive_attr& attr); private: std::vector supportedDescs; @@ -56,5 +55,5 @@ virtual TransposeExecutorPtr makeExecutor(const TransposeParams& transposeParams using TransposeExecutorFactoryPtr = std::shared_ptr; using TransposeExecutorFactoryCPtr = std::shared_ptr; -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/executors/type_mask.hpp b/src/plugins/intel_cpu/src/nodes/executors/type_mask.hpp index d492bd6b6f368a..ef9fdac7f19208 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/type_mask.hpp +++ b/src/plugins/intel_cpu/src/nodes/executors/type_mask.hpp @@ -14,29 +14,29 @@ namespace intel_cpu { struct TypeMask { enum Value : uint64_t { _undefined = 1 << 0, - _dynamic = 1 << 1, - _boolean = 1 << 2, - _bf16 = 1 << 3, - _f16 = 1 << 4, - _f32 = 1 << 5, - _f64 = 1 << 6, - _i4 = 1 << 7, - _i8 = 1 << 8, - _i16 = 1 << 9, - _i32 = 1 << 10, - _i64 = 
1 << 11, - _u1 = 1 << 12, - _u4 = 1 << 13, - _u8 = 1 << 14, - _u16 = 1 << 15, - _u32 = 1 << 16, - _u64 = 1 << 17, - _nf4 = 1 << 18, - _f8e4m3 = 1 << 19, - _f8e5m2 = 1 << 20, - _string = 1 << 21, - _f4e2m1 = 1 << 22, - _f8e8m0 = 1 << 23, + _dynamic = 1 << 1, + _boolean = 1 << 2, + _bf16 = 1 << 3, + _f16 = 1 << 4, + _f32 = 1 << 5, + _f64 = 1 << 6, + _i4 = 1 << 7, + _i8 = 1 << 8, + _i16 = 1 << 9, + _i32 = 1 << 10, + _i64 = 1 << 11, + _u1 = 1 << 12, + _u4 = 1 << 13, + _u8 = 1 << 14, + _u16 = 1 << 15, + _u32 = 1 << 16, + _u64 = 1 << 17, + _nf4 = 1 << 18, + _f8e4m3 = 1 << 19, + _f8e5m2 = 1 << 20, + _string = 1 << 21, + _f4e2m1 = 1 << 22, + _f8e8m0 = 1 << 23, }; TypeMask(const ov::element::Type precision) : value(generateMask(precision)), precision(precision) {} diff --git a/src/plugins/intel_cpu/src/nodes/executors/variable_executor.hpp b/src/plugins/intel_cpu/src/nodes/executors/variable_executor.hpp new file mode 100644 index 00000000000000..8dfb7a4c63fde4 --- /dev/null +++ b/src/plugins/intel_cpu/src/nodes/executors/variable_executor.hpp @@ -0,0 +1,140 @@ +// Copyright (C) 2018-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "executor.hpp" +#include "executor_config.hpp" +#include "executor_implementation.hpp" +#include "nodes/executors/graph_emitter.hpp" + +namespace ov { +namespace intel_cpu { + +/** + * A stateful (variable) executor + * Contains two or more executors. + * Switches between the executors based on provided Memory (more precisely based on in / out shapes) + */ +template +class VariableExecutor : public Executor { +public: + using ExecutorImplementationRef = std::reference_wrapper>; + + VariableExecutor(const MemoryArgs& memory, + const Attrs& attrs, + const PostOps& postOps, + const ExecutorContext::CPtr context, + std::vector suitableImplementations) + : m_attrs(attrs), + m_postOps(postOps), + m_context(context), + m_suitableImplementations(std::move(suitableImplementations)), + m_implementationRequiresFallback( + cacheFallbackStatus(m_suitableImplementations, + GraphEmitter::createConfig(memory, m_attrs, m_postOps))), + m_executors(m_suitableImplementations.size()) { + const size_t implId = select(memory, 0); + m_executors[implId] = create(implId, memory); + m_implId = implId; + } + + bool update(const MemoryArgs& memory) override { + for (auto implId = select(memory, 0); implId < m_suitableImplementations.size(); + implId = select(memory, implId)) { + if (!m_executors[implId]) { + m_executors[implId] = create(implId, memory); + } + + if (m_executors[implId]->update(memory)) { + m_implId = implId; + return true; + } + } + + return false; + } + + void execute(const MemoryArgs& memory) override { + m_executors[m_implId]->execute(memory); + } + + impl_desc_type implType() const override { + return m_executors[m_implId]->implType(); + } + + void moveMemToNumaNode(int numaID) override { + m_executors[m_implId]->moveMemToNumaNode(numaID); + } + +private: + /** + * @brief Returns a fallback status for each suitable implementation. 
+ */ + static std::vector cacheFallbackStatus(const std::vector& suitableImplementations, + const executor::Config& config) { + std::vector implementationRequiresFallback(suitableImplementations.size()); + std::transform(suitableImplementations.begin(), + suitableImplementations.end(), + implementationRequiresFallback.begin(), + [&config](const ExecutorImplementationRef& impl) { + return impl.get().requiresFallback(config); + }); + + return implementationRequiresFallback; + } + + size_t select(const MemoryArgs& memory, const size_t startIdx) const { + OPENVINO_ASSERT(startIdx < m_suitableImplementations.size(), + "Failed to find an implementation since start indx: ", + startIdx, + " is out of range of the suitable implementations array: ", + m_suitableImplementations.size()); + + auto startIt = m_suitableImplementations.begin() + startIdx; + + const auto selectedImplementation = + std::find_if(startIt, + m_suitableImplementations.end(), + [&memory](const ExecutorImplementationRef& implementation) { + return implementation.get().shapeAgnostic() || implementation.get().acceptsShapes(memory); + }); + + OPENVINO_ASSERT(selectedImplementation != m_suitableImplementations.end(), "Failed to select an implemetation"); + + return std::distance(m_suitableImplementations.begin(), selectedImplementation); + } + + ExecutorPtr create(const size_t implId, const MemoryArgs& memory) { + assert(implId < m_executors.size() && implId < m_suitableImplementations.size()); + + auto createWithFallback = [this](const size_t implId, const MemoryArgs& memory) { + const auto& impl = m_suitableImplementations[implId].get(); + + if (m_implementationRequiresFallback[implId]) { + auto config = GraphEmitter::createConfig(memory, m_attrs, m_postOps); + if (auto fallbackConfig = impl.requiresFallback(config)) { + return GraphEmitter::fallback(config, *fallbackConfig, memory, m_context, impl.name()); + } + } + + return impl.create(m_attrs, m_postOps, memory, m_context); + }; + + return createWithFallback(implId, memory); + } + + const Attrs& m_attrs; + const PostOps& m_postOps; + const ExecutorContext::CPtr m_context; + std::vector m_suitableImplementations; + // stores fallback status to avoid performing the check for every make() call + std::vector m_implementationRequiresFallback; + // executors cache + std::vector m_executors; + size_t m_implId; +}; + +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/executors/x64/jit_transpose.cpp b/src/plugins/intel_cpu/src/nodes/executors/x64/jit_transpose.cpp index bfcc7ad4ae672a..79c578aaacda61 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/x64/jit_transpose.cpp +++ b/src/plugins/intel_cpu/src/nodes/executors/x64/jit_transpose.cpp @@ -3,6 +3,7 @@ // #include "jit_transpose.hpp" + #include "cpu/x64/cpu_isa_traits.hpp" using namespace dnnl::impl::cpu; @@ -21,9 +22,10 @@ void JitTransposeExecutor::exec(const std::vector& src, const std::v pKernel->execute(srcData, dstData, MB); } -bool JitTransposeExecutor::init(const TransposeParams &transposeParams, - const std::vector &srcDescs, - const std::vector &dstDescs, const dnnl::primitive_attr &attr) { +bool JitTransposeExecutor::init(const TransposeParams& transposeParams, + const std::vector& srcDescs, + const std::vector& dstDescs, + const dnnl::primitive_attr& attr) { pKernel = std::make_shared(transposeParams.permuteParams); return true; } @@ -35,9 +37,9 @@ bool JitTransposeExecutorBuilder::isSupported(const TransposeParams& transposePa if (mayiuse(x64::sse41)) { return true; } -#endif // 
OPENVINO_ARCH_X86_64 +#endif // OPENVINO_ARCH_X86_64 return false; } -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/executors/x64/jit_transpose.hpp b/src/plugins/intel_cpu/src/nodes/executors/x64/jit_transpose.hpp index d37ac9e5db5ef5..fd6d54257f1489 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/x64/jit_transpose.hpp +++ b/src/plugins/intel_cpu/src/nodes/executors/x64/jit_transpose.hpp @@ -16,9 +16,12 @@ class JitTransposeExecutor : public TransposeExecutor { bool init(const TransposeParams& transposeParams, const std::vector& srcDescs, const std::vector& dstDescs, - const dnnl::primitive_attr &attr) override; + const dnnl::primitive_attr& attr) override; void exec(const std::vector& src, const std::vector& dst) override; - impl_desc_type implType() const override { return impl_desc_type::jit; } + impl_desc_type implType() const override { + return impl_desc_type::jit; + } + private: std::shared_ptr pKernel; }; @@ -33,5 +36,5 @@ class JitTransposeExecutorBuilder : public TransposeExecutorBuilder { } }; -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/experimental_detectron_detection_output.cpp b/src/plugins/intel_cpu/src/nodes/experimental_detectron_detection_output.cpp index 441e013af2cbbf..dc58aabe26635d 100644 --- a/src/plugins/intel_cpu/src/nodes/experimental_detectron_detection_output.cpp +++ b/src/plugins/intel_cpu/src/nodes/experimental_detectron_detection_output.cpp @@ -2,12 +2,13 @@ // SPDX-License-Identifier: Apache-2.0 // +#include "openvino/op/experimental_detectron_detection_output.hpp" + #include #include -#include "openvino/op/experimental_detectron_detection_output.hpp" -#include "openvino/core/parallel.hpp" #include "experimental_detectron_detection_output.h" +#include "openvino/core/parallel.hpp" namespace ov { namespace intel_cpu { @@ -36,13 +37,19 @@ struct Indexer { } }; -static -void refine_boxes(const float* boxes, const float* deltas, const float* weights, const float* scores, - float* refined_boxes, float* refined_boxes_areas, float* refined_scores, - const int rois_num, const int classes_num, - const float img_H, const float img_W, - const float max_delta_log_wh, - float coordinates_offset) { +static void refine_boxes(const float* boxes, + const float* deltas, + const float* weights, + const float* scores, + float* refined_boxes, + float* refined_boxes_areas, + float* refined_scores, + const int rois_num, + const int classes_num, + const float img_H, + const float img_W, + const float max_delta_log_wh, + float coordinates_offset) { Indexer box_idx({rois_num, 4}); Indexer delta_idx({rois_num, classes_num, 4}); Indexer score_idx({rois_num, classes_num}); @@ -114,21 +121,22 @@ static bool SortScorePairDescend(const std::pair>& pa return (pair1.first > pair2.first) || ((pair1.first == pair2.first) && (pair1.second.second < pair2.second.second)); } - struct ConfidenceComparator { explicit ConfidenceComparator(const float* conf_data) : _conf_data(conf_data) {} bool operator()(int idx1, int idx2) { - if (_conf_data[idx1] > _conf_data[idx2]) return true; - if (_conf_data[idx1] < _conf_data[idx2]) return false; + if (_conf_data[idx1] > _conf_data[idx2]) + return true; + if (_conf_data[idx1] < _conf_data[idx2]) + return false; return idx1 < idx2; } const float* _conf_data; }; -static inline float JaccardOverlap(const float *decoded_bbox, - const float *bbox_sizes, +static inline float 
JaccardOverlap(const float* decoded_bbox, + const float* bbox_sizes, const int idx1, const int idx2, const float coordinates_offset = 1) { @@ -151,7 +159,7 @@ static inline float JaccardOverlap(const float *decoded_bbox, float intersect_xmax = (std::min)(xmax1, xmax2); float intersect_ymax = (std::min)(ymax1, ymax2); - float intersect_width = intersect_xmax - intersect_xmin + coordinates_offset; + float intersect_width = intersect_xmax - intersect_xmin + coordinates_offset; float intersect_height = intersect_ymax - intersect_ymin + coordinates_offset; if (intersect_width <= 0 || intersect_height <= 0) { @@ -165,7 +173,6 @@ static inline float JaccardOverlap(const float *decoded_bbox, return intersect_size / (bbox1_size + bbox2_size - intersect_size); } - static void nms_cf(const float* conf_data, const float* bboxes, const float* sizes, @@ -187,8 +194,10 @@ static void nms_cf(const float* conf_data, int num_output_scores = (pre_nms_topn == -1 ? count : (std::min)(pre_nms_topn, count)); - std::partial_sort_copy(indices, indices + count, - buffer, buffer + num_output_scores, + std::partial_sort_copy(indices, + indices + count, + buffer, + buffer + num_output_scores, ConfidenceComparator(conf_data)); detections = 0; @@ -221,11 +230,13 @@ bool ExperimentalDetectronDetectionOutput::needPrepareParams() const { return false; } -bool ExperimentalDetectronDetectionOutput::isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept { +bool ExperimentalDetectronDetectionOutput::isSupportedOperation(const std::shared_ptr& op, + std::string& errorMessage) noexcept { try { const auto doOp = ov::as_type_ptr(op); if (!doOp) { - errorMessage = "Node is not an instance of the ExperimentalDetectronDetectionOutput from the operations set v6."; + errorMessage = + "Node is not an instance of the ExperimentalDetectronDetectionOutput from the operations set v6."; return false; } } catch (...) 
{ @@ -294,10 +305,17 @@ void ExperimentalDetectronDetectionOutput::execute(dnnl::stream strm) { Indexer refined_box_idx({classes_num_, rois_num, 4}); Indexer refined_score_idx({classes_num_, rois_num}); - refine_boxes(boxes, deltas, &deltas_weights_[0], scores, - &refined_boxes[0], &refined_boxes_areas[0], &refined_scores[0], - rois_num, classes_num_, - img_H, img_W, + refine_boxes(boxes, + deltas, + &deltas_weights_[0], + scores, + &refined_boxes[0], + &refined_boxes_areas[0], + &refined_scores[0], + rois_num, + classes_num_, + img_H, + img_W, max_delta_log_wh_, 1.0f); @@ -353,7 +371,7 @@ void ExperimentalDetectronDetectionOutput::execute(dnnl::stream strm) { memset(output_classes, 0, max_detections_per_image_ * sizeof(output_classes[0])); int i = 0; - for (const auto & detection : conf_index_class_map) { + for (const auto& detection : conf_index_class_map) { float score = detection.first; int cls = detection.second.first; int idx = detection.second.second; @@ -371,6 +389,6 @@ bool ExperimentalDetectronDetectionOutput::created() const { return getType() == Type::ExperimentalDetectronDetectionOutput; } -} // namespace node -} // namespace intel_cpu -} // namespace ov +} // namespace node +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/experimental_detectron_detection_output.h b/src/plugins/intel_cpu/src/nodes/experimental_detectron_detection_output.h index 2f76f1004face5..206f807585de7d 100644 --- a/src/plugins/intel_cpu/src/nodes/experimental_detectron_detection_output.h +++ b/src/plugins/intel_cpu/src/nodes/experimental_detectron_detection_output.h @@ -14,25 +14,27 @@ class ExperimentalDetectronDetectionOutput : public Node { public: ExperimentalDetectronDetectionOutput(const std::shared_ptr& op, const GraphContext::CPtr context); - void getSupportedDescriptors() override {}; + void getSupportedDescriptors() override{}; void initSupportedPrimitiveDescriptors() override; void execute(dnnl::stream strm) override; bool created() const override; bool needShapeInfer() const override; bool needPrepareParams() const override; - void executeDynamicImpl(dnnl::stream strm) override { execute(strm); } + void executeDynamicImpl(dnnl::stream strm) override { + execute(strm); + } static bool isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept; private: - const int INPUT_ROIS {0}; - const int INPUT_DELTAS {1}; - const int INPUT_SCORES {2}; - const int INPUT_IM_INFO {3}; + const int INPUT_ROIS{0}; + const int INPUT_DELTAS{1}; + const int INPUT_SCORES{2}; + const int INPUT_IM_INFO{3}; - const int OUTPUT_BOXES {0}; - const int OUTPUT_CLASSES {1}; - const int OUTPUT_SCORES {2}; + const int OUTPUT_BOXES{0}; + const int OUTPUT_CLASSES{1}; + const int OUTPUT_SCORES{2}; float score_threshold_; float nms_threshold_; @@ -44,6 +46,6 @@ class ExperimentalDetectronDetectionOutput : public Node { std::vector deltas_weights_; }; -} // namespace node -} // namespace intel_cpu -} // namespace ov +} // namespace node +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/experimental_detectron_generate_proposals_single_image.cpp b/src/plugins/intel_cpu/src/nodes/experimental_detectron_generate_proposals_single_image.cpp index 33f17c8d95f093..778e796aacc11a 100644 --- a/src/plugins/intel_cpu/src/nodes/experimental_detectron_generate_proposals_single_image.cpp +++ b/src/plugins/intel_cpu/src/nodes/experimental_detectron_generate_proposals_single_image.cpp @@ -2,22 +2,22 @@ // SPDX-License-Identifier: Apache-2.0 // 
-#include +#include #include #include +#include #include -#include #include -#include +#include #if defined(HAVE_AVX2) -#include +# include #endif -#include "openvino/op/experimental_detectron_generate_proposals.hpp" -#include "openvino/core/parallel.hpp" #include "common/cpu_memcpy.h" #include "experimental_detectron_generate_proposals_single_image.h" +#include "openvino/core/parallel.hpp" +#include "openvino/op/experimental_detectron_generate_proposals.hpp" namespace ov { namespace intel_cpu { @@ -29,20 +29,29 @@ struct Indexer4d { int dim23_; int dim123_; - explicit Indexer4d(int dim0, int dim1, int dim2, int dim3): - dim3_(dim3), dim23_(dim2 * dim3), dim123_(dim1 * dim2 * dim3) { + explicit Indexer4d(int dim0, int dim1, int dim2, int dim3) + : dim3_(dim3), + dim23_(dim2 * dim3), + dim123_(dim1 * dim2 * dim3) { (void)dim0; } int operator()(int i, int j, int k, int n) const { - return i * dim123_ + j * dim23_ + k * dim3_ + n; + return i * dim123_ + j * dim23_ + k * dim3_ + n; } }; -void refine_anchors(const float* deltas, const float* scores, const float* anchors, - float* proposals, const int anchors_num, const int bottom_H, - const int bottom_W, const float img_H, const float img_W, - const float min_box_H, const float min_box_W, +void refine_anchors(const float* deltas, + const float* scores, + const float* anchors, + float* proposals, + const int anchors_num, + const int bottom_H, + const int bottom_W, + const float img_H, + const float img_W, + const float min_box_H, + const float min_box_W, const float max_delta_log_wh, float coordinates_offset) { Indexer4d delta_idx(anchors_num, 4, bottom_H, bottom_W); @@ -108,17 +117,22 @@ void refine_anchors(const float* deltas, const float* scores, const float* ancho void unpack_boxes(const float* p_proposals, float* unpacked_boxes, int pre_nms_topn) { parallel_for(pre_nms_topn, [&](size_t i) { - unpacked_boxes[0*pre_nms_topn + i] = p_proposals[5*i + 0]; - unpacked_boxes[1*pre_nms_topn + i] = p_proposals[5*i + 1]; - unpacked_boxes[2*pre_nms_topn + i] = p_proposals[5*i + 2]; - unpacked_boxes[3*pre_nms_topn + i] = p_proposals[5*i + 3]; - unpacked_boxes[4*pre_nms_topn + i] = p_proposals[5*i + 4]; + unpacked_boxes[0 * pre_nms_topn + i] = p_proposals[5 * i + 0]; + unpacked_boxes[1 * pre_nms_topn + i] = p_proposals[5 * i + 1]; + unpacked_boxes[2 * pre_nms_topn + i] = p_proposals[5 * i + 2]; + unpacked_boxes[3 * pre_nms_topn + i] = p_proposals[5 * i + 3]; + unpacked_boxes[4 * pre_nms_topn + i] = p_proposals[5 * i + 4]; }); } -void nms_cpu(const int num_boxes, int is_dead[], - const float* boxes, int index_out[], int* const num_out, - const int base_index, const float nms_thresh, const int max_num_out, +void nms_cpu(const int num_boxes, + int is_dead[], + const float* boxes, + int index_out[], + int* const num_out, + const int base_index, + const float nms_thresh, + const int max_num_out, float coordinates_offset) { const int num_proposals = num_boxes; int count = 0; @@ -131,9 +145,9 @@ void nms_cpu(const int num_boxes, int is_dead[], std::memset(is_dead, 0, num_boxes * sizeof(int)); #if defined(HAVE_AVX2) - __m256 vc_fone = _mm256_set1_ps(coordinates_offset); + __m256 vc_fone = _mm256_set1_ps(coordinates_offset); __m256i vc_ione = _mm256_set1_epi32(1); - __m256 vc_zero = _mm256_set1_ps(0.0f); + __m256 vc_zero = _mm256_set1_ps(0.0f); __m256 vc_nms_thresh = _mm256_set1_ps(nms_thresh); #endif @@ -154,13 +168,13 @@ void nms_cpu(const int num_boxes, int is_dead[], __m256 vx1i = _mm256_set1_ps(x1[box]); __m256 vy1i = _mm256_set1_ps(y1[box]); - __m256 
vA_width = _mm256_sub_ps(vx1i, vx0i); + __m256 vA_width = _mm256_sub_ps(vx1i, vx0i); __m256 vA_height = _mm256_sub_ps(vy1i, vy0i); - __m256 vA_area = _mm256_mul_ps(_mm256_add_ps(vA_width, vc_fone), _mm256_add_ps(vA_height, vc_fone)); + __m256 vA_area = _mm256_mul_ps(_mm256_add_ps(vA_width, vc_fone), _mm256_add_ps(vA_height, vc_fone)); for (; tail <= num_boxes - 8; tail += 8) { - __m256i *pdst = reinterpret_cast<__m256i*>(is_dead + tail); - __m256i vdst = _mm256_loadu_si256(pdst); + __m256i* pdst = reinterpret_cast<__m256i*>(is_dead + tail); + __m256i vdst = _mm256_loadu_si256(pdst); __m256 vx0j = _mm256_loadu_ps(x0 + tail); __m256 vy0j = _mm256_loadu_ps(y0 + tail); @@ -172,13 +186,13 @@ void nms_cpu(const int num_boxes, int is_dead[], __m256 vx1 = _mm256_min_ps(vx1i, vx1j); __m256 vy1 = _mm256_min_ps(vy1i, vy1j); - __m256 vwidth = _mm256_add_ps(_mm256_sub_ps(vx1, vx0), vc_fone); + __m256 vwidth = _mm256_add_ps(_mm256_sub_ps(vx1, vx0), vc_fone); __m256 vheight = _mm256_add_ps(_mm256_sub_ps(vy1, vy0), vc_fone); __m256 varea = _mm256_mul_ps(_mm256_max_ps(vc_zero, vwidth), _mm256_max_ps(vc_zero, vheight)); - __m256 vB_width = _mm256_sub_ps(vx1j, vx0j); + __m256 vB_width = _mm256_sub_ps(vx1j, vx0j); __m256 vB_height = _mm256_sub_ps(vy1j, vy0j); - __m256 vB_area = _mm256_mul_ps(_mm256_add_ps(vB_width, vc_fone), _mm256_add_ps(vB_height, vc_fone)); + __m256 vB_area = _mm256_mul_ps(_mm256_add_ps(vB_width, vc_fone), _mm256_add_ps(vB_height, vc_fone)); __m256 vdivisor = _mm256_sub_ps(_mm256_add_ps(vA_area, vB_area), varea); __m256 vintersection_area = _mm256_div_ps(varea, vdivisor); @@ -219,9 +233,9 @@ void nms_cpu(const int num_boxes, int is_dead[], const float y1 = std::min(y1i, y1j); // intersection area - const float width = std::max(0.0f, x1 - x0 + coordinates_offset); - const float height = std::max(0.0f, y1 - y0 + coordinates_offset); - const float area = width * height; + const float width = std::max(0.0f, x1 - x0 + coordinates_offset); + const float height = std::max(0.0f, y1 - y0 + coordinates_offset); + const float area = width * height; // area of A, B const float A_area = (x1i - x0i + coordinates_offset) * (y1i - y0i + coordinates_offset); @@ -239,14 +253,18 @@ void nms_cpu(const int num_boxes, int is_dead[], *num_out = count; } -void fill_output_blobs(const float* proposals, const int* roi_indices, - float* rois, float* scores, - const int num_proposals, const int num_rois, const int post_nms_topn) { - const float *src_x0 = proposals + 0 * num_proposals; - const float *src_y0 = proposals + 1 * num_proposals; - const float *src_x1 = proposals + 2 * num_proposals; - const float *src_y1 = proposals + 3 * num_proposals; - const float *src_score = proposals + 4 * num_proposals; +void fill_output_blobs(const float* proposals, + const int* roi_indices, + float* rois, + float* scores, + const int num_proposals, + const int num_rois, + const int post_nms_topn) { + const float* src_x0 = proposals + 0 * num_proposals; + const float* src_y0 = proposals + 1 * num_proposals; + const float* src_x1 = proposals + 2 * num_proposals; + const float* src_y1 = proposals + 3 * num_proposals; + const float* src_score = proposals + 4 * num_proposals; parallel_for(num_rois, [&](size_t i) { int index = roi_indices[i]; @@ -269,10 +287,11 @@ void fill_output_blobs(const float* proposals, const int* roi_indices, } // namespace -bool ExperimentalDetectronGenerateProposalsSingleImage::isSupportedOperation - (const std::shared_ptr& op, std::string& errorMessage) noexcept { +bool 
ExperimentalDetectronGenerateProposalsSingleImage::isSupportedOperation(const std::shared_ptr& op, + std::string& errorMessage) noexcept { try { - const auto proposalOp = ov::as_type_ptr(op); + const auto proposalOp = + ov::as_type_ptr(op); if (!proposalOp) { errorMessage = "Node is not an instance of the Proposal from the operations set v0."; return false; @@ -313,8 +332,7 @@ void ExperimentalDetectronGenerateProposalsSingleImage::initSupportedPrimitiveDe {LayoutType::ncsp, ov::element::f32}, {LayoutType::ncsp, ov::element::f32}, {LayoutType::ncsp, ov::element::f32}}, - {{LayoutType::ncsp, ov::element::f32}, - {LayoutType::ncsp, ov::element::f32}}, + {{LayoutType::ncsp, ov::element::f32}, {LayoutType::ncsp, ov::element::f32}}, impl_desc_type::ref_any); } @@ -325,13 +343,13 @@ void ExperimentalDetectronGenerateProposalsSingleImage::execute(dnnl::stream str } size_t anchor_dims_size = 1; - const auto &anchorDims = getParentEdgeAt(INPUT_ANCHORS)->getMemory().getStaticDims(); + const auto& anchorDims = getParentEdgeAt(INPUT_ANCHORS)->getMemory().getStaticDims(); for (size_t i = 0; i < anchorDims.size(); i++) { anchor_dims_size *= anchorDims[i]; } size_t deltas_dims_size = 1; - const auto &deltaDims = getParentEdgeAt(INPUT_DELTAS)->getMemory().getStaticDims(); + const auto& deltaDims = getParentEdgeAt(INPUT_DELTAS)->getMemory().getStaticDims(); for (size_t i = 0; i < deltaDims.size(); i++) { deltas_dims_size *= deltaDims[i]; } @@ -339,7 +357,7 @@ void ExperimentalDetectronGenerateProposalsSingleImage::execute(dnnl::stream str OPENVINO_THROW("'Anchors' blob size for ONNXProposal is incompatible with 'deltas' blob size!"); size_t score_dims_size = 1; - const auto &scoreDims = getParentEdgeAt(INPUT_SCORES)->getMemory().getStaticDims(); + const auto& scoreDims = getParentEdgeAt(INPUT_SCORES)->getMemory().getStaticDims(); for (size_t i = 0; i < scoreDims.size(); i++) { score_dims_size *= scoreDims[i]; } @@ -347,13 +365,13 @@ void ExperimentalDetectronGenerateProposalsSingleImage::execute(dnnl::stream str OPENVINO_THROW("'Deltas' blob size for ONNXProposal is incompatible with 'scores' blob size!"); // Prepare memory - const float *p_deltas_item = getSrcDataAtPortAs(INPUT_DELTAS); - const float *p_scores_item = getSrcDataAtPortAs(INPUT_SCORES); - const float *p_anchors_item = getSrcDataAtPortAs(INPUT_ANCHORS); - const float *p_img_info_cpu = getSrcDataAtPortAs(INPUT_IM_INFO); + const float* p_deltas_item = getSrcDataAtPortAs(INPUT_DELTAS); + const float* p_scores_item = getSrcDataAtPortAs(INPUT_SCORES); + const float* p_anchors_item = getSrcDataAtPortAs(INPUT_ANCHORS); + const float* p_img_info_cpu = getSrcDataAtPortAs(INPUT_IM_INFO); - float *p_roi_item = getDstDataAtPortAs(OUTPUT_ROIS); - float *p_roi_score_item = getDstDataAtPortAs(OUTPUT_SCORES); + float* p_roi_item = getDstDataAtPortAs(OUTPUT_ROIS); + float* p_roi_score_item = getDstDataAtPortAs(OUTPUT_SCORES); const int anchors_num = scoreDims[0]; @@ -398,24 +416,45 @@ void ExperimentalDetectronGenerateProposalsSingleImage::execute(dnnl::stream str // Execute int batch_size = 1; // inputs[INPUT_DELTAS]->getTensorDesc().getDims()[0]; for (int n = 0; n < batch_size; ++n) { - refine_anchors(p_deltas_item, p_scores_item, p_anchors_item, - reinterpret_cast(&proposals_[0]), anchors_num, bottom_H, - bottom_W, img_H, img_W, - min_box_H, min_box_W, + refine_anchors(p_deltas_item, + p_scores_item, + p_anchors_item, + reinterpret_cast(&proposals_[0]), + anchors_num, + bottom_H, + bottom_W, + img_H, + img_W, + min_box_H, + min_box_W, 
static_cast(std::log(1000. / 16.)), 1.0f); - std::partial_sort(proposals_.begin(), proposals_.begin() + pre_nms_topn, proposals_.end(), - [](const ProposalBox &struct1, const ProposalBox &struct2) { + std::partial_sort(proposals_.begin(), + proposals_.begin() + pre_nms_topn, + proposals_.end(), + [](const ProposalBox& struct1, const ProposalBox& struct2) { return (struct1.score > struct2.score); }); - unpack_boxes(reinterpret_cast(&proposals_[0]), &unpacked_boxes[0], pre_nms_topn); - nms_cpu(pre_nms_topn, &is_dead[0], &unpacked_boxes[0], &roi_indices_[0], &num_rois, 0, - nms_thresh_, post_nms_topn_, coordinates_offset); - fill_output_blobs(&unpacked_boxes[0], &roi_indices_[0], p_roi_item, p_roi_score_item, - pre_nms_topn, num_rois, post_nms_topn_); + unpack_boxes(reinterpret_cast(&proposals_[0]), &unpacked_boxes[0], pre_nms_topn); + nms_cpu(pre_nms_topn, + &is_dead[0], + &unpacked_boxes[0], + &roi_indices_[0], + &num_rois, + 0, + nms_thresh_, + post_nms_topn_, + coordinates_offset); + fill_output_blobs(&unpacked_boxes[0], + &roi_indices_[0], + p_roi_item, + p_roi_score_item, + pre_nms_topn, + num_rois, + post_nms_topn_); } - } catch (const std::exception &e) { + } catch (const std::exception& e) { std::string errorMsg = e.what(); OPENVINO_THROW(errorMsg); } @@ -433,6 +472,6 @@ bool ExperimentalDetectronGenerateProposalsSingleImage::needPrepareParams() cons return false; } -} // namespace node -} // namespace intel_cpu -} // namespace ov +} // namespace node +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/experimental_detectron_generate_proposals_single_image.h b/src/plugins/intel_cpu/src/nodes/experimental_detectron_generate_proposals_single_image.h index 41aaf63f637e76..d747813e10b258 100644 --- a/src/plugins/intel_cpu/src/nodes/experimental_detectron_generate_proposals_single_image.h +++ b/src/plugins/intel_cpu/src/nodes/experimental_detectron_generate_proposals_single_image.h @@ -13,16 +13,18 @@ namespace node { class ExperimentalDetectronGenerateProposalsSingleImage : public Node { public: ExperimentalDetectronGenerateProposalsSingleImage(const std::shared_ptr& op, - const GraphContext::CPtr context); + const GraphContext::CPtr context); - void getSupportedDescriptors() override {}; + void getSupportedDescriptors() override{}; void initSupportedPrimitiveDescriptors() override; void execute(dnnl::stream strm) override; bool created() const override; bool needShapeInfer() const override; bool needPrepareParams() const override; - void executeDynamicImpl(dnnl::stream strm) override { execute(strm); } + void executeDynamicImpl(dnnl::stream strm) override { + execute(strm); + } static bool isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept; private: @@ -32,12 +34,12 @@ class ExperimentalDetectronGenerateProposalsSingleImage : public Node { // Outputs: // top_rois, shape [max_rois, 4] - const int INPUT_IM_INFO {0}; - const int INPUT_ANCHORS {1}; - const int INPUT_DELTAS {2}; - const int INPUT_SCORES {3}; - const int OUTPUT_ROIS {0}; - const int OUTPUT_SCORES {1}; + const int INPUT_IM_INFO{0}; + const int INPUT_ANCHORS{1}; + const int INPUT_DELTAS{2}; + const int INPUT_SCORES{3}; + const int OUTPUT_ROIS{0}; + const int OUTPUT_SCORES{1}; float min_size_; int pre_nms_topn_; @@ -48,6 +50,6 @@ class ExperimentalDetectronGenerateProposalsSingleImage : public Node { std::vector roi_indices_; }; -} // namespace node -} // namespace intel_cpu -} // namespace ov +} // namespace node +} // namespace intel_cpu +} // namespace ov 
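For readers following the proposal-generation hunks above: the AVX2 and scalar branches of nms_cpu implement the same greedy suppression, using the convention that a box side length is (x1 - x0 + coordinates_offset), and they run after the candidates have been score-sorted via std::partial_sort. The short sketch below spells that logic out in scalar form. It is an illustrative reference only, not code from this patch; Box, iou and nms_reference are hypothetical names introduced here, and boxes are assumed to arrive already sorted by descending score.

// Minimal scalar sketch of greedy NMS with the coordinates_offset convention.
// Hypothetical helper names; not part of the intel_cpu plugin.
#include <algorithm>
#include <cstddef>
#include <vector>

struct Box {
    float x0, y0, x1, y1;
};

static float iou(const Box& a, const Box& b, float offset) {
    // Intersection rectangle, clamped to zero size when boxes do not overlap.
    const float ix0 = std::max(a.x0, b.x0);
    const float iy0 = std::max(a.y0, b.y0);
    const float ix1 = std::min(a.x1, b.x1);
    const float iy1 = std::min(a.y1, b.y1);
    const float iw = std::max(0.0f, ix1 - ix0 + offset);
    const float ih = std::max(0.0f, iy1 - iy0 + offset);
    const float inter = iw * ih;
    const float area_a = (a.x1 - a.x0 + offset) * (a.y1 - a.y0 + offset);
    const float area_b = (b.x1 - b.x0 + offset) * (b.y1 - b.y0 + offset);
    return inter / (area_a + area_b - inter);
}

// Boxes are assumed to be pre-sorted by score in descending order.
static std::vector<int> nms_reference(const std::vector<Box>& boxes,
                                      float nms_thresh,
                                      int max_num_out,
                                      float offset = 1.0f) {
    std::vector<char> is_dead(boxes.size(), 0);
    std::vector<int> keep;
    for (size_t i = 0; i < boxes.size() && static_cast<int>(keep.size()) < max_num_out; ++i) {
        if (is_dead[i])
            continue;
        keep.push_back(static_cast<int>(i));  // highest-scoring survivor so far
        for (size_t j = i + 1; j < boxes.size(); ++j) {
            // Suppress any remaining box that overlaps the kept one too strongly.
            if (!is_dead[j] && iou(boxes[i], boxes[j], offset) > nms_thresh)
                is_dead[j] = 1;
        }
    }
    return keep;
}

The vectorized path in the patch differs only in mechanics: it processes eight candidate boxes per _mm256 iteration and writes suppression flags into the is_dead array, while the scalar tail loop above matches its per-element behaviour.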
diff --git a/src/plugins/intel_cpu/src/nodes/experimental_detectron_priorgridgenerator.cpp b/src/plugins/intel_cpu/src/nodes/experimental_detectron_priorgridgenerator.cpp index eead95def0a8fb..f7df0e533778ed 100644 --- a/src/plugins/intel_cpu/src/nodes/experimental_detectron_priorgridgenerator.cpp +++ b/src/plugins/intel_cpu/src/nodes/experimental_detectron_priorgridgenerator.cpp @@ -2,20 +2,22 @@ // SPDX-License-Identifier: Apache-2.0 // -#include +#include "experimental_detectron_priorgridgenerator.h" #include +#include + #include "openvino/core/parallel.hpp" -#include "experimental_detectron_priorgridgenerator.h" namespace ov { namespace intel_cpu { namespace node { bool ExperimentalDetectronPriorGridGenerator::isSupportedOperation(const std::shared_ptr& op, - std::string& errorMessage) noexcept { + std::string& errorMessage) noexcept { try { - const auto priorGridGen = std::dynamic_pointer_cast(op); + const auto priorGridGen = + std::dynamic_pointer_cast(op); if (!priorGridGen) { errorMessage = "Only opset6 ExperimentalDetectronPriorGridGenerator operation is supported"; return false; @@ -39,7 +41,7 @@ ExperimentalDetectronPriorGridGenerator::ExperimentalDetectronPriorGridGenerator if (getOriginalInputsNumber() != 3 || getOriginalOutputsNumber() != 1) OPENVINO_THROW(errorPrefix, " has incorrect number of input/output edges!"); - const auto &attr = priorGridGen->get_attrs(); + const auto& attr = priorGridGen->get_attrs(); grid_w_ = attr.w; grid_h_ = attr.h; stride_h_ = attr.stride_y; @@ -64,11 +66,15 @@ void ExperimentalDetectronPriorGridGenerator::execute(dnnl::stream strm) { // Execute const int layer_width = grid_w_ ? grid_w_ : getParentEdgeAt(INPUT_FEATUREMAP)->getMemory().getStaticDims()[3]; const int layer_height = grid_h_ ? grid_h_ : getParentEdgeAt(INPUT_FEATUREMAP)->getMemory().getStaticDims()[2]; - const float step_w = stride_w_ ? stride_w_ : static_cast(getParentEdgeAt(INPUT_IMAGE)->getMemory().getStaticDims()[3]) / layer_width; - const float step_h = stride_h_ ? stride_h_ : static_cast(getParentEdgeAt(INPUT_IMAGE)->getMemory().getStaticDims()[2]) / layer_height; + const float step_w = + stride_w_ ? stride_w_ + : static_cast(getParentEdgeAt(INPUT_IMAGE)->getMemory().getStaticDims()[3]) / layer_width; + const float step_h = + stride_h_ ? 
stride_h_ + : static_cast(getParentEdgeAt(INPUT_IMAGE)->getMemory().getStaticDims()[2]) / layer_height; - const auto *bottom_data_0 = getSrcDataAtPortAs(0); - auto *top_data_0 = getDstDataAtPortAs(OUTPUT_ROIS); + const auto* bottom_data_0 = getSrcDataAtPortAs(0); + auto* top_data_0 = getDstDataAtPortAs(OUTPUT_ROIS); for (int h = 0; h < layer_height; ++h) { for (int w = 0; w < layer_width; ++w) { @@ -91,6 +97,6 @@ bool ExperimentalDetectronPriorGridGenerator::needPrepareParams() const { return false; } -} // namespace node -} // namespace intel_cpu -} // namespace ov +} // namespace node +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/experimental_detectron_priorgridgenerator.h b/src/plugins/intel_cpu/src/nodes/experimental_detectron_priorgridgenerator.h index cf52b4e5c9b934..47c2c16dc558b9 100644 --- a/src/plugins/intel_cpu/src/nodes/experimental_detectron_priorgridgenerator.h +++ b/src/plugins/intel_cpu/src/nodes/experimental_detectron_priorgridgenerator.h @@ -14,13 +14,15 @@ class ExperimentalDetectronPriorGridGenerator : public Node { public: ExperimentalDetectronPriorGridGenerator(const std::shared_ptr& op, const GraphContext::CPtr context); - void getSupportedDescriptors() override {}; + void getSupportedDescriptors() override{}; void initSupportedPrimitiveDescriptors() override; void execute(dnnl::stream strm) override; bool created() const override; bool needPrepareParams() const override; - void executeDynamicImpl(dnnl::stream strm) override { execute(strm); } + void executeDynamicImpl(dnnl::stream strm) override { + execute(strm); + } static bool isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept; private: @@ -31,11 +33,11 @@ class ExperimentalDetectronPriorGridGenerator : public Node { // Outputs: // priors_grid, shape [m, 4] - const int INPUT_PRIORS {0}; - const int INPUT_FEATUREMAP {1}; - const int INPUT_IMAGE {2}; + const int INPUT_PRIORS{0}; + const int INPUT_FEATUREMAP{1}; + const int INPUT_IMAGE{2}; - const int OUTPUT_ROIS {0}; + const int OUTPUT_ROIS{0}; int grid_w_; int grid_h_; @@ -45,6 +47,6 @@ class ExperimentalDetectronPriorGridGenerator : public Node { std::string errorPrefix; }; -} // namespace node -} // namespace intel_cpu -} // namespace ov +} // namespace node +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/experimental_detectron_roifeatureextractor.cpp b/src/plugins/intel_cpu/src/nodes/experimental_detectron_roifeatureextractor.cpp index c92e3c2594d4a9..05f2202537f986 100644 --- a/src/plugins/intel_cpu/src/nodes/experimental_detectron_roifeatureextractor.cpp +++ b/src/plugins/intel_cpu/src/nodes/experimental_detectron_roifeatureextractor.cpp @@ -2,14 +2,15 @@ // SPDX-License-Identifier: Apache-2.0 // +#include "experimental_detectron_roifeatureextractor.h" + +#include +#include #include #include -#include -#include -#include "openvino/core/parallel.hpp" #include "common/cpu_memcpy.h" -#include "experimental_detectron_roifeatureextractor.h" +#include "openvino/core/parallel.hpp" namespace ov { namespace intel_cpu { @@ -30,31 +31,28 @@ struct PreCalc { }; template -void pre_calc_for_bilinear_interpolate( - const int height, - const int width, - const int pooled_height, - const int pooled_width, - const int iy_upper, - const int ix_upper, - T roi_start_h, - T roi_start_w, - T bin_size_h, - T bin_size_w, - int roi_bin_grid_h, - int roi_bin_grid_w, - std::vector>& pre_calc) { +void pre_calc_for_bilinear_interpolate(const int height, + const int width, 
+ const int pooled_height, + const int pooled_width, + const int iy_upper, + const int ix_upper, + T roi_start_h, + T roi_start_w, + T bin_size_h, + T bin_size_w, + int roi_bin_grid_h, + int roi_bin_grid_w, + std::vector>& pre_calc) { int pre_calc_index = 0; for (int ph = 0; ph < pooled_height; ph++) { for (int pw = 0; pw < pooled_width; pw++) { for (int iy = 0; iy < iy_upper; iy++) { const T yy = roi_start_h + ph * bin_size_h + - static_cast(iy + .5f) * bin_size_h / - static_cast(roi_bin_grid_h); // e.g., 0.5, 1.5 + static_cast(iy + .5f) * bin_size_h / static_cast(roi_bin_grid_h); // e.g., 0.5, 1.5 for (int ix = 0; ix < ix_upper; ix++) { const T xx = roi_start_w + pw * bin_size_w + - static_cast(ix + .5f) * bin_size_w / - static_cast(roi_bin_grid_w); + static_cast(ix + .5f) * bin_size_w / static_cast(roi_bin_grid_w); T x = xx; T y = yy; @@ -126,19 +124,18 @@ void pre_calc_for_bilinear_interpolate( } template -void ROIAlignForward_cpu_kernel( - const int nthreads, - const T* bottom_data, - const T& spatial_scale, - const int channels, - const int height, - const int width, - const int pooled_height, - const int pooled_width, - const int sampling_ratio, - const T* bottom_rois, - const bool aligned, - T* top_data) { +void ROIAlignForward_cpu_kernel(const int nthreads, + const T* bottom_data, + const T& spatial_scale, + const int channels, + const int height, + const int width, + const int pooled_height, + const int pooled_width, + const int sampling_ratio, + const T* bottom_rois, + const bool aligned, + T* top_data) { int roi_cols = 4; int n_rois = nthreads / channels / pooled_width / pooled_height; @@ -168,38 +165,33 @@ void ROIAlignForward_cpu_kernel( T bin_size_w = static_cast(roi_width) / static_cast(pooled_width); // We use roi_bin_grid to sample the grid and mimic integral - int roi_bin_grid_h = (sampling_ratio > 0) - ? sampling_ratio - : static_cast(ceil(roi_height / pooled_height)); // e.g., = 2 - int roi_bin_grid_w = - (sampling_ratio > 0) ? sampling_ratio : static_cast(ceil(roi_width / pooled_width)); + int roi_bin_grid_h = + (sampling_ratio > 0) ? sampling_ratio : static_cast(ceil(roi_height / pooled_height)); // e.g., = 2 + int roi_bin_grid_w = (sampling_ratio > 0) ? sampling_ratio : static_cast(ceil(roi_width / pooled_width)); // We do average (integral) pooling inside a bin const T count = static_cast(roi_bin_grid_h * roi_bin_grid_w); // e.g. 
= 4 // we want to precalculate indices and weights shared by all chanels, // this is the key point of optimiation - std::vector> pre_calc( - roi_bin_grid_h * roi_bin_grid_w * pooled_width * pooled_height); - pre_calc_for_bilinear_interpolate( - height, - width, - pooled_height, - pooled_width, - roi_bin_grid_h, - roi_bin_grid_w, - roi_start_h, - roi_start_w, - bin_size_h, - bin_size_w, - roi_bin_grid_h, - roi_bin_grid_w, - pre_calc); + std::vector> pre_calc(roi_bin_grid_h * roi_bin_grid_w * pooled_width * pooled_height); + pre_calc_for_bilinear_interpolate(height, + width, + pooled_height, + pooled_width, + roi_bin_grid_h, + roi_bin_grid_w, + roi_start_h, + roi_start_w, + bin_size_h, + bin_size_w, + roi_bin_grid_h, + roi_bin_grid_w, + pre_calc); for (int c = 0; c < channels; c++) { int index_n_c = index_n + c * pooled_width * pooled_height; - const T* offset_bottom_data = - bottom_data + (roi_batch_ind * channels + c) * height * width; + const T* offset_bottom_data = bottom_data + (roi_batch_ind * channels + c) * height * width; int pre_calc_index = 0; for (int ph = 0; ph < pooled_height; ph++) { @@ -210,10 +202,8 @@ void ROIAlignForward_cpu_kernel( for (int iy = 0; iy < roi_bin_grid_h; iy++) { for (int ix = 0; ix < roi_bin_grid_w; ix++) { PreCalc pc = pre_calc[pre_calc_index]; - output_val += pc.w1 * offset_bottom_data[pc.pos1] + - pc.w2 * offset_bottom_data[pc.pos2] + - pc.w3 * offset_bottom_data[pc.pos3] + - pc.w4 * offset_bottom_data[pc.pos4]; + output_val += pc.w1 * offset_bottom_data[pc.pos1] + pc.w2 * offset_bottom_data[pc.pos2] + + pc.w3 * offset_bottom_data[pc.pos3] + pc.w4 * offset_bottom_data[pc.pos4]; pre_calc_index += 1; } @@ -222,14 +212,12 @@ void ROIAlignForward_cpu_kernel( top_data[index] = output_val; } // for pw - } // for ph - } // for c + } // for ph + } // for c }); } - -void redistribute_rois(const float* rois, int* level_ids, - const int num_rois, const int levels_num) { +void redistribute_rois(const float* rois, int* level_ids, const int num_rois, const int levels_num) { const float canonical_scale = 224.0f; const int canonical_level = 2; @@ -252,11 +240,11 @@ void redistribute_rois(const float* rois, int* level_ids, } } - -void reord(const float* src_data, const int* ranks, const int n, const int step, float* dst_data, - int* dst_mapping) { +void reord(const float* src_data, const int* ranks, const int n, const int step, float* dst_data, int* dst_mapping) { std::iota(dst_mapping, dst_mapping + n, 0); - std::sort(dst_mapping, dst_mapping + n, [&ranks](size_t i1, size_t i2) {return ranks[i1] < ranks[i2];}); + std::sort(dst_mapping, dst_mapping + n, [&ranks](size_t i1, size_t i2) { + return ranks[i1] < ranks[i2]; + }); for (int i = 0; i < n; ++i) { const int j = dst_mapping[i]; assert(0 <= j && j < n); @@ -277,12 +265,13 @@ void split_points(const std::vector& ids, std::vector& rois_per_level, rois_per_level.insert(rois_per_level.begin(), 0); } -} // namespace +} // namespace bool ExperimentalDetectronROIFeatureExtractor::isSupportedOperation(const std::shared_ptr& op, - std::string& errorMessage) noexcept { + std::string& errorMessage) noexcept { try { - const auto roiFeatureExtractor = std::dynamic_pointer_cast(op); + const auto roiFeatureExtractor = + std::dynamic_pointer_cast(op); if (!roiFeatureExtractor) { errorMessage = "Only opset6 ExperimentalDetectronROIFeatureExtractor operation is supported"; return false; @@ -301,8 +290,9 @@ ExperimentalDetectronROIFeatureExtractor::ExperimentalDetectronROIFeatureExtract OPENVINO_THROW_NOT_IMPLEMENTED(errorMessage); } - 
const auto roiFeatureExtractor = std::dynamic_pointer_cast(op); - const auto &attr = roiFeatureExtractor->get_attrs(); + const auto roiFeatureExtractor = + std::dynamic_pointer_cast(op); + const auto& attr = roiFeatureExtractor->get_attrs(); output_dim_ = attr.output_size; pyramid_scales_ = attr.pyramid_scales; sampling_ratio_ = attr.sampling_ratio; @@ -321,8 +311,7 @@ void ExperimentalDetectronROIFeatureExtractor::initSupportedPrimitiveDescriptors inDataConf.emplace_back(LayoutType::ncsp, ov::element::f32); addSupportedPrimDesc(inDataConf, - {{LayoutType::ncsp, ov::element::f32}, - {LayoutType::ncsp, ov::element::f32}}, + {{LayoutType::ncsp, ov::element::f32}, {LayoutType::ncsp, ov::element::f32}}, impl_desc_type::ref_any); } @@ -332,15 +321,15 @@ void ExperimentalDetectronROIFeatureExtractor::execute(dnnl::stream strm) { const int channels_num = getParentEdgeAt(INPUT_FEATURES_START)->getMemory().getStaticDims()[1]; const int feaxels_per_roi = pooled_height_ * pooled_width_ * channels_num; - auto *input_rois = getSrcDataAtPortAs(INPUT_ROIS); - auto *output_rois_features = getDstDataAtPortAs(OUTPUT_ROI_FEATURES); - float *output_rois = nullptr; + auto* input_rois = getSrcDataAtPortAs(INPUT_ROIS); + auto* output_rois_features = getDstDataAtPortAs(OUTPUT_ROI_FEATURES); + float* output_rois = nullptr; if (OUTPUT_ROIS < outputShapes.size()) { output_rois = getDstDataAtPortAs(OUTPUT_ROIS); } std::vector level_ids(num_rois, 0); - redistribute_rois(input_rois, reinterpret_cast(&level_ids[0]), num_rois, levels_num); + redistribute_rois(input_rois, reinterpret_cast(&level_ids[0]), num_rois, levels_num); std::vector reordered_rois(4 * num_rois, 0); std::vector original_rois_mapping(num_rois, 0); @@ -354,7 +343,7 @@ void ExperimentalDetectronROIFeatureExtractor::execute(dnnl::stream strm) { const int level_rois_offset = rois_per_level[i]; const int level_rois_num = rois_per_level[i + 1] - level_rois_offset; if (level_rois_num > 0) { - auto *featuremap = getSrcDataAtPortAs(INPUT_FEATURES_START + i); + auto* featuremap = getSrcDataAtPortAs(INPUT_FEATURES_START + i); const int featuremap_height = getParentEdgeAt(INPUT_FEATURES_START + i)->getMemory().getStaticDims()[2]; const int featuremap_width = getParentEdgeAt(INPUT_FEATURES_START + i)->getMemory().getStaticDims()[3]; ROIAlignForward_cpu_kernel(feaxels_per_roi * level_rois_num, @@ -373,8 +362,12 @@ void ExperimentalDetectronROIFeatureExtractor::execute(dnnl::stream strm) { } std::vector dummy_mapping(num_rois, 0); - reord(&output_rois_features_temp[0], &original_rois_mapping[0], num_rois, feaxels_per_roi, - output_rois_features, &dummy_mapping[0]); + reord(&output_rois_features_temp[0], + &original_rois_mapping[0], + num_rois, + feaxels_per_roi, + output_rois_features, + &dummy_mapping[0]); if (output_rois != nullptr) { cpu_memcpy(output_rois, input_rois, 4 * num_rois * sizeof(float)); } @@ -384,6 +377,6 @@ bool ExperimentalDetectronROIFeatureExtractor::created() const { return getType() == Type::ExperimentalDetectronROIFeatureExtractor; } -} // namespace node -} // namespace intel_cpu -} // namespace ov +} // namespace node +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/experimental_detectron_roifeatureextractor.h b/src/plugins/intel_cpu/src/nodes/experimental_detectron_roifeatureextractor.h index 94bfdfd224d0c5..374fd62c61b776 100644 --- a/src/plugins/intel_cpu/src/nodes/experimental_detectron_roifeatureextractor.h +++ b/src/plugins/intel_cpu/src/nodes/experimental_detectron_roifeatureextractor.h @@ 
-14,22 +14,26 @@ class ExperimentalDetectronROIFeatureExtractor : public Node { public: ExperimentalDetectronROIFeatureExtractor(const std::shared_ptr& op, const GraphContext::CPtr context); - void getSupportedDescriptors() override {}; + void getSupportedDescriptors() override{}; void initSupportedPrimitiveDescriptors() override; void execute(dnnl::stream strm) override; bool created() const override; - bool needPrepareParams() const override { return false; }; - void executeDynamicImpl(dnnl::stream strm) override { execute(strm); }; + bool needPrepareParams() const override { + return false; + }; + void executeDynamicImpl(dnnl::stream strm) override { + execute(strm); + }; static bool isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept; private: - const int INPUT_ROIS {0}; - const int INPUT_FEATURES_START {1}; + const int INPUT_ROIS{0}; + const int INPUT_FEATURES_START{1}; - const int OUTPUT_ROI_FEATURES {0}; - const size_t OUTPUT_ROIS {1}; + const int OUTPUT_ROI_FEATURES{0}; + const size_t OUTPUT_ROIS{1}; int output_dim_ = 0; int pooled_height_ = 0; @@ -39,6 +43,6 @@ class ExperimentalDetectronROIFeatureExtractor : public Node { bool aligned_ = false; }; -} // namespace node -} // namespace intel_cpu -} // namespace ov +} // namespace node +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/experimental_detectron_topkrois.cpp b/src/plugins/intel_cpu/src/nodes/experimental_detectron_topkrois.cpp index 46b60fcdb83efd..f09d96ac7a7f7e 100644 --- a/src/plugins/intel_cpu/src/nodes/experimental_detectron_topkrois.cpp +++ b/src/plugins/intel_cpu/src/nodes/experimental_detectron_topkrois.cpp @@ -2,20 +2,22 @@ // SPDX-License-Identifier: Apache-2.0 // +#include "experimental_detectron_topkrois.h" + +#include +#include #include #include -#include -#include -#include "openvino/core/parallel.hpp" #include "common/cpu_memcpy.h" -#include "experimental_detectron_topkrois.h" +#include "openvino/core/parallel.hpp" namespace ov { namespace intel_cpu { namespace node { -bool ExperimentalDetectronTopKROIs::isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept { +bool ExperimentalDetectronTopKROIs::isSupportedOperation(const std::shared_ptr& op, + std::string& errorMessage) noexcept { try { const auto topKROI = std::dynamic_pointer_cast(op); if (!topKROI) { @@ -56,8 +58,7 @@ void ExperimentalDetectronTopKROIs::initSupportedPrimitiveDescriptors() { if (!supportedPrimitiveDescriptors.empty()) return; - addSupportedPrimDesc({{LayoutType::ncsp, ov::element::f32}, - {LayoutType::ncsp, ov::element::f32}}, + addSupportedPrimDesc({{LayoutType::ncsp, ov::element::f32}, {LayoutType::ncsp, ov::element::f32}}, {{LayoutType::ncsp, ov::element::f32}}, impl_desc_type::ref_any); } @@ -66,14 +67,16 @@ void ExperimentalDetectronTopKROIs::execute(dnnl::stream strm) { const int input_rois_num = getParentEdgeAt(INPUT_ROIS)->getMemory().getStaticDims()[0]; const int top_rois_num = (std::min)(max_rois_num_, input_rois_num); - auto *input_rois = getSrcDataAtPortAs(INPUT_ROIS); - auto *input_probs = getSrcDataAtPortAs(INPUT_PROBS); - auto *output_rois = getDstDataAtPortAs(OUTPUT_ROIS); + auto* input_rois = getSrcDataAtPortAs(INPUT_ROIS); + auto* input_probs = getSrcDataAtPortAs(INPUT_PROBS); + auto* output_rois = getDstDataAtPortAs(OUTPUT_ROIS); std::vector idx(input_rois_num); iota(idx.begin(), idx.end(), 0); // FIXME. partial_sort is enough here. 
- sort(idx.begin(), idx.end(), [&input_probs](size_t i1, size_t i2) {return input_probs[i1] > input_probs[i2];}); + sort(idx.begin(), idx.end(), [&input_probs](size_t i1, size_t i2) { + return input_probs[i1] > input_probs[i2]; + }); for (int i = 0; i < top_rois_num; ++i) { cpu_memcpy(output_rois + 4 * i, input_rois + 4 * idx[i], 4 * sizeof(float)); @@ -84,6 +87,6 @@ bool ExperimentalDetectronTopKROIs::created() const { return getType() == Type::ExperimentalDetectronTopKROIs; } -} // namespace node -} // namespace intel_cpu -} // namespace ov +} // namespace node +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/experimental_detectron_topkrois.h b/src/plugins/intel_cpu/src/nodes/experimental_detectron_topkrois.h index 5c2db1fa2303ea..3fe134948d5e45 100644 --- a/src/plugins/intel_cpu/src/nodes/experimental_detectron_topkrois.h +++ b/src/plugins/intel_cpu/src/nodes/experimental_detectron_topkrois.h @@ -14,14 +14,20 @@ class ExperimentalDetectronTopKROIs : public Node { public: ExperimentalDetectronTopKROIs(const std::shared_ptr& op, const GraphContext::CPtr context); - void getSupportedDescriptors() override {}; + void getSupportedDescriptors() override{}; void initSupportedPrimitiveDescriptors() override; void execute(dnnl::stream strm) override; bool created() const override; - bool needShapeInfer() const override { return false; }; - bool needPrepareParams() const override { return false; }; - void executeDynamicImpl(dnnl::stream strm) override { execute(strm); }; + bool needShapeInfer() const override { + return false; + }; + bool needPrepareParams() const override { + return false; + }; + void executeDynamicImpl(dnnl::stream strm) override { + execute(strm); + }; static bool isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept; @@ -32,15 +38,15 @@ class ExperimentalDetectronTopKROIs : public Node { // Outputs: // top_rois, shape [max_rois, 4] - const int INPUT_ROIS {0}; - const int INPUT_PROBS {1}; + const int INPUT_ROIS{0}; + const int INPUT_PROBS{1}; - const int OUTPUT_ROIS {0}; + const int OUTPUT_ROIS{0}; int max_rois_num_; std::string errorPrefix; }; -} // namespace node -} // namespace intel_cpu -} // namespace ov +} // namespace node +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/extract_image_patches.cpp b/src/plugins/intel_cpu/src/nodes/extract_image_patches.cpp index 8b5d0b510614e1..51ae2123bbd382 100644 --- a/src/plugins/intel_cpu/src/nodes/extract_image_patches.cpp +++ b/src/plugins/intel_cpu/src/nodes/extract_image_patches.cpp @@ -3,15 +3,16 @@ // #include "extract_image_patches.h" -#include "common/primitive_hashing_utils.hpp" -#include "cpu/x64/jit_generator.hpp" -#include "openvino/core/parallel.hpp" -#include "openvino/opsets/opset3.hpp" #include #include #include +#include "common/primitive_hashing_utils.hpp" +#include "cpu/x64/jit_generator.hpp" +#include "openvino/core/parallel.hpp" +#include "openvino/opsets/opset3.hpp" + using namespace dnnl::impl::cpu; using namespace dnnl::impl::cpu::x64; using namespace dnnl::impl::utils; @@ -21,13 +22,15 @@ namespace ov { namespace intel_cpu { namespace node { #if defined(OPENVINO_ARCH_X86_64) -#define GET_OFF(field) offsetof(jit_extract_image_patches_args, field) +# define GET_OFF(field) offsetof(jit_extract_image_patches_args, field) template struct jit_extract_image_patches_kernel : public jit_uni_extract_image_patches_kernel, public jit_generator { DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_extract_image_patches_kernel) - 
explicit jit_extract_image_patches_kernel(jit_extract_image_patches_params jpp) : jit_uni_extract_image_patches_kernel(jpp), jit_generator(jit_name()) {} + explicit jit_extract_image_patches_kernel(jit_extract_image_patches_params jpp) + : jit_uni_extract_image_patches_kernel(jpp), + jit_generator(jit_name()) {} void create_ker() override { jit_generator::create_kernel(); @@ -92,35 +95,47 @@ struct jit_extract_image_patches_kernel : public jit_uni_extract_image_patches_k Vmm vmm = Vmm(0); Xmm xmm = Xmm(0); - Vmm vmm_zero = Vmm(1); // reserved for pad + Vmm vmm_zero = Vmm(1); // reserved for pad Xbyak::Xmm xmm_aux = Xbyak::Xmm(2); Vmm vmm_gather_index = Vmm(3); Vmm vmm_gather_mask = Vmm(4); Opmask k_mask = Xbyak::Opmask(1); Xbyak::Label gather_index_table; - inline void load_scalar(Vmm vmm_arg, const Xbyak::Address &op) { + inline void load_scalar(Vmm vmm_arg, const Xbyak::Address& op) { Xbyak::Xmm xmm_src = Xmm(vmm_arg.getIdx()); switch (jpp.dtype_size) { - case 4: uni_vmovss(vmm_arg, op); break; - case 2: uni_vpinsrw(xmm_src, xmm_src, op, 0x0); break; - case 1: uni_vpinsrb(xmm_src, xmm_src, op, 0x0); break; - default: - OPENVINO_THROW("The data type of size '", jpp.dtype_size, "' is not supported."); + case 4: + uni_vmovss(vmm_arg, op); + break; + case 2: + uni_vpinsrw(xmm_src, xmm_src, op, 0x0); + break; + case 1: + uni_vpinsrb(xmm_src, xmm_src, op, 0x0); + break; + default: + OPENVINO_THROW("The data type of size '", jpp.dtype_size, "' is not supported."); } } - inline void store_scalar(const Xbyak::Address &op, Vmm vmm_arg) { + inline void store_scalar(const Xbyak::Address& op, Vmm vmm_arg) { Xbyak::Xmm xmm_dst = Xmm(vmm_arg.getIdx()); switch (jpp.dtype_size) { - case 4: uni_vmovss(op, vmm_arg); break; - case 2: uni_vpextrw(op, xmm_dst, 0x0); break; - case 1: uni_vpextrb(op, xmm_dst, 0x0); break; - default: - OPENVINO_THROW("The data type of size '", jpp.dtype_size, "' is not supported."); + case 4: + uni_vmovss(op, vmm_arg); + break; + case 2: + uni_vpextrw(op, xmm_dst, 0x0); + break; + case 1: + uni_vpextrb(op, xmm_dst, 0x0); + break; + default: + OPENVINO_THROW("The data type of size '", jpp.dtype_size, "' is not supported."); } } - inline void pad_with_zeros(reg64_t ®_num_pads_arg, reg64_t ®_dst_arg) { + inline void pad_with_zeros(reg64_t& reg_num_pads_arg, reg64_t& reg_dst_arg) { Xbyak::Label main, tail, exit; L(main); { @@ -143,57 +158,67 @@ struct jit_extract_image_patches_kernel : public jit_uni_extract_image_patches_k L(exit); } - inline void custom_uni_vgatherdps(const Vmm &vmm_arg, reg64_t &mem_base, const Vmm &mem_offset, Vmm &vmm_mask) { + inline void custom_uni_vgatherdps(const Vmm& vmm_arg, reg64_t& mem_base, const Vmm& mem_offset, Vmm& vmm_mask) { switch (isa) { - case x64::avx2: - uni_vpcmpeqd(vmm_mask, vmm_mask, vmm_mask); - vgatherdps(vmm_arg, ptr[mem_base + mem_offset], vmm_mask); - break; - case x64::avx512_core: - kxnord(k_mask, k_mask, k_mask); - vgatherdps(vmm_arg | k_mask, ptr[mem_base + mem_offset]); - break; - case x64::sse41: - emulate_gather(vmm_arg, mem_base); - break; - default: - OPENVINO_THROW("Got unsupported instruction set."); + case x64::avx2: + uni_vpcmpeqd(vmm_mask, vmm_mask, vmm_mask); + vgatherdps(vmm_arg, ptr[mem_base + mem_offset], vmm_mask); + break; + case x64::avx512_core: + kxnord(k_mask, k_mask, k_mask); + vgatherdps(vmm_arg | k_mask, ptr[mem_base + mem_offset]); + break; + case x64::sse41: + emulate_gather(vmm_arg, mem_base); + break; + default: + OPENVINO_THROW("Got unsupported instruction set."); } } - inline void gather_src2vmm(const 
Vmm &vmm_arg, reg64_t &mem_base) { + inline void gather_src2vmm(const Vmm& vmm_arg, reg64_t& mem_base) { switch (jpp.dtype_size) { - case 4: custom_uni_vgatherdps(vmm, mem_base, vmm_gather_index, vmm_gather_mask); break; - case 2: - case 1: emulate_gather(vmm_arg, mem_base); break; - default: - OPENVINO_THROW("The data type of size '", jpp.dtype_size, "' is not supported."); + case 4: + custom_uni_vgatherdps(vmm, mem_base, vmm_gather_index, vmm_gather_mask); + break; + case 2: + case 1: + emulate_gather(vmm_arg, mem_base); + break; + default: + OPENVINO_THROW("The data type of size '", jpp.dtype_size, "' is not supported."); } } - inline void emulate_gather(const Xbyak::Xmm &xmm_arg, reg64_t &mem_base, int xmm_offset = 0) { - const int xmm_size = 16; // bytes + inline void emulate_gather(const Xbyak::Xmm& xmm_arg, reg64_t& mem_base, int xmm_offset = 0) { + const int xmm_size = 16; // bytes const int xmm_block_size = xmm_size / jpp.dtype_size; const int offset = xmm_offset * jpp.SW * jpp.dtype_size * xmm_block_size; for (int i = 0; i < xmm_block_size; i++) { Xbyak::Address addr = ptr[mem_base + i * jpp.SW * jpp.dtype_size + offset]; switch (jpp.dtype_size) { - case 4: uni_vpinsrd(xmm_arg, xmm_arg, addr, i); break; - case 2: uni_vpinsrw(xmm_arg, xmm_arg, addr, i); break; - case 1: uni_vpinsrb(xmm_arg, xmm_arg, addr, i); break; - default: - OPENVINO_THROW("The data type of size '", jpp.dtype_size, "' is not supported."); + case 4: + uni_vpinsrd(xmm_arg, xmm_arg, addr, i); + break; + case 2: + uni_vpinsrw(xmm_arg, xmm_arg, addr, i); + break; + case 1: + uni_vpinsrb(xmm_arg, xmm_arg, addr, i); + break; + default: + OPENVINO_THROW("The data type of size '", jpp.dtype_size, "' is not supported."); } } } - inline void emulate_gather(const Xbyak::Ymm &ymm_arg, reg64_t &mem_base) { + inline void emulate_gather(const Xbyak::Ymm& ymm_arg, reg64_t& mem_base) { Xbyak::Xmm low_xmm = Xbyak::Xmm(ymm_arg.getIdx()); emulate_gather(low_xmm, mem_base, 0); emulate_gather(xmm_aux, mem_base, 1); vinserti128(ymm_arg, ymm_arg, xmm_aux, 1); } - inline void emulate_gather(const Xbyak::Zmm &zmm_arg, reg64_t &mem_base) { + inline void emulate_gather(const Xbyak::Zmm& zmm_arg, reg64_t& mem_base) { Xbyak::Xmm low_xmm = Xbyak::Xmm(zmm_arg.getIdx()); emulate_gather(low_xmm, mem_base, 0); for (int i = 1; i < 4; i++) { @@ -270,9 +295,10 @@ struct jit_extract_image_patches_kernel : public jit_uni_extract_image_patches_k dd(i * jpp.SW * jpp.dtype_size); } }; -#endif // OPENVINO_ARCH_X86_64 +#endif // OPENVINO_ARCH_X86_64 -bool ExtractImagePatches::isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept { +bool ExtractImagePatches::isSupportedOperation(const std::shared_ptr& op, + std::string& errorMessage) noexcept { try { auto extImgPatcher = ov::as_type_ptr(op); if (!extImgPatcher) { @@ -284,7 +310,10 @@ bool ExtractImagePatches::isSupportedOperation(const std::shared_ptrget_sizes().size(), extImgPatcher->get_strides().size(), extImgPatcher->get_rates().size())) { + if (!everyone_is(2u, + extImgPatcher->get_sizes().size(), + extImgPatcher->get_strides().size(), + extImgPatcher->get_rates().size())) { errorMessage = "Doesn't support 'sizes', 'strides', 'rates', attributes with rank != 2"; return false; } @@ -323,7 +352,7 @@ size_t ExtractImagePatchesKey::hash() const { bool ExtractImagePatchesKey::operator==(const ExtractImagePatchesKey& rhs) const { bool result = inDims == rhs.inDims && outDims == rhs.outDims && kSizes == rhs.kSizes && strides == rhs.strides && - rates == rhs.rates && padType == 
rhs.padType && prcSize == rhs.prcSize; + rates == rhs.rates && padType == rhs.padType && prcSize == rhs.prcSize; return result; } } // namespace @@ -362,7 +391,8 @@ ExtractImagePatches::ExtractImagePatches(const std::shared_ptr& op, co OPENVINO_THROW(errorPrefix, "has unsupported pad type: ", extImgPatcher->get_auto_pad()); } - _ksizes = extImgPatcher->get_sizes();; + _ksizes = extImgPatcher->get_sizes(); + ; _strides = extImgPatcher->get_strides(); _rates = extImgPatcher->get_rates(); if (_ksizes.size() != 2 || _strides.size() != 2 || _rates.size() != 2) @@ -416,9 +446,7 @@ void ExtractImagePatches::initSupportedPrimitiveDescriptors() { if (_supported_precisions_sizes.find(precision.size()) == _supported_precisions_sizes.end()) OPENVINO_THROW(errorPrefix, "has unsupported precision: ", precision.get_type_name()); - addSupportedPrimDesc({{LayoutType::ncsp, precision}}, - {{LayoutType::ncsp, precision}}, - impl_desc_type::ref_any); + addSupportedPrimDesc({{LayoutType::ncsp, precision}}, {{LayoutType::ncsp, precision}}, impl_desc_type::ref_any); } void ExtractImagePatches::execute(dnnl::stream strm) { @@ -437,12 +465,17 @@ void ExtractImagePatches::executeDynamicImpl(dnnl::stream strm) { execute(strm); } -void ExtractImagePatches::ExtractImagePatchesRefExecutor::executeReference( - void* src, void* dst, const VectorDims& istrides, const VectorDims& ostrides) const { +void ExtractImagePatches::ExtractImagePatchesRefExecutor::executeReference(void* src, + void* dst, + const VectorDims& istrides, + const VectorDims& ostrides) const { const char* src_data = reinterpret_cast(src); char* dst_data = reinterpret_cast(dst); - const std::vector ostrides_partial = { ostrides[0], jpp.KW * IC * ostrides[1], IC * ostrides[1], ostrides[1] }; + const std::vector ostrides_partial = {ostrides[0], + jpp.KW * IC * ostrides[1], + IC * ostrides[1], + ostrides[1]}; parallel_for4d(OB, jpp.KH, jpp.KW, IC, [&](const size_t ob, const size_t kh, const size_t kw, const size_t ic) { const int64_t iw_start = static_cast(kw * RW) - PL; @@ -450,12 +483,17 @@ void ExtractImagePatches::ExtractImagePatchesRefExecutor::executeReference( const size_t ih_lpad = ih_start >= 0 ? 0 : std::ceil(-1.f * ih_start / jpp.SH); const size_t iw_lpad = iw_start >= 0 ? 0 : std::ceil(-1.f * iw_start / jpp.SW); - const size_t ih_hpad = std::ceil((IH - 1.f * ih_start) / jpp.SH) > jpp.OH ? jpp.OH : std::ceil((IH + -1.f * ih_start) / jpp.SH); - const size_t iw_hpad = std::ceil((jpp.IW - 1.f * iw_start) / jpp.SW) > jpp.OW ? jpp.OW : std::ceil((jpp.IW - 1.f * iw_start) / jpp.SW); + const size_t ih_hpad = + std::ceil((IH - 1.f * ih_start) / jpp.SH) > jpp.OH ? jpp.OH : std::ceil((IH + -1.f * ih_start) / jpp.SH); + const size_t iw_hpad = std::ceil((jpp.IW - 1.f * iw_start) / jpp.SW) > jpp.OW + ? 
jpp.OW + : std::ceil((jpp.IW - 1.f * iw_start) / jpp.SW); - char* my_dst_ptr = dst_data + - (ob * ostrides_partial[0] + kh * ostrides_partial[1] + kw * ostrides_partial[2] + ic * ostrides_partial[3]) * jpp.dtype_size; - const char* my_src_ptr = src_data + (ob * istrides[0] + ic * istrides[1] + ih_start * istrides[2] + iw_start) * jpp.dtype_size; + char* my_dst_ptr = dst_data + (ob * ostrides_partial[0] + kh * ostrides_partial[1] + kw * ostrides_partial[2] + + ic * ostrides_partial[3]) * + jpp.dtype_size; + const char* my_src_ptr = + src_data + (ob * istrides[0] + ic * istrides[1] + ih_start * istrides[2] + iw_start) * jpp.dtype_size; size_t num_bytes_to_set = ih_lpad * jpp.OW * jpp.dtype_size; memset(my_dst_ptr, 0, num_bytes_to_set); @@ -463,14 +501,15 @@ void ExtractImagePatches::ExtractImagePatchesRefExecutor::executeReference( const char* src_ptr_h_stop = my_src_ptr + ih_hpad * jpp.SH * jpp.IW * jpp.dtype_size; for (const char* src_h_ptr = my_src_ptr + ih_lpad * jpp.SH * jpp.IW * jpp.dtype_size; - src_h_ptr < src_ptr_h_stop; src_h_ptr += jpp.SH * jpp.IW * jpp.dtype_size) { + src_h_ptr < src_ptr_h_stop; + src_h_ptr += jpp.SH * jpp.IW * jpp.dtype_size) { num_bytes_to_set = iw_lpad * jpp.dtype_size; memset(my_dst_ptr, 0, num_bytes_to_set); my_dst_ptr += num_bytes_to_set; const char* src_ptr_w_stop = src_h_ptr + iw_hpad * jpp.SW * jpp.dtype_size; - for (const char* src_w_ptr = src_h_ptr + iw_lpad * jpp.SW * jpp.dtype_size; - src_w_ptr < src_ptr_w_stop; src_w_ptr += jpp.SW * jpp.dtype_size) { + for (const char* src_w_ptr = src_h_ptr + iw_lpad * jpp.SW * jpp.dtype_size; src_w_ptr < src_ptr_w_stop; + src_w_ptr += jpp.SW * jpp.dtype_size) { num_bytes_to_set = jpp.dtype_size; memcpy(my_dst_ptr, src_w_ptr, num_bytes_to_set); my_dst_ptr += num_bytes_to_set; @@ -484,25 +523,35 @@ void ExtractImagePatches::ExtractImagePatchesRefExecutor::executeReference( }); } -void ExtractImagePatches::ExtractImagePatchesJitExecutor::executeOptimizedGeneric( - void* src, void* dst, const VectorDims& istrides, const VectorDims& ostrides) const { +void ExtractImagePatches::ExtractImagePatchesJitExecutor::executeOptimizedGeneric(void* src, + void* dst, + const VectorDims& istrides, + const VectorDims& ostrides) const { #if defined(OPENVINO_ARCH_X86_64) const char* src_data = reinterpret_cast(src); char* dst_data = reinterpret_cast(dst); const auto& jpp = pKernel->jpp; - const std::vector ostrides_partial = { ostrides[0], jpp.KW * IC * ostrides[1], IC * ostrides[1], ostrides[1] }; + const std::vector ostrides_partial = {ostrides[0], + jpp.KW * IC * ostrides[1], + IC * ostrides[1], + ostrides[1]}; parallel_for4d(OB, jpp.KH, jpp.KW, IC, [&](const size_t ob, const size_t kh, const size_t kw, const size_t ic) { const int64_t ih_start = kh * RH - PT; const int64_t iw_start = kw * RW - PL; const size_t ih_lpad = ih_start >= 0 ? 0 : std::ceil(-1.f * ih_start / jpp.SH); const size_t iw_lpad = iw_start >= 0 ? 0 : std::ceil(-1.f * iw_start / jpp.SW); - const size_t ih_hpad = std::ceil((IH - 1.f * ih_start) / jpp.SH) > jpp.OH ? jpp.OH : std::ceil((IH - 1.f * ih_start) / jpp.SH); - const size_t iw_hpad = std::ceil((jpp.IW - 1.f * iw_start) / jpp.SW) > jpp.OW ? jpp.OW : std::ceil((jpp.IW - 1.f * iw_start) / jpp.SW); + const size_t ih_hpad = + std::ceil((IH - 1.f * ih_start) / jpp.SH) > jpp.OH ? jpp.OH : std::ceil((IH - 1.f * ih_start) / jpp.SH); + const size_t iw_hpad = std::ceil((jpp.IW - 1.f * iw_start) / jpp.SW) > jpp.OW + ? 
jpp.OW + : std::ceil((jpp.IW - 1.f * iw_start) / jpp.SW); - size_t dst_offset = ob * ostrides_partial[0] + kh * ostrides_partial[1] + kw * ostrides_partial[2] + ic * ostrides_partial[3]; - size_t src_offset = ob * istrides[0] + ic * istrides[1] + ih_start * istrides[2] + iw_start + ih_lpad * jpp.SH * jpp.IW; + size_t dst_offset = + ob * ostrides_partial[0] + kh * ostrides_partial[1] + kw * ostrides_partial[2] + ic * ostrides_partial[3]; + size_t src_offset = + ob * istrides[0] + ic * istrides[1] + ih_start * istrides[2] + iw_start + ih_lpad * jpp.SH * jpp.IW; auto args = jit_extract_image_patches_args(); args.src = src_data + src_offset * jpp.dtype_size; @@ -513,7 +562,7 @@ void ExtractImagePatches::ExtractImagePatchesJitExecutor::executeOptimizedGeneri args.w_hi_pad = iw_hpad; (*pKernel)(&args); }); -#endif // OPENVINO_ARCH_X86_64 +#endif // OPENVINO_ARCH_X86_64 } jit_extract_image_patches_params ExtractImagePatches::ExtractImagePatchesExecutor::fillJpp( @@ -584,14 +633,13 @@ jit_extract_image_patches_params ExtractImagePatches::ExtractImagePatchesExecuto return jpp; } -ExtractImagePatches::ExtractImagePatchesJitExecutor::ExtractImagePatchesJitExecutor( - const VectorDims& inDims, - const VectorDims& outDims, - const VectorDims& kSizes, - const VectorDims& strides, - const VectorDims& rates, - const ExtImgPatcherPadType& padType, - const size_t prcSize) { +ExtractImagePatches::ExtractImagePatchesJitExecutor::ExtractImagePatchesJitExecutor(const VectorDims& inDims, + const VectorDims& outDims, + const VectorDims& kSizes, + const VectorDims& strides, + const VectorDims& rates, + const ExtImgPatcherPadType& padType, + const size_t prcSize) { #if defined(OPENVINO_ARCH_X86_64) auto jpp = fillJpp(inDims, outDims, kSizes, strides, rates, padType, prcSize); if (mayiuse(x64::avx512_core)) { @@ -606,27 +654,31 @@ ExtractImagePatches::ExtractImagePatchesJitExecutor::ExtractImagePatchesJitExecu if (pKernel) pKernel->create_ker(); -#endif // OPENVINO_ARCH_X86_64 +#endif // OPENVINO_ARCH_X86_64 } -void ExtractImagePatches::ExtractImagePatchesJitExecutor::exec( - void* src, void* dst, const VectorDims& istrides, const VectorDims& ostrides) { +void ExtractImagePatches::ExtractImagePatchesJitExecutor::exec(void* src, + void* dst, + const VectorDims& istrides, + const VectorDims& ostrides) { if (!pKernel) OPENVINO_THROW("Can't execute, kernel for extract image patches node is not compiled"); executeOptimizedGeneric(src, dst, istrides, ostrides); } -ExtractImagePatches::ExtractImagePatchesRefExecutor::ExtractImagePatchesRefExecutor( - const VectorDims& inDims, - const VectorDims& outDims, - const VectorDims& kSizes, - const VectorDims& strides, - const VectorDims& rates, - const ExtImgPatcherPadType& padType, - const size_t prcSize) : jpp(fillJpp(inDims, outDims, kSizes, strides, rates, padType, prcSize)) {} - -void ExtractImagePatches::ExtractImagePatchesRefExecutor::exec( - void* src, void* dst, const VectorDims& istrides, const VectorDims& ostrides) { +ExtractImagePatches::ExtractImagePatchesRefExecutor::ExtractImagePatchesRefExecutor(const VectorDims& inDims, + const VectorDims& outDims, + const VectorDims& kSizes, + const VectorDims& strides, + const VectorDims& rates, + const ExtImgPatcherPadType& padType, + const size_t prcSize) + : jpp(fillJpp(inDims, outDims, kSizes, strides, rates, padType, prcSize)) {} + +void ExtractImagePatches::ExtractImagePatchesRefExecutor::exec(void* src, + void* dst, + const VectorDims& istrides, + const VectorDims& ostrides) { executeReference(src, dst, istrides, 
ostrides); } @@ -636,6 +688,6 @@ bool ExtractImagePatches::created() const { return getType() == Type::ExtractImagePatches; } -} // namespace node -} // namespace intel_cpu -} // namespace ov +} // namespace node +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/extract_image_patches.h b/src/plugins/intel_cpu/src/nodes/extract_image_patches.h index 15220fd51a4285..1844b5cafeeb07 100644 --- a/src/plugins/intel_cpu/src/nodes/extract_image_patches.h +++ b/src/plugins/intel_cpu/src/nodes/extract_image_patches.h @@ -30,8 +30,11 @@ struct jit_extract_image_patches_args { }; struct jit_uni_extract_image_patches_kernel { - void (*ker_)(const jit_extract_image_patches_args *); - void operator()(const jit_extract_image_patches_args *args) { assert(ker_); ker_(args); } + void (*ker_)(const jit_extract_image_patches_args*); + void operator()(const jit_extract_image_patches_args* args) { + assert(ker_); + ker_(args); + } jit_extract_image_patches_params jpp; virtual void create_ker() = 0; explicit jit_uni_extract_image_patches_kernel(jit_extract_image_patches_params jpp) : ker_(nullptr), jpp(jpp) {} @@ -42,7 +45,7 @@ class ExtractImagePatches : public Node { public: ExtractImagePatches(const std::shared_ptr& op, const GraphContext::CPtr context); - void getSupportedDescriptors() override {}; + void getSupportedDescriptors() override{}; void initSupportedPrimitiveDescriptors() override; void execute(dnnl::stream strm) override; bool created() const override; @@ -51,11 +54,7 @@ class ExtractImagePatches : public Node { void prepareParams() override; static bool isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept; - enum class ExtImgPatcherPadType { - VALID, - SAME_LOWER, - SAME_UPPER - }; + enum class ExtImgPatcherPadType { VALID, SAME_LOWER, SAME_UPPER }; private: std::vector _ksizes; @@ -69,14 +68,13 @@ class ExtractImagePatches : public Node { struct ExtractImagePatchesExecutor { ExtractImagePatchesExecutor() = default; virtual void exec(void* src, void* dst, const VectorDims& istrides, const VectorDims& ostrides) = 0; - jit_extract_image_patches_params fillJpp( - const VectorDims& inDims, - const VectorDims& outDims, - const VectorDims& kSizes, - const VectorDims& strides, - const VectorDims& rates, - const ExtImgPatcherPadType& padType, - const size_t prcSize); + jit_extract_image_patches_params fillJpp(const VectorDims& inDims, + const VectorDims& outDims, + const VectorDims& kSizes, + const VectorDims& strides, + const VectorDims& rates, + const ExtImgPatcherPadType& padType, + const size_t prcSize); virtual ~ExtractImagePatchesExecutor() = default; protected: @@ -93,30 +91,31 @@ class ExtractImagePatches : public Node { executorPtr execPtr = nullptr; struct ExtractImagePatchesJitExecutor : public ExtractImagePatchesExecutor { - ExtractImagePatchesJitExecutor( - const VectorDims& inDims, - const VectorDims& outDims, - const VectorDims& kSizes, - const VectorDims& strides, - const VectorDims& rates, - const ExtImgPatcherPadType& padType, - const size_t prcSize); + ExtractImagePatchesJitExecutor(const VectorDims& inDims, + const VectorDims& outDims, + const VectorDims& kSizes, + const VectorDims& strides, + const VectorDims& rates, + const ExtImgPatcherPadType& padType, + const size_t prcSize); void exec(void* src, void* dst, const VectorDims& istrides, const VectorDims& ostrides) override; - void executeOptimizedGeneric(void* src, void* dst, const VectorDims& istrides, const VectorDims& ostrides) const; + void 
executeOptimizedGeneric(void* src, + void* dst, + const VectorDims& istrides, + const VectorDims& ostrides) const; private: std::unique_ptr pKernel; }; struct ExtractImagePatchesRefExecutor : public ExtractImagePatchesExecutor { - ExtractImagePatchesRefExecutor( - const VectorDims& inDims, - const VectorDims& outDims, - const VectorDims& kSizes, - const VectorDims& strides, - const VectorDims& rates, - const ExtImgPatcherPadType& padType, - const size_t prcSize); + ExtractImagePatchesRefExecutor(const VectorDims& inDims, + const VectorDims& outDims, + const VectorDims& kSizes, + const VectorDims& strides, + const VectorDims& rates, + const ExtImgPatcherPadType& padType, + const size_t prcSize); void exec(void* src, void* dst, const VectorDims& istrides, const VectorDims& ostrides) override; void executeReference(void* src, void* dst, const VectorDims& istrides, const VectorDims& ostrides) const; @@ -125,6 +124,6 @@ class ExtractImagePatches : public Node { }; }; -} // namespace node -} // namespace intel_cpu -} // namespace ov +} // namespace node +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/eye.cpp b/src/plugins/intel_cpu/src/nodes/eye.cpp index f1e78b04510914..deb47abdba2dee 100644 --- a/src/plugins/intel_cpu/src/nodes/eye.cpp +++ b/src/plugins/intel_cpu/src/nodes/eye.cpp @@ -3,10 +3,12 @@ // #include "eye.h" -#include "openvino/op/eye.hpp" + #include + #include "openvino/core/parallel.hpp" -#include "shape_inference/shape_inference_ngraph.hpp" +#include "openvino/op/eye.hpp" +#include "shape_inference/shape_inference.hpp" #include "utils/bfloat16.hpp" #define THROW_ERROR(...) OPENVINO_THROW(NameFromType(getType()), " node with name '", getName(), "' ", __VA_ARGS__) @@ -33,28 +35,24 @@ class EyeShapeInferFactory : public ShapeInferFactory { public: EyeShapeInferFactory(std::shared_ptr op) : m_op(op) {} ShapeInferPtr makeShapeInfer() const override { - IShapeInfer::port_mask_t port_mask = EMPTY_PORT_MASK; - if (m_op->get_input_size() == 4) { - port_mask = PortMask(Eye::ROWS_NUM, Eye::COLS_NUM, Eye::DIAGONAL_INDEX, Eye::BATCH_SHAPE); - } else { - port_mask = PortMask(Eye::ROWS_NUM, Eye::COLS_NUM, Eye::DIAGONAL_INDEX); - } - return std::make_shared(make_shape_inference(m_op), port_mask); + return (m_op->get_input_size() == 4) ? 
make_shape_inference(m_op) + : make_shape_inference(m_op, PortMask(Eye::ROWS_NUM, Eye::COLS_NUM)); } + private: std::shared_ptr m_op; }; -} // namespace +} // namespace -Eye::Eye(const std::shared_ptr& op, const GraphContext::CPtr context) : Node(op, context, EyeShapeInferFactory(op)) { +Eye::Eye(const std::shared_ptr& op, const GraphContext::CPtr context) + : Node(op, context, EyeShapeInferFactory(op)) { std::string errorMessage; if (!isSupportedOperation(op, errorMessage)) { - OPENVINO_THROW_NOT_IMPLEMENTED(errorMessage); + OPENVINO_THROW_NOT_IMPLEMENTED(errorMessage); } outType = op->get_output_element_type(0); withBatchShape = (op->get_input_size() == 4); - if (!one_of(outType, ov::element::f32, ov::element::bf16, - ov::element::i32, ov::element::i8, ov::element::u8)) { + if (!one_of(outType, ov::element::f32, ov::element::bf16, ov::element::i32, ov::element::i8, ov::element::u8)) { THROW_ERROR(errorPrefix, "doesn't support demanded output precision"); } } @@ -66,16 +64,19 @@ void Eye::getSupportedDescriptors() { THROW_ERROR(errorPrefix, "has incorrect number of output edges: ", getChildEdges().size()); } -template +template struct Eye::EyeExecute { - void operator()(Eye *node) { + void operator()(Eye* node) { node->executeSpecified(); } }; void Eye::execute(dnnl::stream strm) { auto outputPrec = getChildEdgeAt(0)->getMemory().getDesc().getPrecision(); - OV_SWITCH(intel_cpu, EyeExecute, this, outputPrec, + OV_SWITCH(intel_cpu, + EyeExecute, + this, + outputPrec, OV_CASE(ov::element::f32, float), OV_CASE(ov::element::bf16, bfloat16_t), OV_CASE(ov::element::i32, int), @@ -104,9 +105,9 @@ void Eye::executeSpecified() { const size_t colNum = getColNum(); const int64_t shift = getDiagIndex(); auto outPtr = getDstMemoryAtPort(0); - if (!outPtr || !outPtr ->isDefined()) + if (!outPtr || !outPtr->isDefined()) THROW_ERROR(errorPrefix, "Destination memory is undefined."); - T *dst = outPtr->getDataAs(); + T* dst = outPtr->getDataAs(); const size_t batchVolume = getBatchVolume(getBatchShape()); const size_t spatialCount = colNum * rowNum; @@ -116,8 +117,8 @@ void Eye::executeSpecified() { const int64_t countByColumns = std::max(int64_t(colNum) - std::abs(shift), int64_t(0)); const int64_t countByRows = std::max(int64_t(rowNum) - std::abs(shift), int64_t(0)); - const size_t onesPerBatchNum = - static_cast(shift > 0 ? std::min(countByColumns, int64_t(rowNum)) : std::min(countByRows, int64_t(colNum))); + const size_t onesPerBatchNum = static_cast(shift > 0 ? std::min(countByColumns, int64_t(rowNum)) + : std::min(countByRows, int64_t(colNum))); const size_t dataShift = static_cast(shift >= 0 ? 
shift : -shift * colNum); if (spatialSize >= l2CacheSize) { @@ -126,7 +127,8 @@ void Eye::executeSpecified() { splitter(elementsCount, nthr, ithr, start, end); memset(dst + start, 0, (end - start) * sizeof(T)); }); - if (onesPerBatchNum == 0) return; + if (onesPerBatchNum == 0) + return; for (size_t bShift = 0; bShift < batchVolume * spatialCount; bShift += spatialCount) { parallel_nt(0, [&](const size_t ithr, const size_t nthr) { size_t start = 0, end = 0; @@ -141,7 +143,8 @@ void Eye::executeSpecified() { size_t start = 0, end = 0; splitter(batchVolume, nthr, ithr, start, end); memset(dst + start * spatialCount, 0, (end - start) * spatialSize); - if (onesPerBatchNum == 0) return; + if (onesPerBatchNum == 0) + return; for (size_t spShift = start * spatialCount; spShift < end * spatialCount; spShift += spatialCount) { for (size_t j = 0; j < onesPerBatchNum; j++) { dst[dataShift + j * (colNum + 1) + spShift] = static_cast(1); @@ -154,6 +157,6 @@ void Eye::executeSpecified() { bool Eye::created() const { return getType() == Type::Eye; } -} // namespace node -} // namespace intel_cpu -} // namespace ov +} // namespace node +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/eye.h b/src/plugins/intel_cpu/src/nodes/eye.h index 7978c45d8a05d1..fc2b42a18bdbe9 100644 --- a/src/plugins/intel_cpu/src/nodes/eye.h +++ b/src/plugins/intel_cpu/src/nodes/eye.h @@ -5,9 +5,11 @@ #pragma once #include -#include + #include +#include #include + #include "dnnl_extension_utils.h" namespace ov { @@ -28,9 +30,15 @@ class Eye : public Node { void initSupportedPrimitiveDescriptors() override; void execute(dnnl::stream strm) override; bool created() const override; - bool needPrepareParams() const override {return false;}; - bool needShapeInfer() const override {return true;}; - void executeDynamicImpl(dnnl::stream strm) override { execute(strm); } + bool needPrepareParams() const override { + return false; + }; + bool needShapeInfer() const override { + return true; + }; + void executeDynamicImpl(dnnl::stream strm) override { + execute(strm); + } static bool isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept; @@ -39,13 +47,13 @@ class Eye : public Node { ov::element::Type outType = ov::element::Type_t::undefined; template void executeSpecified(); - template + template struct EyeExecute; inline const size_t getRowNum() const { auto rowMem = getSrcMemoryAtPort(ROWS_NUM); if (rowMem == nullptr) OPENVINO_THROW(errorPrefix, " doesn't contain row_count data"); - const int *rowPtr = rowMem->getDataAs(); + const int* rowPtr = rowMem->getDataAs(); return rowPtr[0]; } @@ -53,7 +61,7 @@ class Eye : public Node { auto colMem = getSrcMemoryAtPort(COLS_NUM); if (colMem == nullptr) OPENVINO_THROW(errorPrefix, " doesn't contain col_count data"); - const int *colPtr = colMem->getDataAs(); + const int* colPtr = colMem->getDataAs(); return colPtr[0]; } @@ -61,28 +69,29 @@ class Eye : public Node { auto diagIndMem = getSrcMemoryAtPort(DIAGONAL_INDEX); if (diagIndMem == nullptr) OPENVINO_THROW(errorPrefix, " doesn't contain diag_index data"); - const int *diagIndexPtr = diagIndMem->getDataAs(); + const int* diagIndexPtr = diagIndMem->getDataAs(); return diagIndexPtr[0]; } inline const std::vector getBatchShape() const { if (withBatchShape) { - const int batchShapeSize = static_cast(getSrcMemoryAtPort(BATCH_SHAPE)->getShape().getElementsCount()); + const int batchShapeSize = + static_cast(getSrcMemoryAtPort(BATCH_SHAPE)->getShape().getElementsCount()); std::vector 
batchShape(batchShapeSize); - const int *batchShapePtr = getSrcDataAtPortAs(BATCH_SHAPE); + const int* batchShapePtr = getSrcDataAtPortAs(BATCH_SHAPE); batchShape.assign(batchShapePtr, batchShapePtr + batchShapeSize); return batchShape; } else { - return std::vector {}; + return std::vector{}; } } - inline const size_t getBatchVolume(const std::vector &batchShape) { + inline const size_t getBatchVolume(const std::vector& batchShape) { return std::accumulate(begin(batchShape), end(batchShape), 1, std::multiplies()); } bool withBatchShape = false; }; -} // namespace node -} // namespace intel_cpu -} // namespace ov +} // namespace node +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/fake_quantize.cpp b/src/plugins/intel_cpu/src/nodes/fake_quantize.cpp index f12ab40cf5643b..9951c5176f0ad1 100644 --- a/src/plugins/intel_cpu/src/nodes/fake_quantize.cpp +++ b/src/plugins/intel_cpu/src/nodes/fake_quantize.cpp @@ -4,28 +4,27 @@ #include "fake_quantize.h" -#include -#include #include +#include + #include -#include #include - -#include "dnnl_types.h" -#include "dnnl_extension_utils.h" -#include "cpu/x64/jit_generator.hpp" #include +#include +#include +#include +#include -#include "openvino/core/parallel.hpp" -#include "utils/general_utils.h" -#include "utils/cpu_utils.hpp" -#include -#include "memory_desc/dnnl_blocked_memory_desc.h" #include "common/cpu_memcpy.h" #include "common/primitive_hashing_utils.hpp" -#include - +#include "cpu/x64/jit_generator.hpp" +#include "dnnl_extension_utils.h" +#include "dnnl_types.h" +#include "memory_desc/dnnl_blocked_memory_desc.h" +#include "openvino/core/parallel.hpp" #include "openvino/opsets/opset1.hpp" +#include "utils/cpu_utils.hpp" +#include "utils/general_utils.h" #include "utils/ngraph_utils.hpp" // Quantization ranges validation is switched off by default in order to avoid regressions on user side @@ -45,13 +44,15 @@ namespace ov { namespace intel_cpu { namespace node { #if defined(OPENVINO_ARCH_X86_64) -#define GET_OFF(field) offsetof(jit_quantize_call_args, field) +# define GET_OFF(field) offsetof(jit_quantize_call_args, field) template struct jit_uni_binarization_kernel : public jit_uni_quantize_kernel, public jit_generator { DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_uni_binarization_kernel) - explicit jit_uni_binarization_kernel(const jit_quantize_params& jqp) : jit_uni_quantize_kernel(jqp), jit_generator(jit_name()) {} + explicit jit_uni_binarization_kernel(const jit_quantize_params& jqp) + : jit_uni_quantize_kernel(jqp), + jit_generator(jit_name()) {} void create_ker() override { jit_generator::create_kernel(); @@ -77,7 +78,8 @@ struct jit_uni_binarization_kernel : public jit_uni_quantize_kernel, public jit_ Label tail_label; Label exit_label; - L(unrolled_loop_label); { + L(unrolled_loop_label); + { int step = isa == cpu::x64::sse41 ? nbits / 2 : isa == cpu::x64::avx2 ? nbits : 2 * nbits; const int ur_ch = isa == cpu::x64::sse41 ? nbits : isa == cpu::x64::avx2 ? 
nbits / 2 : nbits / 4; const int unrolled_loop_step = ur_ch * step; @@ -87,9 +89,9 @@ struct jit_uni_binarization_kernel : public jit_uni_quantize_kernel, public jit_ xor_(reg_bin_32, reg_bin_32); for (int ch = 0; ch < ur_ch; ch++) { - uni_vmovups(vmm_src(0), ptr[reg_from + ch*step*sizeof(float)]); - uni_vmovups(vmm_wei(0), ptr[reg_thresholds + ch*step*sizeof(float)]); - uni_vmovups(vmm_mask(0), ptr[reg_output_mask + ch*step*sizeof(float)]); + uni_vmovups(vmm_src(0), ptr[reg_from + ch * step * sizeof(float)]); + uni_vmovups(vmm_wei(0), ptr[reg_thresholds + ch * step * sizeof(float)]); + uni_vmovups(vmm_mask(0), ptr[reg_output_mask + ch * step * sizeof(float)]); if (isa == avx512_core) { vcmpps(k_mask0, vmm_src(0), vmm_wei(0), _cmp_gt_os); vptestmd(k_mask1, vmm_mask(0), vmm_mask(0)); @@ -105,16 +107,17 @@ struct jit_uni_binarization_kernel : public jit_uni_quantize_kernel, public jit_ } mov(ptr[reg_to], reg_bin_32); - add(reg_from, unrolled_loop_step*sizeof(float)); - add(reg_thresholds, unrolled_loop_step*sizeof(float)); - add(reg_output_mask, unrolled_loop_step*sizeof(float)); + add(reg_from, unrolled_loop_step * sizeof(float)); + add(reg_thresholds, unrolled_loop_step * sizeof(float)); + add(reg_output_mask, unrolled_loop_step * sizeof(float)); add(reg_to, sizeof(uint32_t)); sub(reg_work_amount, unrolled_loop_step); jmp(unrolled_loop_label, T_NEAR); } - L(main_loop_label); { + L(main_loop_label); + { int repeats = isa == cpu::x64::sse41 ? 2 : 1; int step = isa == cpu::x64::sse41 ? nbits / 2 : isa == cpu::x64::avx2 ? nbits : nbits * 2; const int main_loop_step = step * repeats; @@ -124,9 +127,9 @@ struct jit_uni_binarization_kernel : public jit_uni_quantize_kernel, public jit_ xor_(reg_bin_32, reg_bin_32); for (int i = 0; i < repeats; i++) { - uni_vmovups(vmm_src(0), ptr[reg_from + i*step*sizeof(float)]); - uni_vmovups(vmm_wei(0), ptr[reg_thresholds + i*step*sizeof(float)]); - uni_vmovups(vmm_mask(0), ptr[reg_output_mask + i*step*sizeof(float)]); + uni_vmovups(vmm_src(0), ptr[reg_from + i * step * sizeof(float)]); + uni_vmovups(vmm_wei(0), ptr[reg_thresholds + i * step * sizeof(float)]); + uni_vmovups(vmm_mask(0), ptr[reg_output_mask + i * step * sizeof(float)]); if (isa == avx512_core) { vcmpps(k_mask0, vmm_src(0), vmm_wei(0), _cmp_gt_os); vptestmd(k_mask1, vmm_mask(0), vmm_mask(0)); @@ -145,16 +148,17 @@ struct jit_uni_binarization_kernel : public jit_uni_quantize_kernel, public jit_ else mov(ptr[reg_to], reg_bin_8); - add(reg_from, main_loop_step*sizeof(float)); - add(reg_thresholds, main_loop_step*sizeof(float)); - add(reg_output_mask, main_loop_step*sizeof(float)); + add(reg_from, main_loop_step * sizeof(float)); + add(reg_thresholds, main_loop_step * sizeof(float)); + add(reg_output_mask, main_loop_step * sizeof(float)); add(reg_to, isa == avx512_core ? 
sizeof(uint16_t) : sizeof(uint8_t)); sub(reg_work_amount, main_loop_step); jmp(main_loop_label, T_NEAR); } - L(tail_label); { + L(tail_label); + { if (tail_size != 0) { xor_(reg_bin_32, reg_bin_32); mov(reg_mask, 1); @@ -188,15 +192,27 @@ struct jit_uni_binarization_kernel : public jit_uni_quantize_kernel, public jit_ } private: - using Vmm = typename conditional3::type; + using Vmm = + typename conditional3::type; - inline Vmm vmm_src(int idx) { return Vmm(idx); } - inline Xmm xmm_src(int idx) { return Xmm(idx); } - inline Vmm vmm_wei(int idx) { return Vmm(idx + 4); } - inline Vmm vmm_mask(int idx) { return Vmm(idx + 5); } - inline Xmm xmm_wei(int idx) { return Xmm(idx + 4); } - inline Xmm xmm_mask(int idx) { return Xmm(idx + 5); } + inline Vmm vmm_src(int idx) { + return Vmm(idx); + } + inline Xmm xmm_src(int idx) { + return Xmm(idx); + } + inline Vmm vmm_wei(int idx) { + return Vmm(idx + 4); + } + inline Vmm vmm_mask(int idx) { + return Vmm(idx + 5); + } + inline Xmm xmm_wei(int idx) { + return Xmm(idx + 4); + } + inline Xmm xmm_mask(int idx) { + return Xmm(idx + 5); + } Reg64 param = abi_param1; Reg64 reg_from = r8; @@ -219,7 +235,9 @@ template struct jit_uni_quantization_kernel : public jit_uni_quantize_kernel, public jit_generator { DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_uni_quantization_kernel) - explicit jit_uni_quantization_kernel(const jit_quantize_params& jqp) : jit_uni_quantize_kernel(jqp), jit_generator(jit_name()) {} + explicit jit_uni_quantization_kernel(const jit_quantize_params& jqp) + : jit_uni_quantize_kernel(jqp), + jit_generator(jit_name()) {} void create_ker() override { jit_generator::create_kernel(); @@ -237,37 +255,78 @@ struct jit_uni_quantization_kernel : public jit_uni_quantize_kernel, public jit_ else compute_generic(); - this->postamble(); } private: - using Vmm = typename conditional3::type; - - inline Vmm vmm_val(int idx) { return Vmm(idx + 0); } - inline Vmm vmm_crop_low(int idx) { return Vmm(idx + 2); } - inline Vmm vmm_crop_high(int idx) { return Vmm(idx + 4); } - inline Vmm vmm_input_scale(int idx) { return Vmm(idx + 6); } - inline Vmm vmm_input_shift(int idx) { return Vmm(idx + 8); } - inline Vmm vmm_output_scale(int idx) { return Vmm(idx + 10); } - inline Vmm vmm_output_shift(int idx) { return Vmm(idx + 12); } - - inline Ymm ymm_val(int idx) { return Ymm(idx + 0); } - inline Ymm ymm_crop_low(int idx) { return Ymm(idx + 2); } - inline Ymm ymm_crop_high(int idx) { return Ymm(idx + 4); } - inline Ymm ymm_input_scale(int idx) { return Ymm(idx + 6); } - inline Ymm ymm_input_shift(int idx) { return Ymm(idx + 8); } - inline Ymm ymm_output_scale(int idx) { return Ymm(idx + 10); } - inline Ymm ymm_output_shift(int idx) { return Ymm(idx + 12); } - - inline Xmm xmm_val(int idx) { return Xmm(idx + 0); } - inline Xmm xmm_crop_low(int idx) { return Xmm(idx + 2); } - inline Xmm xmm_crop_high(int idx) { return Xmm(idx + 4); } - inline Xmm xmm_input_scale(int idx) { return Xmm(idx + 6); } - inline Xmm xmm_input_shift(int idx) { return Xmm(idx + 8); } - inline Xmm xmm_output_scale(int idx) { return Xmm(idx + 10); } - inline Xmm xmm_output_shift(int idx) { return Xmm(idx + 12); } + using Vmm = + typename conditional3::type; + + inline Vmm vmm_val(int idx) { + return Vmm(idx + 0); + } + inline Vmm vmm_crop_low(int idx) { + return Vmm(idx + 2); + } + inline Vmm vmm_crop_high(int idx) { + return Vmm(idx + 4); + } + inline Vmm vmm_input_scale(int idx) { + return Vmm(idx + 6); + } + inline Vmm vmm_input_shift(int idx) { + return Vmm(idx + 8); + } + inline Vmm vmm_output_scale(int 
idx) { + return Vmm(idx + 10); + } + inline Vmm vmm_output_shift(int idx) { + return Vmm(idx + 12); + } + + inline Ymm ymm_val(int idx) { + return Ymm(idx + 0); + } + inline Ymm ymm_crop_low(int idx) { + return Ymm(idx + 2); + } + inline Ymm ymm_crop_high(int idx) { + return Ymm(idx + 4); + } + inline Ymm ymm_input_scale(int idx) { + return Ymm(idx + 6); + } + inline Ymm ymm_input_shift(int idx) { + return Ymm(idx + 8); + } + inline Ymm ymm_output_scale(int idx) { + return Ymm(idx + 10); + } + inline Ymm ymm_output_shift(int idx) { + return Ymm(idx + 12); + } + + inline Xmm xmm_val(int idx) { + return Xmm(idx + 0); + } + inline Xmm xmm_crop_low(int idx) { + return Xmm(idx + 2); + } + inline Xmm xmm_crop_high(int idx) { + return Xmm(idx + 4); + } + inline Xmm xmm_input_scale(int idx) { + return Xmm(idx + 6); + } + inline Xmm xmm_input_shift(int idx) { + return Xmm(idx + 8); + } + inline Xmm xmm_output_scale(int idx) { + return Xmm(idx + 10); + } + inline Xmm xmm_output_shift(int idx) { + return Xmm(idx + 12); + } Vmm vmm_zero = Vmm(14); @@ -296,24 +355,34 @@ struct jit_uni_quantization_kernel : public jit_uni_quantize_kernel, public jit_ bool do_dequantization = true; inline void load_broadcasted_vectors_only(size_t idx) { - const auto &broadcasted = jqp_.broadcasted; - if (broadcasted[static_cast(FQ_add_input_type::CROP_LOW)]) uni_vbroadcastss(vmm_crop_low(idx), ptr[reg_crop_low]); - if (broadcasted[static_cast(FQ_add_input_type::CROP_HIGH)]) uni_vbroadcastss(vmm_crop_high(idx), ptr[reg_crop_high]); - if (broadcasted[static_cast(FQ_add_input_type::INPUT_SCALE)]) uni_vbroadcastss(vmm_input_scale(idx), ptr[reg_input_scale]); - if (broadcasted[static_cast(FQ_add_input_type::INPUT_SHIFT)]) uni_vbroadcastss(vmm_input_shift(idx), ptr[reg_input_shift]); + const auto& broadcasted = jqp_.broadcasted; + if (broadcasted[static_cast(FQ_add_input_type::CROP_LOW)]) + uni_vbroadcastss(vmm_crop_low(idx), ptr[reg_crop_low]); + if (broadcasted[static_cast(FQ_add_input_type::CROP_HIGH)]) + uni_vbroadcastss(vmm_crop_high(idx), ptr[reg_crop_high]); + if (broadcasted[static_cast(FQ_add_input_type::INPUT_SCALE)]) + uni_vbroadcastss(vmm_input_scale(idx), ptr[reg_input_scale]); + if (broadcasted[static_cast(FQ_add_input_type::INPUT_SHIFT)]) + uni_vbroadcastss(vmm_input_shift(idx), ptr[reg_input_shift]); if (do_dequantization) { - if (broadcasted[static_cast(FQ_add_input_type::OUTPUT_SCALE)]) uni_vbroadcastss(vmm_output_scale(idx), ptr[reg_output_scale]); - if (broadcasted[static_cast(FQ_add_input_type::OUTPUT_SHIFT)]) uni_vbroadcastss(vmm_output_shift(idx), ptr[reg_output_shift]); + if (broadcasted[static_cast(FQ_add_input_type::OUTPUT_SCALE)]) + uni_vbroadcastss(vmm_output_scale(idx), ptr[reg_output_scale]); + if (broadcasted[static_cast(FQ_add_input_type::OUTPUT_SHIFT)]) + uni_vbroadcastss(vmm_output_shift(idx), ptr[reg_output_shift]); } } template inline void load_not_broadcasted_vectors_only(size_t idx, size_t offset) { - const auto &broadcasted = jqp_.broadcasted; - if (!broadcasted[static_cast(FQ_add_input_type::CROP_LOW)]) uni_vmovups(T(vmm_crop_low(idx).getIdx()), ptr[reg_crop_low + offset]); - if (!broadcasted[static_cast(FQ_add_input_type::CROP_HIGH)]) uni_vmovups(T(vmm_crop_high(idx).getIdx()), ptr[reg_crop_high + offset]); - if (!broadcasted[static_cast(FQ_add_input_type::INPUT_SCALE)]) uni_vmovups(T(vmm_input_scale(idx).getIdx()), ptr[reg_input_scale + offset]); - if (!broadcasted[static_cast(FQ_add_input_type::INPUT_SHIFT)]) uni_vmovups(T(vmm_input_shift(idx).getIdx()), ptr[reg_input_shift + 
offset]); + const auto& broadcasted = jqp_.broadcasted; + if (!broadcasted[static_cast(FQ_add_input_type::CROP_LOW)]) + uni_vmovups(T(vmm_crop_low(idx).getIdx()), ptr[reg_crop_low + offset]); + if (!broadcasted[static_cast(FQ_add_input_type::CROP_HIGH)]) + uni_vmovups(T(vmm_crop_high(idx).getIdx()), ptr[reg_crop_high + offset]); + if (!broadcasted[static_cast(FQ_add_input_type::INPUT_SCALE)]) + uni_vmovups(T(vmm_input_scale(idx).getIdx()), ptr[reg_input_scale + offset]); + if (!broadcasted[static_cast(FQ_add_input_type::INPUT_SHIFT)]) + uni_vmovups(T(vmm_input_shift(idx).getIdx()), ptr[reg_input_shift + offset]); if (do_dequantization) { if (!broadcasted[static_cast(FQ_add_input_type::OUTPUT_SCALE)]) uni_vmovups(T(vmm_output_scale(idx).getIdx()), ptr[reg_output_scale + offset]); @@ -323,14 +392,20 @@ struct jit_uni_quantization_kernel : public jit_uni_quantize_kernel, public jit_ } inline void increase_ptrs_if_not_broadcasted(size_t offset) { - const auto &broadcasted = jqp_.broadcasted; - if (!broadcasted[static_cast(FQ_add_input_type::CROP_LOW)]) add(reg_crop_low, offset); - if (!broadcasted[static_cast(FQ_add_input_type::CROP_HIGH)]) add(reg_crop_high, offset); - if (!broadcasted[static_cast(FQ_add_input_type::INPUT_SCALE)]) add(reg_input_scale, offset); - if (!broadcasted[static_cast(FQ_add_input_type::INPUT_SHIFT)]) add(reg_input_shift, offset); + const auto& broadcasted = jqp_.broadcasted; + if (!broadcasted[static_cast(FQ_add_input_type::CROP_LOW)]) + add(reg_crop_low, offset); + if (!broadcasted[static_cast(FQ_add_input_type::CROP_HIGH)]) + add(reg_crop_high, offset); + if (!broadcasted[static_cast(FQ_add_input_type::INPUT_SCALE)]) + add(reg_input_scale, offset); + if (!broadcasted[static_cast(FQ_add_input_type::INPUT_SHIFT)]) + add(reg_input_shift, offset); if (do_dequantization) { - if (!broadcasted[static_cast(FQ_add_input_type::OUTPUT_SCALE)]) add(reg_output_scale, offset); - if (!broadcasted[static_cast(FQ_add_input_type::OUTPUT_SHIFT)]) add(reg_output_shift, offset); + if (!broadcasted[static_cast(FQ_add_input_type::OUTPUT_SCALE)]) + add(reg_output_scale, offset); + if (!broadcasted[static_cast(FQ_add_input_type::OUTPUT_SHIFT)]) + add(reg_output_shift, offset); } } @@ -373,7 +448,8 @@ struct jit_uni_quantization_kernel : public jit_uni_quantize_kernel, public jit_ uni_vbroadcastss(vmm_output_shift(0), ptr[reg_output_shift]); } - L(main_loop_label); { + L(main_loop_label); + { cmp(reg_work_amount, simd_w); jl(tail_blk4_label, T_NEAR); @@ -383,8 +459,10 @@ struct jit_uni_quantization_kernel : public jit_uni_quantize_kernel, public jit_ uni_vminps(vmm_val(i), vmm_val(i), vmm_crop_high(0)); uni_vmaxps(vmm_val(i), vmm_val(i), vmm_crop_low(0)); uni_vfmadd213ps(vmm_val(i), vmm_input_scale(0), vmm_input_shift(0)); - if (do_rounding) uni_vroundps(vmm_val(i), vmm_val(i), 0); - if (do_dequantization) uni_vfmadd213ps(vmm_val(i), vmm_output_scale(0), vmm_output_shift(0)); + if (do_rounding) + uni_vroundps(vmm_val(i), vmm_val(i), 0); + if (do_dequantization) + uni_vfmadd213ps(vmm_val(i), vmm_output_scale(0), vmm_output_shift(0)); store_vector(ptr[reg_to + i * (simd_w / 2) * dst_type_size], vmm_val(i), jqp_.dst_prc); } @@ -396,7 +474,8 @@ struct jit_uni_quantization_kernel : public jit_uni_quantize_kernel, public jit_ jmp(main_loop_label, T_NEAR); } - L(tail_blk4_label); { + L(tail_blk4_label); + { cmp(reg_work_amount, tail_simd_w); jl(tail_blk4_exit_label, T_NEAR); @@ -405,8 +484,10 @@ struct jit_uni_quantization_kernel : public jit_uni_quantize_kernel, public jit_ uni_vminps(xmm_val(0), 
xmm_val(0), xmm_crop_high(0)); uni_vmaxps(xmm_val(0), xmm_val(0), xmm_crop_low(0)); uni_vfmadd213ps(xmm_val(0), xmm_input_scale(0), xmm_input_shift(0)); - if (do_rounding) uni_vroundps(xmm_val(0), xmm_val(0), 0); - if (do_dequantization) uni_vfmadd213ps(xmm_val(0), xmm_output_scale(0), xmm_output_shift(0)); + if (do_rounding) + uni_vroundps(xmm_val(0), xmm_val(0), 0); + if (do_dequantization) + uni_vfmadd213ps(xmm_val(0), xmm_output_scale(0), xmm_output_shift(0)); store_vector(ptr[reg_to], xmm_val(0), jqp_.dst_prc); @@ -420,7 +501,8 @@ struct jit_uni_quantization_kernel : public jit_uni_quantize_kernel, public jit_ mov(aux_reg_from, reg_from); mov(aux_reg_to, reg_to); - L(tail_loop_label); { + L(tail_loop_label); + { cmp(reg_work_amount, 0); jle(exit_label, T_NEAR); @@ -429,8 +511,10 @@ struct jit_uni_quantization_kernel : public jit_uni_quantize_kernel, public jit_ uni_vminps(xmm_val(0), xmm_val(0), xmm_crop_high(0)); uni_vmaxps(xmm_val(0), xmm_val(0), xmm_crop_low(0)); uni_vfmadd213ps(xmm_val(0), xmm_input_scale(0), xmm_input_shift(0)); - if (do_rounding) uni_vroundps(xmm_val(0), xmm_val(0), 0); - if (do_dequantization) uni_vfmadd213ps(xmm_val(0), xmm_output_scale(0), xmm_output_shift(0)); + if (do_rounding) + uni_vroundps(xmm_val(0), xmm_val(0), 0); + if (do_dequantization) + uni_vfmadd213ps(xmm_val(0), xmm_output_scale(0), xmm_output_shift(0)); store_scalar(ptr[aux_reg_to], xmm_val(0), jqp_.dst_prc); @@ -496,7 +580,8 @@ struct jit_uni_quantization_kernel : public jit_uni_quantize_kernel, public jit_ load_not_broadcasted_vectors_only(i, i * (simd_w / 2) * sizeof(float)); } - L(main_loop_label); { + L(main_loop_label); + { cmp(reg_work_amount, 0); jle(exit_label, T_NEAR); @@ -506,8 +591,10 @@ struct jit_uni_quantization_kernel : public jit_uni_quantize_kernel, public jit_ uni_vminps(vmm_val(i), vmm_val(i), vmm_crop_high(i)); uni_vmaxps(vmm_val(i), vmm_val(i), vmm_crop_low(i)); uni_vfmadd213ps(vmm_val(i), vmm_input_scale(i), vmm_input_shift(i)); - if (do_rounding) uni_vroundps(vmm_val(i), vmm_val(i), 0); - if (do_dequantization) uni_vfmadd213ps(vmm_val(i), vmm_output_scale(i), vmm_output_shift(i)); + if (do_rounding) + uni_vroundps(vmm_val(i), vmm_val(i), 0); + if (do_dequantization) + uni_vfmadd213ps(vmm_val(i), vmm_output_scale(i), vmm_output_shift(i)); store_vector(ptr[reg_to + i * (simd_w / 2) * dst_type_size], vmm_val(i), jqp_.dst_prc); } @@ -531,7 +618,8 @@ struct jit_uni_quantization_kernel : public jit_uni_quantize_kernel, public jit_ load_not_broadcasted_vectors_only(0, 0); - L(tail_blk8_loop_label); { + L(tail_blk8_loop_label); + { cmp(reg_work_amount, 0); jle(tail_blk8_exit_label, T_NEAR); @@ -540,8 +628,10 @@ struct jit_uni_quantization_kernel : public jit_uni_quantize_kernel, public jit_ uni_vminps(ymm_val(0), ymm_val(0), ymm_crop_high(0)); uni_vmaxps(ymm_val(0), ymm_val(0), ymm_crop_low(0)); uni_vfmadd213ps(ymm_val(0), ymm_input_scale(0), ymm_input_shift(0)); - if (do_rounding) uni_vroundps(ymm_val(0), ymm_val(0), 0); - if (do_dequantization) uni_vfmadd213ps(ymm_val(0), ymm_output_scale(0), ymm_output_shift(0)); + if (do_rounding) + uni_vroundps(ymm_val(0), ymm_val(0), 0); + if (do_dequantization) + uni_vfmadd213ps(ymm_val(0), ymm_output_scale(0), ymm_output_shift(0)); store_vector(ptr[aux_reg_to], ymm_val(0), jqp_.dst_prc); @@ -571,7 +661,8 @@ struct jit_uni_quantization_kernel : public jit_uni_quantize_kernel, public jit_ load_not_broadcasted_vectors_only(0, 0); - L(tail_blk4_loop_label); { + L(tail_blk4_loop_label); + { cmp(reg_work_amount, 0); jle(tail_blk4_exit_label, 
T_NEAR); @@ -580,8 +671,10 @@ struct jit_uni_quantization_kernel : public jit_uni_quantize_kernel, public jit_ uni_vminps(xmm_val(0), xmm_val(0), xmm_crop_high(0)); uni_vmaxps(xmm_val(0), xmm_val(0), xmm_crop_low(0)); uni_vfmadd213ps(xmm_val(0), xmm_input_scale(0), xmm_input_shift(0)); - if (do_rounding) uni_vroundps(xmm_val(0), xmm_val(0), 0); - if (do_dequantization) uni_vfmadd213ps(xmm_val(0), xmm_output_scale(0), xmm_output_shift(0)); + if (do_rounding) + uni_vroundps(xmm_val(0), xmm_val(0), 0); + if (do_dequantization) + uni_vfmadd213ps(xmm_val(0), xmm_output_scale(0), xmm_output_shift(0)); store_vector(ptr[aux_reg_to], xmm_val(0), jqp_.dst_prc); @@ -608,13 +701,14 @@ struct jit_uni_quantization_kernel : public jit_uni_quantize_kernel, public jit_ mov(aux_reg_from, reg_from); mov(reg_work_amount, ptr[param + GET_OFF(work_amount)]); - L(tail_loop_label); { + L(tail_loop_label); + { cmp(reg_work_amount, 0); jle(exit_label, T_NEAR); Label end_unroll; auto tail_unroll = [&](size_t iter) { - const auto &broadcasted = jqp_.broadcasted; + const auto& broadcasted = jqp_.broadcasted; for (size_t i = 0; i < iter; i++) { if (!broadcasted[static_cast(FQ_add_input_type::CROP_LOW)]) uni_vmovss(xmm_crop_low(0), ptr[reg_crop_low + i * wei_type_size]); @@ -636,8 +730,10 @@ struct jit_uni_quantization_kernel : public jit_uni_quantize_kernel, public jit_ uni_vminps(xmm_val(0), xmm_val(0), xmm_crop_high(0)); uni_vmaxps(xmm_val(0), xmm_val(0), xmm_crop_low(0)); uni_vfmadd213ps(xmm_val(0), xmm_input_scale(0), xmm_input_shift(0)); - if (do_rounding) uni_vroundps(xmm_val(0), xmm_val(0), 0); - if (do_dequantization) uni_vfmadd213ps(xmm_val(0), xmm_output_scale(0), xmm_output_shift(0)); + if (do_rounding) + uni_vroundps(xmm_val(0), xmm_val(0), 0); + if (do_dequantization) + uni_vfmadd213ps(xmm_val(0), xmm_output_scale(0), xmm_output_shift(0)); store_scalar(ptr[aux_reg_to + i * dst_type_size], xmm_val(0), jqp_.dst_prc); } @@ -667,20 +763,20 @@ struct jit_uni_quantization_kernel : public jit_uni_quantize_kernel, public jit_ L(exit_label); } - inline void load_vector(Zmm zmm_src, const Xbyak::Address &op, ov::element::Type src_prc) { + inline void load_vector(Zmm zmm_src, const Xbyak::Address& op, ov::element::Type src_prc) { switch (src_prc) { - case ov::element::f32: - case ov::element::i32: - uni_vmovups(zmm_src, op); - break; - case ov::element::i8: - uni_vpmovsxbd(zmm_src, op); - break; - case ov::element::u8: - uni_vpmovzxbd(zmm_src, op); - break; - default: - assert(!"unknown src_prc"); + case ov::element::f32: + case ov::element::i32: + uni_vmovups(zmm_src, op); + break; + case ov::element::i8: + uni_vpmovsxbd(zmm_src, op); + break; + case ov::element::u8: + uni_vpmovzxbd(zmm_src, op); + break; + default: + assert(!"unknown src_prc"); } if (src_prc != ov::element::f32) { @@ -688,20 +784,20 @@ struct jit_uni_quantization_kernel : public jit_uni_quantize_kernel, public jit_ } } - inline void load_vector(Ymm ymm_src, const Xbyak::Address &op, ov::element::Type src_prc) { + inline void load_vector(Ymm ymm_src, const Xbyak::Address& op, ov::element::Type src_prc) { switch (src_prc) { - case ov::element::f32: - case ov::element::i32: - uni_vmovups(ymm_src, op); - break; - case ov::element::i8: - uni_vpmovsxbd(ymm_src, op); - break; - case ov::element::u8: - uni_vpmovzxbd(ymm_src, op); - break; - default: - assert(!"unknown src_prc"); + case ov::element::f32: + case ov::element::i32: + uni_vmovups(ymm_src, op); + break; + case ov::element::i8: + uni_vpmovsxbd(ymm_src, op); + break; + case ov::element::u8: + 
uni_vpmovzxbd(ymm_src, op); + break; + default: + assert(!"unknown src_prc"); } if (src_prc != ov::element::f32) { @@ -709,20 +805,20 @@ struct jit_uni_quantization_kernel : public jit_uni_quantize_kernel, public jit_ } } - inline void load_vector(Xmm xmm_src, const Xbyak::Address &op, ov::element::Type src_prc) { + inline void load_vector(Xmm xmm_src, const Xbyak::Address& op, ov::element::Type src_prc) { switch (src_prc) { - case ov::element::f32: - case ov::element::i32: - uni_vmovups(xmm_src, op); - break; - case ov::element::i8: - uni_vpmovsxbd(xmm_src, op); - break; - case ov::element::u8: - uni_vpmovzxbd(xmm_src, op); - break; - default: - assert(!"unknown src_prc"); + case ov::element::f32: + case ov::element::i32: + uni_vmovups(xmm_src, op); + break; + case ov::element::i8: + uni_vpmovsxbd(xmm_src, op); + break; + case ov::element::u8: + uni_vpmovzxbd(xmm_src, op); + break; + default: + assert(!"unknown src_prc"); } if (src_prc != ov::element::f32) { @@ -730,22 +826,22 @@ struct jit_uni_quantization_kernel : public jit_uni_quantize_kernel, public jit_ } } - inline void load_scalar(Xmm xmm_src, const Xbyak::Address &op, ov::element::Type src_prc) { + inline void load_scalar(Xmm xmm_src, const Xbyak::Address& op, ov::element::Type src_prc) { switch (src_prc) { - case ov::element::f32: - case ov::element::i32: - uni_vmovss(xmm_src, op); - break; - case ov::element::i8: - movsx(reg_tmp_32, op); - uni_vmovq(xmm_src, reg_tmp_64); - break; - case ov::element::u8: - movzx(reg_tmp_32, op); - uni_vmovq(xmm_src, reg_tmp_64); - break; - default: - assert(!"unknown src_prc"); + case ov::element::f32: + case ov::element::i32: + uni_vmovss(xmm_src, op); + break; + case ov::element::i8: + movsx(reg_tmp_32, op); + uni_vmovq(xmm_src, reg_tmp_64); + break; + case ov::element::u8: + movzx(reg_tmp_32, op); + uni_vmovq(xmm_src, reg_tmp_64); + break; + default: + assert(!"unknown src_prc"); } if (src_prc != ov::element::f32) { @@ -753,29 +849,29 @@ struct jit_uni_quantization_kernel : public jit_uni_quantize_kernel, public jit_ } } - inline void store_vector(const Xbyak::Address &op, Zmm zmm_dst, ov::element::Type dst_prc) { + inline void store_vector(const Xbyak::Address& op, Zmm zmm_dst, ov::element::Type dst_prc) { if (dst_prc != ov::element::f32) { uni_vcvtps2dq(zmm_dst, zmm_dst); } switch (dst_prc) { - case ov::element::f32: - case ov::element::i32: - uni_vmovups(op, zmm_dst); - break; - case ov::element::i8: - vpmovsdb(op, zmm_dst); - break; - case ov::element::u8: - vpmaxsd(zmm_dst, zmm_dst, vmm_zero); - vpmovusdb(op, zmm_dst); - break; - default: - assert(!"unknown dst_prc"); - } - } - - inline void store_vector(const Xbyak::Address &op, Ymm ymm_dst, ov::element::Type dst_prc) { + case ov::element::f32: + case ov::element::i32: + uni_vmovups(op, zmm_dst); + break; + case ov::element::i8: + vpmovsdb(op, zmm_dst); + break; + case ov::element::u8: + vpmaxsd(zmm_dst, zmm_dst, vmm_zero); + vpmovusdb(op, zmm_dst); + break; + default: + assert(!"unknown dst_prc"); + } + } + + inline void store_vector(const Xbyak::Address& op, Ymm ymm_dst, ov::element::Type dst_prc) { Xmm xmm_dst = Xmm(ymm_dst.getIdx()); if (dst_prc != ov::element::f32) { @@ -783,82 +879,82 @@ struct jit_uni_quantization_kernel : public jit_uni_quantize_kernel, public jit_ } switch (dst_prc) { - case ov::element::f32: - case ov::element::i32: - uni_vmovups(op, ymm_dst); - break; - case ov::element::i8: - uni_vpackssdw(ymm_dst, ymm_dst, ymm_dst); + case ov::element::f32: + case ov::element::i32: + uni_vmovups(op, ymm_dst); + break; + 
case ov::element::i8: + uni_vpackssdw(ymm_dst, ymm_dst, ymm_dst); - vpermq(ymm_dst, ymm_dst, 0x08); + vpermq(ymm_dst, ymm_dst, 0x08); - uni_vpacksswb(ymm_dst, ymm_dst, ymm_dst); + uni_vpacksswb(ymm_dst, ymm_dst, ymm_dst); - vmovq(op, xmm_dst); - break; - case ov::element::u8: - uni_vpackusdw(ymm_dst, ymm_dst, ymm_dst); + vmovq(op, xmm_dst); + break; + case ov::element::u8: + uni_vpackusdw(ymm_dst, ymm_dst, ymm_dst); - vpermq(ymm_dst, ymm_dst, 0x08); + vpermq(ymm_dst, ymm_dst, 0x08); - uni_vpackuswb(ymm_dst, ymm_dst, ymm_dst); + uni_vpackuswb(ymm_dst, ymm_dst, ymm_dst); - vmovq(op, xmm_dst); - break; - default: - assert(!"unknown dst_prc"); + vmovq(op, xmm_dst); + break; + default: + assert(!"unknown dst_prc"); } } - inline void store_vector(const Xbyak::Address &op, Xmm xmm_dst, ov::element::Type dst_prc) { + inline void store_vector(const Xbyak::Address& op, Xmm xmm_dst, ov::element::Type dst_prc) { if (dst_prc != ov::element::f32) { uni_vcvtps2dq(xmm_dst, xmm_dst); } switch (dst_prc) { - case ov::element::f32: - case ov::element::i32: - uni_vmovups(op, xmm_dst); - break; - case ov::element::i8: - uni_vpackssdw(xmm_dst, xmm_dst, xmm_dst); - uni_vpacksswb(xmm_dst, xmm_dst, xmm_dst); - uni_vmovd(op, xmm_dst); - break; - case ov::element::u8: - uni_vpackusdw(xmm_dst, xmm_dst, xmm_dst); - uni_vpackuswb(xmm_dst, xmm_dst, xmm_dst); - uni_vmovd(op, xmm_dst); - break; - default: - assert(!"unknown dst_prc"); - } - } - - inline void store_scalar(const Xbyak::Address &op, Xmm xmm_dst, ov::element::Type dst_prc) { + case ov::element::f32: + case ov::element::i32: + uni_vmovups(op, xmm_dst); + break; + case ov::element::i8: + uni_vpackssdw(xmm_dst, xmm_dst, xmm_dst); + uni_vpacksswb(xmm_dst, xmm_dst, xmm_dst); + uni_vmovd(op, xmm_dst); + break; + case ov::element::u8: + uni_vpackusdw(xmm_dst, xmm_dst, xmm_dst); + uni_vpackuswb(xmm_dst, xmm_dst, xmm_dst); + uni_vmovd(op, xmm_dst); + break; + default: + assert(!"unknown dst_prc"); + } + } + + inline void store_scalar(const Xbyak::Address& op, Xmm xmm_dst, ov::element::Type dst_prc) { if (dst_prc != ov::element::f32) { uni_vcvtps2dq(xmm_dst, xmm_dst); } switch (dst_prc) { - case ov::element::f32: - case ov::element::i32: - uni_vmovss(op, xmm_dst); - break; - case ov::element::i8: - uni_vpackssdw(xmm_dst, xmm_dst, xmm_dst); - uni_vpacksswb(xmm_dst, xmm_dst, xmm_dst); - uni_vmovq(reg_tmp_64, xmm_dst); - mov(op, reg_tmp_8); - break; - case ov::element::u8: - uni_vpackusdw(xmm_dst, xmm_dst, xmm_dst); - uni_vpackuswb(xmm_dst, xmm_dst, xmm_dst); - uni_vmovq(reg_tmp_64, xmm_dst); - mov(op, reg_tmp_8); - break; - default: - assert(!"unknown dst_prc"); + case ov::element::f32: + case ov::element::i32: + uni_vmovss(op, xmm_dst); + break; + case ov::element::i8: + uni_vpackssdw(xmm_dst, xmm_dst, xmm_dst); + uni_vpacksswb(xmm_dst, xmm_dst, xmm_dst); + uni_vmovq(reg_tmp_64, xmm_dst); + mov(op, reg_tmp_8); + break; + case ov::element::u8: + uni_vpackusdw(xmm_dst, xmm_dst, xmm_dst); + uni_vpackuswb(xmm_dst, xmm_dst, xmm_dst); + uni_vmovq(reg_tmp_64, xmm_dst); + mov(op, reg_tmp_8); + break; + default: + assert(!"unknown dst_prc"); } } }; @@ -877,7 +973,8 @@ bool FakeQuantize::isSupportedOperation(const std::shared_ptr& o } for (size_t i = 1; i < fq->get_input_size(); i++) { if (fq->get_input_partial_shape(i).rank().get_length() > 5) { - errorMessage = "Doesn't support 'range' input with rank: " + std::to_string(fq->get_input_partial_shape(i).rank().get_length()); + errorMessage = "Doesn't support 'range' input with rank: " + + 
std::to_string(fq->get_input_partial_shape(i).rank().get_length()); return false; } } @@ -935,7 +1032,7 @@ struct FakeQuantKey { seed = hash_combine(seed, jqp.wei_prc.hash()); seed = hash_combine(seed, jqp.dst_prc.hash()); seed = hash_combine(seed, jqp.op_type); - if (jqp.op_type == Algorithm::FQBinarization) { + if (jqp.op_type == Algorithm::FQBinarization) { seed = hash_combine(seed, jqp.c); } else { seed = hash_combine(seed, jqp.broadcasted); @@ -959,8 +1056,8 @@ struct FakeQuantKey { }; } // namespace -FakeQuantize::FakeQuantize(const std::shared_ptr& op, const GraphContext::CPtr context) : - Node(op, context, PassThroughShapeInferFactory()) { +FakeQuantize::FakeQuantize(const std::shared_ptr& op, const GraphContext::CPtr context) + : Node(op, context, PassThroughShapeInferFactory()) { std::string errorMessage; if (isSupportedOperation(op, errorMessage)) { algorithm = Algorithm::FQCommon; @@ -1032,16 +1129,20 @@ FakeQuantize::FakeQuantize(const std::shared_ptr& op, const GraphConte OPENVINO_THROW(errorPrefix, "has different quantization axis size on 'data' and 'range' inputs"); } - const auto inputLowNode = std::dynamic_pointer_cast(fq->get_input_node_shared_ptr(1)); + const auto inputLowNode = + std::dynamic_pointer_cast(fq->get_input_node_shared_ptr(1)); auto inputLowData = inputLowNode->cast_vector(); - const auto inputHighNode = std::dynamic_pointer_cast(fq->get_input_node_shared_ptr(2)); + const auto inputHighNode = + std::dynamic_pointer_cast(fq->get_input_node_shared_ptr(2)); auto inputHighData = inputHighNode->cast_vector(); - const auto outputLowNode = std::dynamic_pointer_cast(fq->get_input_node_shared_ptr(3)); + const auto outputLowNode = + std::dynamic_pointer_cast(fq->get_input_node_shared_ptr(3)); auto outputLowData = outputLowNode->cast_vector(); - const auto outputHighNode = std::dynamic_pointer_cast(fq->get_input_node_shared_ptr(4)); + const auto outputHighNode = + std::dynamic_pointer_cast(fq->get_input_node_shared_ptr(4)); auto outputHighData = outputHighNode->cast_vector(); binarization = levels == 2; @@ -1092,7 +1193,7 @@ FakeQuantize::FakeQuantize(const std::shared_ptr& op, const GraphConte } } } else { - auto allElementsAreEqual = [&](const std::vector &data, size_t size) { + auto allElementsAreEqual = [&](const std::vector& data, size_t size) { if (size == 0) return true; @@ -1146,9 +1247,21 @@ FakeQuantize::FakeQuantize(const std::shared_ptr& op, const GraphConte broadcasted[static_cast(FQ_add_input_type::OUTPUT_SCALE)] = outputScaleSize == 1; broadcasted[static_cast(FQ_add_input_type::OUTPUT_SHIFT)] = outputShiftSize == 1; - if (everyone_is(1u, cropLowSize, cropHighSize, inputScaleSize, inputShiftSize, outputScaleSize, outputShiftSize)) + if (everyone_is(1u, + cropLowSize, + cropHighSize, + inputScaleSize, + inputShiftSize, + outputScaleSize, + outputShiftSize)) broadcastingPolicy = PerTensor; - else if (one_of(1u, cropLowSize, cropHighSize, inputScaleSize, inputShiftSize, outputScaleSize, outputShiftSize)) + else if (one_of(1u, + cropLowSize, + cropHighSize, + inputScaleSize, + inputShiftSize, + outputScaleSize, + outputShiftSize)) broadcastingPolicy = Mixed; else broadcastingPolicy = PerChannel; @@ -1224,7 +1337,10 @@ FakeQuantize::FakeQuantize(const std::shared_ptr& op, const GraphConte bool isFakeQuantization = true; bool isFakeQuantizationWithScale = true; - for (size_t i = 0; i < std::max(inputLowAxisSize, std::max(outputLowAxisSize, std::max(inputHighAxisSize, outputHighAxisSize))); i++) { + for (size_t i = 0; + i < std::max(inputLowAxisSize, + 
std::max(outputLowAxisSize, std::max(inputHighAxisSize, outputHighAxisSize))); + i++) { float il = inputLowData[isInputLowBroadcasted ? 0 : i]; float ol = outputLowData[isOutputLowBroadcasted ? 0 : i]; float ih = inputHighData[isInputHighBroadcasted ? 0 : i]; @@ -1236,7 +1352,10 @@ FakeQuantize::FakeQuantize(const std::shared_ptr& op, const GraphConte } if (isFakeQuantizationWithScale) { - for (size_t i = 0; i < std::max(inputLowAxisSize, std::max(outputLowAxisSize, std::max(inputHighAxisSize, outputHighAxisSize))); i++) { + for (size_t i = 0; + i < std::max(inputLowAxisSize, + std::max(outputLowAxisSize, std::max(inputHighAxisSize, outputHighAxisSize))); + i++) { float il = inputLowData[isInputLowBroadcasted ? 0 : i]; float ol = outputLowData[isOutputLowBroadcasted ? 0 : i]; float ih = inputHighData[isInputHighBroadcasted ? 0 : i]; @@ -1255,22 +1374,22 @@ FakeQuantize::FakeQuantize(const std::shared_ptr& op, const GraphConte std::vector FakeQuantize::getDataFormats() const { // Special case for first FQ in the network - const auto &dims = getInputShapeAtPort(0).getDims(); + const auto& dims = getInputShapeAtPort(0).getDims(); if (dims[getAxis()] == 3) { - return { LayoutType::ncsp }; + return {LayoutType::ncsp}; } else { if (isBinarization()) { - return { LayoutType::nspc }; + return {LayoutType::nspc}; } else { if (one_of(dims.size(), 4u, 5u)) { if (getAxis() == 1) { auto blkFormat = mayiuse(cpu::x64::avx512_core) ? LayoutType::nCsp16c : LayoutType::nCsp8c; - return { blkFormat, LayoutType::nspc, LayoutType::ncsp }; + return {blkFormat, LayoutType::nspc, LayoutType::ncsp}; } else { - return { LayoutType::ncsp }; + return {LayoutType::ncsp}; } } else { - return { LayoutType::ncsp }; + return {LayoutType::ncsp}; } } } @@ -1284,10 +1403,12 @@ void FakeQuantize::init() { inputPrecision = getOriginalInputPrecisionAtPort(0); outputPrecision = getOriginalOutputPrecisionAtPort(0); - if (inputPrecision != ov::element::f32 && inputPrecision != ov::element::u8 && inputPrecision != ov::element::i8) + if (inputPrecision != ov::element::f32 && inputPrecision != ov::element::u8 && + inputPrecision != ov::element::i8) inputPrecision = ov::element::f32; - if (outputPrecision != ov::element::f32 && outputPrecision != ov::element::u8 && outputPrecision != ov::element::i8) + if (outputPrecision != ov::element::f32 && outputPrecision != ov::element::u8 && + outputPrecision != ov::element::i8) outputPrecision = ov::element::f32; } } @@ -1381,7 +1502,8 @@ bool FakeQuantize::needPrepareParams() const { if (!selectedPrimitiveDescriptor) OPENVINO_THROW("CPU quantize node with name '", getName(), "' doesn't have primitive descriptors."); - if (internalBlobMemory.empty() || (selectedPrimitiveDescriptor->getImplementationType() != impl_desc_type::ref && inputShapesModified())) { + if (internalBlobMemory.empty() || + (selectedPrimitiveDescriptor->getImplementationType() != impl_desc_type::ref && inputShapesModified())) { return true; } @@ -1389,7 +1511,8 @@ bool FakeQuantize::needPrepareParams() const { const auto newPaddedSize = rnd_up(axisSize, 16); const auto currPaddedSize = rnd_up(currentAxisSize, 16); - return newPaddedSize != currPaddedSize || ((isInputLowBroadcasted || isOutputHighBroadcasted) && axisSize != currentAxisSize); + return newPaddedSize != currPaddedSize || + ((isInputLowBroadcasted || isOutputHighBroadcasted) && axisSize != currentAxisSize); } return false; } @@ -1401,26 +1524,33 @@ void FakeQuantize::prepareParams() { OPENVINO_ASSERT(newPaddedSize != 0); if (internalBlobMemory.empty() || 
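// Minimal sketch (assumption, mirroring the utils rnd_up used by needPrepareParams above): the
// rebuild check compares 16-element padded channel sizes, so e.g. axis sizes 17..32 all map to
// the same padded size and do not force re-preparation on their own.
#include <cstddef>

static size_t rnd_up_sketch(size_t value, size_t step) {
    return ((value + step - 1) / step) * step;  // round up to the next multiple of step
}

// rnd_up_sketch(3, 16) == 16, rnd_up_sketch(16, 16) == 16, rnd_up_sketch(17, 16) == 32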
newPaddedSize != rnd_up(currentAxisSize, 16) || - ((isInputLowBroadcasted || isOutputHighBroadcasted) && axisSize != currentAxisSize)) { - DnnlBlockedMemoryDesc weightsDataDesc(Shape(VectorDims{newPaddedSize}), memory::data_type::f32, memory::format_tag::x); + ((isInputLowBroadcasted || isOutputHighBroadcasted) && axisSize != currentAxisSize)) { + DnnlBlockedMemoryDesc weightsDataDesc(Shape(VectorDims{newPaddedSize}), + memory::data_type::f32, + memory::format_tag::x); constexpr size_t numBinFqIntBlob = 2; bool needUpdThr = false, needUpdMask = false; if (isInputLowBroadcasted && axisSize != currentAxisSize) { binarizationThresholds.resize(newPaddedSize); - std::fill(binarizationThresholds.begin() + 1, binarizationThresholds.begin() + axisSize, binarizationThresholds[0]); + std::fill(binarizationThresholds.begin() + 1, + binarizationThresholds.begin() + axisSize, + binarizationThresholds[0]); std::fill(binarizationThresholds.begin() + axisSize, binarizationThresholds.end(), 0.f); needUpdThr = true; } if (isOutputHighBroadcasted && axisSize != currentAxisSize) { binarizationOutputMask.resize(newPaddedSize); - std::fill(binarizationOutputMask.begin() + 1, binarizationOutputMask.begin() + axisSize, binarizationOutputMask[0]); + std::fill(binarizationOutputMask.begin() + 1, + binarizationOutputMask.begin() + axisSize, + binarizationOutputMask[0]); std::fill(binarizationOutputMask.begin() + axisSize, binarizationOutputMask.end(), 0); needUpdMask = true; } if (internalBlobMemory.empty() || needUpdThr) { - auto binarizationThresholdsDataMem = std::make_shared(getEngine(), weightsDataDesc, getBinarizationTresholdsPtr()); + auto binarizationThresholdsDataMem = + std::make_shared(getEngine(), weightsDataDesc, getBinarizationTresholdsPtr()); if (internalBlobMemory.empty()) { internalBlobMemory.push_back(binarizationThresholdsDataMem); } else { @@ -1429,7 +1559,8 @@ void FakeQuantize::prepareParams() { } if (internalBlobMemory.size() == (numBinFqIntBlob - 1) || needUpdMask) { - auto binarizationMaskDataMem = std::make_shared(getEngine(), weightsDataDesc, getBinarizationOutputMaskPtr()); + auto binarizationMaskDataMem = + std::make_shared(getEngine(), weightsDataDesc, getBinarizationOutputMaskPtr()); if (internalBlobMemory.size() == (numBinFqIntBlob - 1)) { internalBlobMemory.push_back(binarizationMaskDataMem); } else { @@ -1449,31 +1580,39 @@ void FakeQuantize::createPrimitive() { if (selectedPrimitiveDescriptor->getImplementationType() != impl_desc_type::ref) { const auto& config = getSelectedPrimitiveDescriptor()->getConfig(); - //Form FakeQuanKey + // Form FakeQuanKey FakeQuantKey key = {}; key.jqp.src_prc = config.inConfs[0].getMemDesc()->getPrecision(); key.jqp.wei_prc = ov::element::f32; key.jqp.dst_prc = config.outConfs[0].getMemDesc()->getPrecision(); - const auto &srcMemory = getParentEdgeAt(0)->getMemory(); - const auto &srcDesc = srcMemory.getDesc(); + const auto& srcMemory = getParentEdgeAt(0)->getMemory(); + const auto& srcDesc = srcMemory.getDesc(); key.jqp.is_planar = srcDesc.hasLayoutType(LayoutType::ncsp) && one_of(srcDesc.getShape().getRank(), 3u, 4u, 5u); key.jqp.op_type = getAlgorithm(); if (isBinarization()) { - const auto &inDims = srcMemory.getStaticDims(); + const auto& inDims = srcMemory.getStaticDims(); key.jqp.c = inDims.size() > 1 ? inDims[1] : 1; } else { - // in case of blocked layout we need to extend vectors to prevent read from unallocated memory - size_t paddedSize = srcDesc.hasLayoutType(LayoutType::nCsp16c) ? 16 : srcDesc.hasLayoutType(LayoutType::nCsp8c) ? 
8 : 1; + // in case of blocked layout we need to extend vectors to prevent read from unallocated memory + size_t paddedSize = srcDesc.hasLayoutType(LayoutType::nCsp16c) ? 16 + : srcDesc.hasLayoutType(LayoutType::nCsp8c) ? 8 + : 1; if (paddedSize != 1) { - if (!broadcasted[static_cast(FQ_add_input_type::CROP_LOW)]) cropLow.resize(rnd_up(cropLow.size(), paddedSize)); - if (!broadcasted[static_cast(FQ_add_input_type::CROP_HIGH)]) cropHigh.resize(rnd_up(cropHigh.size(), paddedSize)); - if (!broadcasted[static_cast(FQ_add_input_type::INPUT_SCALE)]) inputScale.resize(rnd_up(inputScale.size(), paddedSize)); - if (!broadcasted[static_cast(FQ_add_input_type::INPUT_SHIFT)]) inputShift.resize(rnd_up(inputShift.size(), paddedSize)); - if (!broadcasted[static_cast(FQ_add_input_type::OUTPUT_SCALE)]) outputScale.resize(rnd_up(outputScale.size(), paddedSize)); - if (!broadcasted[static_cast(FQ_add_input_type::OUTPUT_SHIFT)]) outputShift.resize(rnd_up(outputShift.size(), paddedSize)); + if (!broadcasted[static_cast(FQ_add_input_type::CROP_LOW)]) + cropLow.resize(rnd_up(cropLow.size(), paddedSize)); + if (!broadcasted[static_cast(FQ_add_input_type::CROP_HIGH)]) + cropHigh.resize(rnd_up(cropHigh.size(), paddedSize)); + if (!broadcasted[static_cast(FQ_add_input_type::INPUT_SCALE)]) + inputScale.resize(rnd_up(inputScale.size(), paddedSize)); + if (!broadcasted[static_cast(FQ_add_input_type::INPUT_SHIFT)]) + inputShift.resize(rnd_up(inputShift.size(), paddedSize)); + if (!broadcasted[static_cast(FQ_add_input_type::OUTPUT_SCALE)]) + outputScale.resize(rnd_up(outputScale.size(), paddedSize)); + if (!broadcasted[static_cast(FQ_add_input_type::OUTPUT_SHIFT)]) + outputShift.resize(rnd_up(outputShift.size(), paddedSize)); } key.jqp.broadcasted = broadcasted; @@ -1530,11 +1669,10 @@ void FakeQuantize::executeReference() { parallel_nd(N, CB, D, H, W, [&](dim_t n, dim_t cb, dim_t d, dim_t h, dim_t w) { uint8_t bin_val = 0x00; for (int c = cb * nbits, shift = 0; c < std::min(static_cast(C), (cb + 1) * nbits); c++, shift++) { - size_t src_off = srcDims.size() == 4 ? - n * s_str[0] + c * s_str[1] + h * s_str[2] + w * s_str[3] : - srcDims.size() == 5 ? - n * s_str[0] + c * s_str[1] + d * s_str[2] + h * s_str[3] + w * s_str[4] : - n * s_str[0] + c * s_str[1]; + size_t src_off = srcDims.size() == 4 ? n * s_str[0] + c * s_str[1] + h * s_str[2] + w * s_str[3] + : srcDims.size() == 5 + ? n * s_str[0] + c * s_str[1] + d * s_str[2] + h * s_str[3] + w * s_str[4] + : n * s_str[0] + c * s_str[1]; float val = src[src_off]; float thr = thresholds[c]; @@ -1546,11 +1684,10 @@ void FakeQuantize::executeReference() { bin_val |= (bit << shift); } - size_t dst_off = dstDims.size() == 4 ? - n * d_str[0] + (cb * nbits) * d_str[1] + h * d_str[2] + w * d_str[3] : - dstDims.size() == 5 ? - n * d_str[0] + (cb * nbits) * d_str[1] + d * d_str[2] + h * d_str[3] + w * d_str[4] : - n * d_str[0] + (cb * nbits) * d_str[1]; + size_t dst_off = dstDims.size() == 4 ? n * d_str[0] + (cb * nbits) * d_str[1] + h * d_str[2] + w * d_str[3] + : dstDims.size() == 5 + ? n * d_str[0] + (cb * nbits) * d_str[1] + d * d_str[2] + h * d_str[3] + w * d_str[4] + : n * d_str[0] + (cb * nbits) * d_str[1]; dst[dst_off / nbits] = bin_val; }); @@ -1558,46 +1695,44 @@ void FakeQuantize::executeReference() { auto dst = dstMemory->getDataAs(); parallel_nd(N, C, D, H, W, [&](dim_t n, dim_t c, dim_t d, dim_t h, dim_t w) { - size_t src_off = srcDims.size() == 5 ? - n * s_str[0] + c * s_str[1] + d * s_str[2] + h * s_str[3] + w * s_str[4] : - srcDims.size() == 4 ? 
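// Illustrative sketch (assumption, hypothetical helper) of the reference binarization loop above:
// each group of nbits == 8 channels is compared against its per-channel threshold, the per-channel
// output mask decides the bit polarity, and the bits are packed into a single output byte.
#include <algorithm>
#include <cstddef>
#include <cstdint>

static uint8_t binarize_block(const float* src, const float* thresholds,
                              const uint32_t* out_mask, size_t channels) {
    uint8_t bin_val = 0x00;
    for (size_t c = 0, shift = 0; c < std::min<size_t>(channels, 8); ++c, ++shift) {
        uint32_t cmp = (src[c] > thresholds[c]) ? 0xffffffff : 0x00000000;
        uint8_t bit = (cmp == out_mask[c]) ? 1 : 0;          // mask flips polarity per channel
        bin_val |= static_cast<uint8_t>(bit << shift);       // pack 8 channels into one byte
    }
    return bin_val;
}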
- n * s_str[0] + c * s_str[1] + h * s_str[2] + w * s_str[3] : - srcDims.size() == 3 ? - n * s_str[0] + c * s_str[1] + h * s_str[2] : - srcDims.size() == 2 ? - n * s_str[0] + c * s_str[1] : - n * s_str[0]; + size_t src_off = srcDims.size() == 5 + ? n * s_str[0] + c * s_str[1] + d * s_str[2] + h * s_str[3] + w * s_str[4] + : srcDims.size() == 4 ? n * s_str[0] + c * s_str[1] + h * s_str[2] + w * s_str[3] + : srcDims.size() == 3 ? n * s_str[0] + c * s_str[1] + h * s_str[2] + : srcDims.size() == 2 ? n * s_str[0] + c * s_str[1] + : n * s_str[0]; float src_val = src[src_off]; int wei_idx = getAxis() == 0 ? n : c; float cl = broadcasted[static_cast(FQ_add_input_type::CROP_LOW)] ? cropLow[0] : cropLow[wei_idx]; float ch = broadcasted[static_cast(FQ_add_input_type::CROP_HIGH)] ? cropHigh[0] : cropHigh[wei_idx]; - float isc = broadcasted[static_cast(FQ_add_input_type::INPUT_SCALE)] ? inputScale[0] : inputScale[wei_idx]; - float ish = broadcasted[static_cast(FQ_add_input_type::INPUT_SHIFT)] ? inputShift[0] : inputShift[wei_idx]; - float osc = broadcasted[static_cast(FQ_add_input_type::OUTPUT_SCALE)] ? outputScale[0] : outputScale[wei_idx]; - float osh = broadcasted[static_cast(FQ_add_input_type::OUTPUT_SHIFT)] ? outputShift[0] : outputShift[wei_idx]; + float isc = + broadcasted[static_cast(FQ_add_input_type::INPUT_SCALE)] ? inputScale[0] : inputScale[wei_idx]; + float ish = + broadcasted[static_cast(FQ_add_input_type::INPUT_SHIFT)] ? inputShift[0] : inputShift[wei_idx]; + float osc = broadcasted[static_cast(FQ_add_input_type::OUTPUT_SCALE)] ? outputScale[0] + : outputScale[wei_idx]; + float osh = broadcasted[static_cast(FQ_add_input_type::OUTPUT_SHIFT)] ? outputShift[0] + : outputShift[wei_idx]; float dst_val = nstl::min(ch, nstl::max(cl, src_val)); dst_val = dst_val * isc + ish; dst_val = roundf(dst_val); dst_val = dst_val * osc + osh; - size_t dst_off = dstDims.size() == 5 ? - n * d_str[0] + c * d_str[1] + d * d_str[2] + h * d_str[3] + w * d_str[4] : - dstDims.size() == 4 ? - n * d_str[0] + c * d_str[1] + h * d_str[2] + w * d_str[3] : - dstDims.size() == 3 ? - n * d_str[0] + c * d_str[1] + h * d_str[2] : - dstDims.size() == 2 ? - n * d_str[0] + c * d_str[1] : - n * d_str[0]; + size_t dst_off = dstDims.size() == 5 + ? n * d_str[0] + c * d_str[1] + d * d_str[2] + h * d_str[3] + w * d_str[4] + : dstDims.size() == 4 ? n * d_str[0] + c * d_str[1] + h * d_str[2] + w * d_str[3] + : dstDims.size() == 3 ? n * d_str[0] + c * d_str[1] + h * d_str[2] + : dstDims.size() == 2 ? n * d_str[0] + c * d_str[1] + : n * d_str[0]; dst[dst_off] = dst_val; }); } } -void FakeQuantize::executeBinarization(const std::unique_ptr &pKernel) const { +void FakeQuantize::executeBinarization(const std::unique_ptr& pKernel) const { #if defined(OPENVINO_ARCH_X86_64) auto srcMemory = getSrcMemoryAtPort(0); auto dstMemory = getDstMemoryAtPort(0); @@ -1628,8 +1763,8 @@ void FakeQuantize::executeBinarization(const std::unique_ptr &pKernel) const { +void FakeQuantize::executeQuantization(const std::unique_ptr& pKernel) const { #if defined(OPENVINO_ARCH_X86_64) auto srcMemory = getSrcMemoryAtPort(0); auto dstMemory = getDstMemoryAtPort(0); @@ -1651,10 +1786,11 @@ void FakeQuantize::executeQuantization(const std::unique_ptrjqp_; + const auto& jqp = pKernel->jqp_; auto src_type_size = jqp.src_prc.size(); auto dst_type_size = jqp.dst_prc.size(); @@ -1691,15 +1827,20 @@ void FakeQuantize::executeQuantization(const std::unique_ptr(FQ_add_input_type::CROP_LOW)] ? 
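// Minimal scalar sketch (assumption, hypothetical helper) of the non-binarized reference path
// above: crop to [cl, ch], apply input scale/shift, round to the quantization grid, then apply
// output scale/shift.
#include <algorithm>
#include <cmath>

static float fake_quantize_ref(float x, float cl, float ch,
                               float isc, float ish, float osc, float osh) {
    float v = std::min(ch, std::max(cl, x));  // crop
    v = v * isc + ish;                        // map into the quantized grid
    v = std::roundf(v);                       // snap to integer levels
    return v * osc + osh;                     // map back to the output range
}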
&cropLow[0] : &cropLow[c]; - arg.crop_high = broadcasted[static_cast(FQ_add_input_type::CROP_HIGH)] ? &cropHigh[0] : &cropHigh[c]; - arg.input_scale = broadcasted[static_cast(FQ_add_input_type::INPUT_SCALE)] ? &inputScale[0] : &inputScale[c]; - arg.input_shift = broadcasted[static_cast(FQ_add_input_type::INPUT_SHIFT)] ? &inputShift[0] : &inputShift[c]; - arg.output_scale = broadcasted[static_cast(FQ_add_input_type::OUTPUT_SCALE)] ? &outputScale[0] : &outputScale[c]; - arg.output_shift = broadcasted[static_cast(FQ_add_input_type::OUTPUT_SHIFT)] ? &outputShift[0] : &outputShift[c]; - - arg.src_step = (size_t) blk_size * src_type_size; - arg.dst_step = (size_t) blk_size * dst_type_size; - arg.block_size = (size_t) blk_size; + arg.crop_high = + broadcasted[static_cast(FQ_add_input_type::CROP_HIGH)] ? &cropHigh[0] : &cropHigh[c]; + arg.input_scale = + broadcasted[static_cast(FQ_add_input_type::INPUT_SCALE)] ? &inputScale[0] : &inputScale[c]; + arg.input_shift = + broadcasted[static_cast(FQ_add_input_type::INPUT_SHIFT)] ? &inputShift[0] : &inputShift[c]; + arg.output_scale = + broadcasted[static_cast(FQ_add_input_type::OUTPUT_SCALE)] ? &outputScale[0] : &outputScale[c]; + arg.output_shift = + broadcasted[static_cast(FQ_add_input_type::OUTPUT_SHIFT)] ? &outputShift[0] : &outputShift[c]; + + arg.src_step = (size_t)blk_size * src_type_size; + arg.dst_step = (size_t)blk_size * dst_type_size; + arg.block_size = (size_t)blk_size; arg.work_amount = (size_t)H; (*pKernel)(&arg); @@ -1714,22 +1855,27 @@ void FakeQuantize::executeQuantization(const std::unique_ptr(FQ_add_input_type::CROP_LOW)] ? &cropLow[0] : &cropLow[c]; - arg.crop_high = broadcasted[static_cast(FQ_add_input_type::CROP_HIGH)] ? &cropHigh[0] : &cropHigh[c]; - arg.input_scale = broadcasted[static_cast(FQ_add_input_type::INPUT_SCALE)] ? &inputScale[0] : &inputScale[c]; - arg.input_shift = broadcasted[static_cast(FQ_add_input_type::INPUT_SHIFT)] ? &inputShift[0] : &inputShift[c]; - arg.output_scale = broadcasted[static_cast(FQ_add_input_type::OUTPUT_SCALE)] ? &outputScale[0] : &outputScale[c]; - arg.output_shift = broadcasted[static_cast(FQ_add_input_type::OUTPUT_SHIFT)] ? &outputShift[0] : &outputShift[c]; - - arg.src_step = is_blk_format ? (size_t) blk_size * src_type_size : (size_t) C * src_type_size; - arg.dst_step = is_blk_format ? (size_t) blk_size * dst_type_size : (size_t) C * dst_type_size; - arg.block_size = is_blk_format ? (size_t) blk_size : nstl::min(blk_size, C - c); + arg.crop_high = + broadcasted[static_cast(FQ_add_input_type::CROP_HIGH)] ? &cropHigh[0] : &cropHigh[c]; + arg.input_scale = + broadcasted[static_cast(FQ_add_input_type::INPUT_SCALE)] ? &inputScale[0] : &inputScale[c]; + arg.input_shift = + broadcasted[static_cast(FQ_add_input_type::INPUT_SHIFT)] ? &inputShift[0] : &inputShift[c]; + arg.output_scale = + broadcasted[static_cast(FQ_add_input_type::OUTPUT_SCALE)] ? &outputScale[0] : &outputScale[c]; + arg.output_shift = + broadcasted[static_cast(FQ_add_input_type::OUTPUT_SHIFT)] ? &outputShift[0] : &outputShift[c]; + + arg.src_step = is_blk_format ? (size_t)blk_size * src_type_size : (size_t)C * src_type_size; + arg.dst_step = is_blk_format ? (size_t)blk_size * dst_type_size : (size_t)C * dst_type_size; + arg.block_size = is_blk_format ? (size_t)blk_size : nstl::min(blk_size, C - c); arg.work_amount = (size_t)std::min(static_cast(batch_size), H * W - b * batch_size); (*pKernel)(&arg); @@ -1740,25 +1886,29 @@ void FakeQuantize::executeQuantization(const std::unique_ptr(FQ_add_input_type::CROP_LOW)] ? 
&cropLow[0] : &cropLow[c]; - arg.crop_high = broadcasted[static_cast(FQ_add_input_type::CROP_HIGH)] ? &cropHigh[0] : &cropHigh[c]; - arg.input_scale = broadcasted[static_cast(FQ_add_input_type::INPUT_SCALE)] ? &inputScale[0] : &inputScale[c]; - arg.input_shift = broadcasted[static_cast(FQ_add_input_type::INPUT_SHIFT)] ? &inputShift[0] : &inputShift[c]; - arg.output_scale = broadcasted[static_cast(FQ_add_input_type::OUTPUT_SCALE)] ? &outputScale[0] : &outputScale[c]; - arg.output_shift = broadcasted[static_cast(FQ_add_input_type::OUTPUT_SHIFT)] ? &outputShift[0] : &outputShift[c]; - - arg.src_step = is_blk_format ? (size_t) blk_size * src_type_size : (size_t) C * src_type_size; - arg.dst_step = is_blk_format ? (size_t) blk_size * dst_type_size : (size_t) C * dst_type_size; - arg.block_size = (is_blk_format && srcDims.size() != 2) ? (size_t) blk_size : nstl::min(blk_size, C - c); - arg.work_amount = (size_t) W; + arg.crop_high = + broadcasted[static_cast(FQ_add_input_type::CROP_HIGH)] ? &cropHigh[0] : &cropHigh[c]; + arg.input_scale = + broadcasted[static_cast(FQ_add_input_type::INPUT_SCALE)] ? &inputScale[0] : &inputScale[c]; + arg.input_shift = + broadcasted[static_cast(FQ_add_input_type::INPUT_SHIFT)] ? &inputShift[0] : &inputShift[c]; + arg.output_scale = + broadcasted[static_cast(FQ_add_input_type::OUTPUT_SCALE)] ? &outputScale[0] : &outputScale[c]; + arg.output_shift = + broadcasted[static_cast(FQ_add_input_type::OUTPUT_SHIFT)] ? &outputShift[0] : &outputShift[c]; + + arg.src_step = is_blk_format ? (size_t)blk_size * src_type_size : (size_t)C * src_type_size; + arg.dst_step = is_blk_format ? (size_t)blk_size * dst_type_size : (size_t)C * dst_type_size; + arg.block_size = (is_blk_format && srcDims.size() != 2) ? (size_t)blk_size : nstl::min(blk_size, C - c); + arg.work_amount = (size_t)W; (*pKernel)(&arg); }); @@ -1778,7 +1928,7 @@ void FakeQuantize::execute(dnnl::stream strm) { } } -void FakeQuantize::initializePostOpData(const VectorDims &dims, const size_t bufferAlignment, bool doRounding) { +void FakeQuantize::initializePostOpData(const VectorDims& dims, const size_t bufferAlignment, bool doRounding) { if (postOpDataVersion == parameterVersion) return; @@ -1789,11 +1939,15 @@ void FakeQuantize::initializePostOpData(const VectorDims &dims, const size_t buf binarizationOutputMask.resize(axisPaddedSize, 0); if (isInputLowBroadcasted) { - std::fill(binarizationThresholds.begin() + 1, binarizationThresholds.begin() + realAxisSize, binarizationThresholds[0]); + std::fill(binarizationThresholds.begin() + 1, + binarizationThresholds.begin() + realAxisSize, + binarizationThresholds[0]); std::fill(binarizationThresholds.begin() + realAxisSize, binarizationThresholds.end(), 0.f); } if (isOutputHighBroadcasted) { - std::fill(binarizationOutputMask.begin() + 1, binarizationOutputMask.begin() + realAxisSize, binarizationOutputMask[0]); + std::fill(binarizationOutputMask.begin() + 1, + binarizationOutputMask.begin() + realAxisSize, + binarizationOutputMask[0]); std::fill(binarizationThresholds.begin() + realAxisSize, binarizationThresholds.end(), 0.f); } } else { @@ -1803,7 +1957,7 @@ void FakeQuantize::initializePostOpData(const VectorDims &dims, const size_t buf postOpDataVersion = parameterVersion; } -void FakeQuantize::initializePostOpDataLegacy(const VectorDims &dims, const size_t bufferAlignment) { +void FakeQuantize::initializePostOpDataLegacy(const VectorDims& dims, const size_t bufferAlignment) { if (legacyPostOpDataVersion == parameterVersion) return; @@ -1815,11 +1969,15 @@ void 
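// Sketch (assumption, hypothetical helper) of the broadcast fills in initializePostOpData above:
// when a parameter arrives as a single scalar but the kernel expects one value per channel, the
// scalar is replicated across the real axis size and the padded tail is zero-filled.
#include <algorithm>
#include <vector>

static std::vector<float> broadcast_and_pad(float scalar, size_t real_size, size_t padded_size) {
    std::vector<float> out(padded_size, 0.f);                 // aligned tail stays zero
    std::fill(out.begin(), out.begin() + real_size, scalar);  // replicate the scalar value
    return out;
}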
FakeQuantize::initializePostOpDataLegacy(const VectorDims &dims, const size binarizationOutputMask.resize(axisPaddedSize, 0); if (isInputLowBroadcasted) { - std::fill(binarizationThresholds.begin() + 1, binarizationThresholds.begin() + realAxisSize, binarizationThresholds[0]); + std::fill(binarizationThresholds.begin() + 1, + binarizationThresholds.begin() + realAxisSize, + binarizationThresholds[0]); std::fill(binarizationThresholds.begin() + realAxisSize, binarizationThresholds.end(), 0.f); } if (isOutputHighBroadcasted) { - std::fill(binarizationOutputMask.begin() + 1, binarizationOutputMask.begin() + realAxisSize, binarizationOutputMask[0]); + std::fill(binarizationOutputMask.begin() + 1, + binarizationOutputMask.begin() + realAxisSize, + binarizationOutputMask[0]); std::fill(binarizationThresholds.begin() + realAxisSize, binarizationThresholds.end(), 0.f); } @@ -1839,7 +1997,10 @@ void FakeQuantize::initializePostOpDataLegacy(const VectorDims &dims, const size legacyPostOpDataVersion = parameterVersion; } -void FakeQuantize::appendMemory(const size_t dataSize, const void *data, MemoryPtr &memPtr, std::vector& postOpsMem) { +void FakeQuantize::appendMemory(const size_t dataSize, + const void* data, + MemoryPtr& memPtr, + std::vector& postOpsMem) { if (!memPtr) { DnnlBlockedMemoryDesc memoryDesc(ov::element::f32, {dataSize}); memPtr = std::make_shared(getEngine(), memoryDesc, data); @@ -1848,12 +2009,15 @@ void FakeQuantize::appendMemory(const size_t dataSize, const void *data, MemoryP } } -void FakeQuantize::appendMemory(const size_t dataSize, const void *data, MemoryPtr &memPtr, std::vector& postOpsMem) { +void FakeQuantize::appendMemory(const size_t dataSize, + const void* data, + MemoryPtr& memPtr, + std::vector& postOpsMem) { postOpsMem.push_back(data); } template -void FakeQuantize::appendPostOpsImpl(dnnl::post_ops& ops, const VectorDims &postOpDims, std::vector& postOpsMem) { +void FakeQuantize::appendPostOpsImpl(dnnl::post_ops& ops, const VectorDims& postOpDims, std::vector& postOpsMem) { // try to map fakeQuantizeNode using output scale & eltwise first // if failed, fallback to append_quantization() @@ -1865,21 +2029,40 @@ void FakeQuantize::appendPostOpsImpl(dnnl::post_ops& ops, const VectorDims &post initializePostOpDataLegacy(postOpDims, bufferAlignment); if (getAlgorithm() == Algorithm::FQBinarization) { - ops.append_binarization(dnnl::algorithm::binarization_depthwise, (const float*)&binarizationThresholds[0], (const float*)&binarizationOutputMask[0]); + ops.append_binarization(dnnl::algorithm::binarization_depthwise, + (const float*)&binarizationThresholds[0], + (const float*)&binarizationOutputMask[0]); } else { - dnnl::algorithm alg = getAlgorithm() == Algorithm::FQQuantization ? dnnl::algorithm::quantization_quantize : - dnnl::algorithm::quantization_quantize_dequantize; + dnnl::algorithm alg = getAlgorithm() == Algorithm::FQQuantization + ? 
dnnl::algorithm::quantization_quantize + : dnnl::algorithm::quantization_quantize_dequantize; - std::array per_channel = {cropLowSize > 1, cropHighSize > 1, inputScaleSize > 1, - inputShiftSize > 1, outputScaleSize > 1, outputShiftSize > 1}; + std::array per_channel = {cropLowSize > 1, + cropHighSize > 1, + inputScaleSize > 1, + inputShiftSize > 1, + outputScaleSize > 1, + outputShiftSize > 1}; std::array all_default = {false}; - all_default[0] = std::all_of(cropLow.cbegin(), cropLow.cend(), [](float val){ return val == 0.f; }); - all_default[1] = std::all_of(cropHigh.cbegin(), cropHigh.cend(), [](float val){ return val == 0.f; }); - all_default[2] = std::all_of(inputScale.cbegin(), inputScale.cend(), [](float val){ return val == 1.f; }); - all_default[3] = std::all_of(inputShift.cbegin(), inputShift.cend(), [](float val){ return val == 0.f; }); - all_default[4] = std::all_of(outputScale.cbegin(), outputScale.cend(), [](float val){ return val == 1.f; }); - all_default[5] = std::all_of(outputShift.cbegin(), outputShift.cend(), [](float val){ return val == 0.f; }); + all_default[0] = std::all_of(cropLow.cbegin(), cropLow.cend(), [](float val) { + return val == 0.f; + }); + all_default[1] = std::all_of(cropHigh.cbegin(), cropHigh.cend(), [](float val) { + return val == 0.f; + }); + all_default[2] = std::all_of(inputScale.cbegin(), inputScale.cend(), [](float val) { + return val == 1.f; + }); + all_default[3] = std::all_of(inputShift.cbegin(), inputShift.cend(), [](float val) { + return val == 0.f; + }); + all_default[4] = std::all_of(outputScale.cbegin(), outputScale.cend(), [](float val) { + return val == 1.f; + }); + all_default[5] = std::all_of(outputShift.cbegin(), outputShift.cend(), [](float val) { + return val == 0.f; + }); std::array offsets = {0}; offsets[1] = offsets[0] + cropLowSize; @@ -1894,7 +2077,9 @@ void FakeQuantize::appendPostOpsImpl(dnnl::post_ops& ops, const VectorDims &post } } -void FakeQuantize::appendPostOps(dnnl::post_ops& ops, const VectorDims &postOpDims, std::unordered_map& postOpsMem, +void FakeQuantize::appendPostOps(dnnl::post_ops& ops, + const VectorDims& postOpDims, + std::unordered_map& postOpsMem, const int channelAxis) { std::vector postOpsMemPtrs; appendPostOpsImpl(ops, postOpDims, postOpsMemPtrs); @@ -1906,7 +2091,9 @@ void FakeQuantize::appendPostOps(dnnl::post_ops& ops, const VectorDims &postOpDi } } -void FakeQuantize::appendPostOps(dnnl::post_ops& ops, const VectorDims &postOpDims, std::vector& postOpsMem, +void FakeQuantize::appendPostOps(dnnl::post_ops& ops, + const VectorDims& postOpDims, + std::vector& postOpsMem, const int channelAxis) { appendPostOpsImpl(ops, postOpDims, postOpsMem); } @@ -1957,7 +2144,7 @@ void FakeQuantize::updateOptimizedFormula(bool do_rounding) { // per-channel FQ. 
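// Sketch (assumption, hypothetical helper) of how the legacy quantization post-op lays out its
// six parameter arrays in one flat buffer: offsets[i] is the running prefix sum of the preceding
// sizes (crop_low, crop_high, input_scale, input_shift, output_scale, output_shift), as built above.
#include <array>
#include <cstddef>

static std::array<size_t, 6> make_offsets(const std::array<size_t, 6>& sizes) {
    std::array<size_t, 6> offsets{};
    for (size_t i = 1; i < offsets.size(); ++i)
        offsets[i] = offsets[i - 1] + sizes[i - 1];
    return offsets;
}

// make_offsets({8, 8, 8, 8, 1, 1}) -> {0, 8, 16, 24, 32, 33}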
if (isPerTensor(inputShift, inputShift[0], 0.00005f)) { f.ish.resize(OC); - for (auto & v : f.ish) + for (auto& v : f.ish) v = inputShift[0]; } else { f.ish = inputShift; @@ -2115,7 +2302,7 @@ bool FakeQuantize::appendAttrPostOps(DnnlPostOpsComposerLegacy& dnnlpoc, return true; } -FakeQuantize::FakeQuantizeJitExecutor::FakeQuantizeJitExecutor(const jit_quantize_params &_jqp) { +FakeQuantize::FakeQuantizeJitExecutor::FakeQuantizeJitExecutor(const jit_quantize_params& _jqp) { #if defined(OPENVINO_ARCH_X86_64) bool isBinarization = _jqp.op_type == Algorithm::FQBinarization; if (mayiuse(cpu::x64::avx512_core)) { @@ -2157,6 +2344,6 @@ bool FakeQuantize::created() const { return getType() == Type::FakeQuantize; } -} // namespace node -} // namespace intel_cpu -} // namespace ov +} // namespace node +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/fake_quantize.h b/src/plugins/intel_cpu/src/nodes/fake_quantize.h index 62aea6092451a6..af34c0b91a1a7a 100644 --- a/src/plugins/intel_cpu/src/nodes/fake_quantize.h +++ b/src/plugins/intel_cpu/src/nodes/fake_quantize.h @@ -4,25 +4,17 @@ #pragma once -#include "common/primitive_attr.hpp" -#include "node.h" - #include + +#include "common/primitive_attr.hpp" #include "dnnl_postops_composer_legacy.h" +#include "node.h" namespace ov { namespace intel_cpu { namespace node { -enum class FQ_add_input_type { - CROP_LOW, - CROP_HIGH, - INPUT_SCALE, - INPUT_SHIFT, - OUTPUT_SCALE, - OUTPUT_SHIFT, - INPUTS_SIZE -}; +enum class FQ_add_input_type { CROP_LOW, CROP_HIGH, INPUT_SCALE, INPUT_SHIFT, OUTPUT_SCALE, OUTPUT_SHIFT, INPUTS_SIZE }; struct jit_quantize_params { bool is_planar; @@ -33,8 +25,8 @@ struct jit_quantize_params { Algorithm op_type; - int c; // need only for binarization - std::bitset(FQ_add_input_type::INPUTS_SIZE)> broadcasted; // need only for quantization + int c; // need only for binarization + std::bitset(FQ_add_input_type::INPUTS_SIZE)> broadcasted; // need only for quantization }; struct jit_quantize_call_args { @@ -57,9 +49,9 @@ struct jit_quantize_call_args { }; struct jit_uni_quantize_kernel { - void (*ker_)(const jit_quantize_call_args *); + void (*ker_)(const jit_quantize_call_args*); - void operator()(const jit_quantize_call_args *args) { + void operator()(const jit_quantize_call_args* args) { assert(ker_); ker_(args); } @@ -82,58 +74,116 @@ class FakeQuantize : public Node { void execute(dnnl::stream strm) override; void executeDynamicImpl(dnnl::stream strm) override; - size_t getAxis() const { return axis; } + size_t getAxis() const { + return axis; + } - bool isBinarization() const { return getAlgorithm() == Algorithm::FQBinarization; } + bool isBinarization() const { + return getAlgorithm() == Algorithm::FQBinarization; + } bool needPrepareParams() const override; void prepareParams() override; void createPrimitive() override; - const float* getBinarizationTresholdsPtr() const { return &binarizationThresholds[0]; } - const float* getBinarizationOutputMaskPtr() const { return reinterpret_cast(&binarizationOutputMask[0]); } - size_t getBinarizationTresholdsSize() const { return binarizationThresholds.size(); } - size_t getBinarizationOutputMaskSize() const { return binarizationOutputMask.size(); } + const float* getBinarizationTresholdsPtr() const { + return &binarizationThresholds[0]; + } + const float* getBinarizationOutputMaskPtr() const { + return reinterpret_cast(&binarizationOutputMask[0]); + } + size_t getBinarizationTresholdsSize() const { + return binarizationThresholds.size(); + } + size_t 
getBinarizationOutputMaskSize() const { + return binarizationOutputMask.size(); + } - const std::vector& getCropLow() const { return cropLow; } - const std::vector& getCropHigh() const { return cropHigh; } - const std::vector& getInputScale() const { return inputScale; } - const std::vector& getInputShift() const { return inputShift; } - const std::vector& getOutputScale() const { return outputScale; } - const std::vector& getOutputShift() const { return outputShift; } - const size_t getLevels() const { return levels; } + const std::vector& getCropLow() const { + return cropLow; + } + const std::vector& getCropHigh() const { + return cropHigh; + } + const std::vector& getInputScale() const { + return inputScale; + } + const std::vector& getInputShift() const { + return inputShift; + } + const std::vector& getOutputScale() const { + return outputScale; + } + const std::vector& getOutputShift() const { + return outputShift; + } + const size_t getLevels() const { + return levels; + } void setCropLow(std::vector newCropLow) { - cropLow = std::move(newCropLow); cropLowSize = cropLow.size(); ++parameterVersion; + cropLow = std::move(newCropLow); + cropLowSize = cropLow.size(); + ++parameterVersion; } void setCropHigh(std::vector newCropHigh) { - cropHigh = std::move(newCropHigh); cropHighSize = cropHigh.size(); ++parameterVersion; + cropHigh = std::move(newCropHigh); + cropHighSize = cropHigh.size(); + ++parameterVersion; } void setInputScale(std::vector newInputScale) { - inputScale = std::move(newInputScale); inputScaleSize = inputScale.size(); ++parameterVersion; + inputScale = std::move(newInputScale); + inputScaleSize = inputScale.size(); + ++parameterVersion; } void setInputShift(std::vector newInputShift) { - inputShift = std::move(newInputShift); inputShiftSize = inputShift.size(); ++parameterVersion; + inputShift = std::move(newInputShift); + inputShiftSize = inputShift.size(); + ++parameterVersion; } void setOutputScale(std::vector newOutputScale) { - outputScale = std::move(newOutputScale); outputScaleSize = outputScale.size(); ++parameterVersion; + outputScale = std::move(newOutputScale); + outputScaleSize = outputScale.size(); + ++parameterVersion; } void setOutputShift(std::vector newOutputShift) { - outputShift = std::move(newOutputShift); outputShiftSize = outputShift.size(); ++parameterVersion; + outputShift = std::move(newOutputShift); + outputShiftSize = outputShift.size(); + ++parameterVersion; } - const std::vector& getFQScales() const { return fqScales; } + const std::vector& getFQScales() const { + return fqScales; + } - bool isInputLowBroadcast() const { return isInputLowBroadcasted; } - bool isInputHighBroadcast() const { return isInputHighBroadcasted; } - bool isOutputLowBroadcast() const { return isOutputLowBroadcasted; } - bool isOutputHighBroadcast() const { return isOutputHighBroadcasted; } + bool isInputLowBroadcast() const { + return isInputLowBroadcasted; + } + bool isInputHighBroadcast() const { + return isInputHighBroadcasted; + } + bool isOutputLowBroadcast() const { + return isOutputLowBroadcasted; + } + bool isOutputHighBroadcast() const { + return isOutputHighBroadcasted; + } - ov::element::Type getInputPrecision() const { return inputPrecision; } - ov::element::Type getOutputPrecision() const { return outputPrecision; } + ov::element::Type getInputPrecision() const { + return inputPrecision; + } + ov::element::Type getOutputPrecision() const { + return outputPrecision; + } - void appendPostOps(dnnl::post_ops& ops, const VectorDims &postOpDims, 
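// Sketch (assumption, hypothetical type) of the versioning idiom used by the setters above: every
// mutation bumps parameterVersion, and the post-op data builders early-return while their cached
// version still matches, so derived buffers are rebuilt only after an actual parameter change.
#include <cstddef>
#include <vector>

struct VersionedParams {
    std::vector<float> inputScale;
    size_t parameterVersion = 0;
    size_t postOpDataVersion = 0;

    void setInputScale(std::vector<float> v) {
        inputScale = std::move(v);
        ++parameterVersion;                    // invalidate derived post-op data
    }
    void initializePostOpData() {
        if (postOpDataVersion == parameterVersion)
            return;                            // nothing changed since the last build
        // ... rebuild post-op buffers from inputScale etc. ...
        postOpDataVersion = parameterVersion;
    }
};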
std::unordered_map& postOpsMem, const int channelAxis = 1) override; - void appendPostOps(dnnl::post_ops& ops, const VectorDims &postOpDims, std::vector& postOpsMem, const int channelAxis = 1) override; + void appendPostOps(dnnl::post_ops& ops, + const VectorDims& postOpDims, + std::unordered_map& postOpsMem, + const int channelAxis = 1) override; + void appendPostOps(dnnl::post_ops& ops, + const VectorDims& postOpDims, + std::vector& postOpsMem, + const int channelAxis = 1) override; bool appendAttrPostOps(DnnlPostOpsComposerLegacy& dnnlpoc, bool isLastPostOp, dnnl::memory::data_type outDataType, @@ -143,12 +193,14 @@ class FakeQuantize : public Node { static bool isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept; enum BroadcastingPolicy { - PerChannel, // all FQ operations are per channel - PerTensor, // all FQ operations are per tensor - Mixed, // some per channel, some per tensor + PerChannel, // all FQ operations are per channel + PerTensor, // all FQ operations are per tensor + Mixed, // some per channel, some per tensor }; - BroadcastingPolicy getBroadcastingPolicy() const { return broadcastingPolicy; } + BroadcastingPolicy getBroadcastingPolicy() const { + return broadcastingPolicy; + } MemoryPtr cropLowMemory; MemoryPtr cropHighMemory; @@ -165,22 +217,22 @@ class FakeQuantize : public Node { using executorPtr = std::shared_ptr; executorPtr execPtr = nullptr; struct FakeQuantizeJitExecutor : public FakeQuantizeExecutor { - FakeQuantizeJitExecutor(const jit_quantize_params &_jqp); + FakeQuantizeJitExecutor(const jit_quantize_params& _jqp); void exec(const FakeQuantize& node) override; std::unique_ptr pKernel; }; void init() override; std::vector getDataFormats() const; - void initializePostOpData(const VectorDims &postOpDims, const size_t bufferAlignment, bool doRounding); - void initializePostOpDataLegacy(const VectorDims &dims, const size_t bufferAlignment); + void initializePostOpData(const VectorDims& postOpDims, const size_t bufferAlignment, bool doRounding); + void initializePostOpDataLegacy(const VectorDims& dims, const size_t bufferAlignment); void executeReference(); - void executeBinarization(const std::unique_ptr &pKernel) const; - void executeQuantization(const std::unique_ptr &pKernel) const; + void executeBinarization(const std::unique_ptr& pKernel) const; + void executeQuantization(const std::unique_ptr& pKernel) const; - void appendMemory(const size_t dataSize, const void *data, MemoryPtr &memPtr, std::vector& postOpsMem); - void appendMemory(const size_t dataSize, const void *data, MemoryPtr &memPtr, std::vector& postOpsMem); + void appendMemory(const size_t dataSize, const void* data, MemoryPtr& memPtr, std::vector& postOpsMem); + void appendMemory(const size_t dataSize, const void* data, MemoryPtr& memPtr, std::vector& postOpsMem); template - void appendPostOpsImpl(dnnl::post_ops& ops, const VectorDims &postOpDims, std::vector& postOpsMem); + void appendPostOpsImpl(dnnl::post_ops& ops, const VectorDims& postOpDims, std::vector& postOpsMem); size_t levels = 0; @@ -273,6 +325,6 @@ class FakeQuantize : public Node { BroadcastingPolicy broadcastingPolicy; }; -} // namespace node -} // namespace intel_cpu -} // namespace ov +} // namespace node +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/fullyconnected.cpp b/src/plugins/intel_cpu/src/nodes/fullyconnected.cpp index 7f6ed99b1173d7..2df6c0ae7522cc 100644 --- a/src/plugins/intel_cpu/src/nodes/fullyconnected.cpp +++ 
b/src/plugins/intel_cpu/src/nodes/fullyconnected.cpp @@ -10,8 +10,10 @@ #include "common/cpu_convert.h" #include "common/cpu_memcpy.h" +#include "cpu_types.h" #include "dnnl_extension_utils.h" #include "executors/memory_arguments.hpp" +#include "fake_quantize.h" #include "graph_context.h" #include "input.h" #include "memory_desc/blocked_memory_desc.h" @@ -19,16 +21,19 @@ #include "memory_desc/cpu_memory_desc_utils.h" #include "nodes/executors/executor.hpp" #include "nodes/executors/fullyconnected_config.hpp" +#include "openvino/core/type.hpp" #include "openvino/core/type/element_type.hpp" #include "openvino/runtime/threading/cpu_message.hpp" +#include "ov_ops/fully_connected.hpp" +#include "ov_ops/fully_connected_compressed.hpp" +#include "ov_ops/fully_connected_quantized.hpp" +#include "ov_ops/fully_connected_quantized_legacy.hpp" #include "post_ops.hpp" #include "shape_inference/custom/fullyconnected.hpp" -#include "transformations/cpu_opset/common/op/fully_connected.hpp" +#include "transformations/utils/utils.hpp" #include "utils/debug_capabilities.h" #include "utils/general_utils.h" -#include "fake_quantize.h" - using namespace dnnl; using namespace ov::element; @@ -39,25 +44,77 @@ namespace node { bool FullyConnected::isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept { try { - const auto fc = std::dynamic_pointer_cast(op); - if (!fc) { - errorMessage = "Only legacy FullyConnected operation is supported"; + if (!ov::is_type(op) && + !ov::is_type(op) && + !ov::is_type(op)) { return false; } - if (fc->get_input_size() == 3 && - std::dynamic_pointer_cast(fc->get_input_node_shared_ptr(BIAS_ID)) == nullptr) { - errorMessage = "Only Constant operation on 'bias' input is supported"; + + if (ov::is_type(op)) { + if (!ov::op::util::is_on_constant_path(op->input_value(BIAS))) { + errorMessage = "Only Constant operation on 'bias' input is supported"; + return false; + } + } + + if (ov::is_type(op)) { + if (!ov::op::util::is_on_constant_path(op->input_value(WEIGHT_SCALES)) || + !ov::op::util::is_on_constant_path(op->input_value(WEIGHT_ZERO_POINTS))) { + errorMessage = + "Only Constant operation on 'weight scales', and 'weight zero points' inputs is supported"; + return false; + } + } + } catch (...) { + return false; + } + + return true; +} + +// @todo replace 'inferencePrecision' check with 'fc->get_input_element_type(0) == ov::element::bf16' +// after bf16 pipeline is moved to ConvertPrecision +bool FullyConnected::isSupportedCompressedOperation(const std::shared_ptr& op, + size_t IC, + size_t OC, + size_t G, + ov::element::Type inferencePrecision) noexcept { +#if defined(OPENVINO_ARCH_X86_64) + try { + std::string errorMessage; + if (!isSupportedOperation(op, errorMessage)) return false; + + if (!dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::avx2)) + return false; + + if (dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::avx512_core_amx) && + inferencePrecision == ov::element::bf16) { + // OneDNN AMX IP implementation has limited shapes support due to performance considerations. As a + // current solution conditions below are copied from OneDNN to make sure correct IP impl will be + // used since fallback one doesn't support weights decompression feature. 
+ size_t simdWidth = 16; + size_t vnniFactor = 2; + size_t maxSize = 512; + auto amxRow = vnniFactor * simdWidth; + + if ((IC <= amxRow && OC <= amxRow) || (IC <= maxSize && OC <= maxSize && IC % amxRow != 0)) { + return false; + } } - const auto weightRank = fc->get_input_partial_shape(WEIGHTS_ID).size(); - if (weightRank != 2) { - errorMessage = "Doesn't support 'weight' input with rank: " + std::to_string(weightRank); + + if (IC % G != 0 || IC / G < 4 || OC == 1) { return false; } + + return true; } catch (...) { return false; } return true; +#else + return false; +#endif } void FullyConnected::initTensorParallelConfig(const GraphContext::CPtr context) { @@ -66,7 +123,7 @@ void FullyConnected::initTensorParallelConfig(const GraphContext::CPtr context) // init tp_cfg.w_rank and tp_cfg.w_size tp_cfg.w_rank = context->getCPUStreamExecutor()->get_rank()[0]; tp_cfg.w_size = ov::threading::message_manager()->get_num_sub_streams(); - tp_cfg.enable_tensor_parallel = tp_cfg.w_size > 1 ? true : false; + tp_cfg.enable_tensor_parallel = tp_cfg.w_size > 1; tp_cfg.sub_memory = context->getSubMemory(); } } @@ -79,6 +136,30 @@ FullyConnected::FullyConnected(const std::shared_ptr& op, const GraphC initTensorParallelConfig(context); if (!isSupportedOperation(op, errorMessage)) OPENVINO_THROW_NOT_IMPLEMENTED(errorMessage); + + m_atoi[ARG_SRC] = DATA; + m_atoi[ARG_WEI] = WEIGHTS; + m_atoi[ARG_BIAS] = BIAS; + + auto mapArgToInput = [&op](std::unordered_map& argToInput, size_t argId, size_t inputId) { + if (op->get_input_size() > inputId && op->input(inputId).get_element_type() != ov::element::undefined) { + argToInput[argId] = inputId; + } + }; + + if (ov::is_type(op)) { + mapArgToInput(m_atoi, ARG_WEI | ARG_ATTR_SCALES, WEIGHT_SCALES); + mapArgToInput(m_atoi, ARG_WEI | ARG_ATTR_ZERO_POINTS, WEIGHT_ZERO_POINTS); + algorithm = Algorithm::FullyConnectedCompressed; + } else if (ov::is_type(op)) { + mapArgToInput(m_atoi, ARG_DST_DEQ_SCALE, 3); + algorithm = Algorithm::FullyConnectedQuantizedLegacy; + } else if (ov::is_type(op)) { + algorithm = Algorithm::FullyConnectedQuantized; + OPENVINO_THROW_NOT_IMPLEMENTED("FullyConnectedQuantized is not implemented yet"); + } else { + algorithm = Algorithm::FullyConnectedCommon; + } } bool FullyConnected::canBeExecutedInInt8() const { @@ -108,7 +189,8 @@ void FullyConnected::needPrepareParamsForTensorParallel() { dim += dims.size(); } OPENVINO_ASSERT(static_cast(dims[dim]) >= tp_cfg.w_size, - getName() + " dim[" + std::to_string(dim) + "] is " + std::to_string(dims[dim]) + ", which is larger than w_size " + std::to_string(tp_cfg.w_size)); + getName() + " dim[" + std::to_string(dim) + "] is " + std::to_string(dims[dim]) + + ", which is larger than w_size " + std::to_string(tp_cfg.w_size)); auto splited_dim_vec = split_parts(dims[dim], tp_cfg.w_size); VectorDims new_dims = std::move(dims); @@ -119,22 +201,18 @@ void FullyConnected::needPrepareParamsForTensorParallel() { } } -ExecutorPtr FullyConnected::createExecutor() { - const auto& executor = factory->make(memory); - getSelectedPrimitiveDescriptor()->setImplementationType(executor->implType()); - - return executor; -} - void FullyConnected::prepareParams() { needPrepareParamsForTensorParallel(); - executor = createExecutor(); + + executor->update(memory); + // @todo avoid updating implementation type in scope of every prepareParams call + getSelectedPrimitiveDescriptor()->setImplementationType(executor->implType()); } void FullyConnected::initTensorParallelSync() { if (tp_cfg.enable_tensor_parallel) { tp_cfg.id = 
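// Sketch (assumption, hypothetical helper) restating the shape checks above as a standalone
// predicate: with bf16 inference on AMX platforms, small or misaligned IC/OC shapes would fall
// back to an implementation without weights decompression, so compression is rejected for them;
// grouped decompression additionally needs IC divisible by G, groups of at least 4, and OC > 1.
#include <cstddef>

static bool compressed_fc_shape_ok(size_t IC, size_t OC, size_t G, bool bf16_on_amx) {
    const size_t simdWidth = 16, vnniFactor = 2, maxSize = 512;
    const size_t amxRow = vnniFactor * simdWidth;  // 32
    if (bf16_on_amx) {
        if ((IC <= amxRow && OC <= amxRow) || (IC <= maxSize && OC <= maxSize && IC % amxRow != 0))
            return false;
    }
    if (IC % G != 0 || IC / G < 4 || OC == 1)
        return false;
    return true;
}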
tp_cfg.sub_memory->get_memory_id(tp_cfg.w_rank); - OPENVINO_ASSERT(tp_cfg.id > 0, "Tensor Parallel Config ID cannot be negative."); + OPENVINO_ASSERT(tp_cfg.id >= 0, "Tensor Parallel Config ID cannot be negative."); tp_cfg.sub_memory->set_memory_used(tp_cfg.id, tp_cfg.w_rank); while (true) { std::lock_guard lock(tp_cfg.sub_memory->_flagMutex); @@ -191,18 +269,34 @@ void FullyConnected::execTensorParallelSync() { for (int idx = 0; idx < tp_cfg.w_size; idx++) { if (wait_list[idx] > 0 && tp_cfg.sub_memory->_memorys_table[tp_cfg.id][idx].flag) { auto new_ptr = static_cast(tp_cfg.sub_memory->_memorys_table[tp_cfg.id][idx].send_buf); - const auto copySize = splited_dim_vec[idx] * prec.size(); // bytes of half selected dim. + const auto copySize = splited_dim_vec[idx] * prec.size(); // bytes of half selected dim. const size_t unloop = 8; size_t step = count / unloop; - parallel_for(step, [&](size_t i){ - cpu_memcpy(dst_ptr + idx * strideSize + (i * unloop) * channel_size, new_ptr + (i * unloop) * copySize, copySize); - cpu_memcpy(dst_ptr + idx * strideSize + (i * unloop + 1) * channel_size, new_ptr + (i * unloop + 1) * copySize, copySize); - cpu_memcpy(dst_ptr + idx * strideSize + (i * unloop + 2) * channel_size, new_ptr + (i * unloop + 2) * copySize, copySize); - cpu_memcpy(dst_ptr + idx * strideSize + (i * unloop + 3) * channel_size, new_ptr + (i * unloop + 3) * copySize, copySize); - cpu_memcpy(dst_ptr + idx * strideSize + (i * unloop + 4) * channel_size, new_ptr + (i * unloop + 4) * copySize, copySize); - cpu_memcpy(dst_ptr + idx * strideSize + (i * unloop + 5) * channel_size, new_ptr + (i * unloop + 5) * copySize, copySize); - cpu_memcpy(dst_ptr + idx * strideSize + (i * unloop + 6) * channel_size, new_ptr + (i * unloop + 6) * copySize, copySize); - cpu_memcpy(dst_ptr + idx * strideSize + (i * unloop + 7) * channel_size, new_ptr + (i * unloop + 7) * copySize, copySize); + parallel_for(step, [&](size_t i) { + cpu_memcpy(dst_ptr + idx * strideSize + (i * unloop) * channel_size, + new_ptr + (i * unloop) * copySize, + copySize); + cpu_memcpy(dst_ptr + idx * strideSize + (i * unloop + 1) * channel_size, + new_ptr + (i * unloop + 1) * copySize, + copySize); + cpu_memcpy(dst_ptr + idx * strideSize + (i * unloop + 2) * channel_size, + new_ptr + (i * unloop + 2) * copySize, + copySize); + cpu_memcpy(dst_ptr + idx * strideSize + (i * unloop + 3) * channel_size, + new_ptr + (i * unloop + 3) * copySize, + copySize); + cpu_memcpy(dst_ptr + idx * strideSize + (i * unloop + 4) * channel_size, + new_ptr + (i * unloop + 4) * copySize, + copySize); + cpu_memcpy(dst_ptr + idx * strideSize + (i * unloop + 5) * channel_size, + new_ptr + (i * unloop + 5) * copySize, + copySize); + cpu_memcpy(dst_ptr + idx * strideSize + (i * unloop + 6) * channel_size, + new_ptr + (i * unloop + 6) * copySize, + copySize); + cpu_memcpy(dst_ptr + idx * strideSize + (i * unloop + 7) * channel_size, + new_ptr + (i * unloop + 7) * copySize, + copySize); }); size_t tail = count & ~(unloop - 1); for (size_t i = tail; i < count; ++i) { @@ -224,6 +318,7 @@ void FullyConnected::execTensorParallelSync() { } } } + void FullyConnected::execute(dnnl::stream strm) { initTensorParallelSync(); @@ -370,31 +465,11 @@ static bool useSparseWeightsDecompression(const NodePtr& weightsInput, return sparseRate >= minSparseRate; } -void FullyConnected::needUpdateDQScaleForTensorParallel(std::vector& dequantizationScales) { - if (tp_cfg.enable_tensor_parallel) { - auto split_parts = [](int len, int n) { - int average = len / n; - std::vector parts(n, 
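// Sketch (assumption, hypothetical helper) of the copy pattern in execTensorParallelSync above:
// rows are copied in groups of 8 inside a parallel loop, and the remainder rows (count rounded
// down to a multiple of 8) are finished by a scalar tail loop.
#include <cstddef>
#include <cstdint>
#include <cstring>

static void strided_copy_unrolled(uint8_t* dst, const uint8_t* src, size_t count,
                                  size_t dst_stride, size_t src_stride, size_t copy_size) {
    const size_t unloop = 8;
    const size_t step = count / unloop;
    for (size_t i = 0; i < step; ++i)                        // parallel_for in the real code
        for (size_t j = 0; j < unloop; ++j)
            std::memcpy(dst + (i * unloop + j) * dst_stride,
                        src + (i * unloop + j) * src_stride, copy_size);
    for (size_t i = count & ~(unloop - 1); i < count; ++i)   // tail rows
        std::memcpy(dst + i * dst_stride, src + i * src_stride, copy_size);
}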
average); - parts.back() = len - average * (n - 1); - return parts; - }; - auto DQScales = getDQScales(); - auto split_lens = split_parts(DQScales.size(), tp_cfg.w_size); - auto split_offset = tp_cfg.w_rank * split_lens[0]; - std::vector newDQScales(split_lens[tp_cfg.w_rank]); - std::copy(DQScales.begin() + split_offset, DQScales.begin() + split_offset + split_lens[tp_cfg.w_rank], newDQScales.begin()); - dequantizationScales = std::move(newDQScales); - } -} - void FullyConnected::initSupportedPrimitiveDescriptors() { - attrs.withBias = getOriginalInputsNumber() == 3; - - attrs.dequantizationScales = getDQScales(); - needUpdateDQScaleForTensorParallel(attrs.dequantizationScales); + attrs.withBias = getOriginalInputPrecisionAtPort(BIAS) != ov::element::undefined; - attrs.sparseWeights = useSparseWeightsDecompression(getParentEdgeAt(WEIGHTS_ID)->getParent(), - getOriginalInputPrecisionAtPort(DATA_ID), + attrs.sparseWeights = useSparseWeightsDecompression(getParentEdgeAt(WEIGHTS)->getParent(), + getOriginalInputPrecisionAtPort(DATA), context->getConfig().fcSparseWeiDecompressionRate); attrs.dynamicQuantizationGroupSize = context->getConfig().fcDynamicQuantizationGroupSize; attrs.modelType = context->getConfig().modelType; @@ -410,6 +485,10 @@ void FullyConnected::initSupportedPrimitiveDescriptors() { VecMemoryDescs srcDescs; const auto& creatorsMap = BlockedDescCreator::getCommonCreators(); for (size_t i = 0; i < srcTypes.size(); i++) { + if (srcTypes[i] == element::undefined) { + srcDescs.push_back(MemoryDescUtils::makeEmptyDesc()); + continue; + } const auto srcDesc = creatorsMap.at(LayoutType::ncsp)->createSharedDesc(srcTypes[i], getInputShapeAtPort(i)); srcDescs.push_back(srcDesc); } @@ -421,23 +500,31 @@ void FullyConnected::initSupportedPrimitiveDescriptors() { } MemoryDescArgs descs{ - {ARG_SRC, srcDescs[0]}, - {ARG_WEI, srcDescs[1]}, - {ARG_BIAS, attrs.withBias ? srcDescs[2] : MemoryDescUtils::makeEmptyDesc()}, + {ARG_SRC, srcDescs[DATA]}, + {ARG_WEI, srcDescs[WEIGHTS]}, + {ARG_BIAS, srcDescs[BIAS]}, {ARG_DST, dstDescs[0]}, }; - needUpdateScaleForTensorParallel(); - needUpdateZeroPointForTensorParallel(); - auto executionContext = std::make_shared(context, getImplPriority(), privateWeightCache); - factory = std::make_shared>(attrs, postOps, executionContext, descs); + factory = std::make_shared>(attrs, postOps, executionContext, descs); const auto nodeDescriptors = factory->getProperMemoryDescriptors(descs); NodeConfig nodeConfig; - nodeConfig.inConfs.emplace_back(nodeDescriptors.at(ARG_SRC)); - nodeConfig.inConfs.emplace_back(nodeDescriptors.at(ARG_WEI)); - if (attrs.withBias) nodeConfig.inConfs.emplace_back(nodeDescriptors.at(ARG_BIAS)); + nodeConfig.inConfs.resize(srcDescs.size()); + + for (const auto& desc : nodeDescriptors) { + if (m_atoi.count(desc.first)) { + nodeConfig.inConfs[m_atoi[desc.first]] = desc.second; + } + } + + // add extra inputs bypassing proper memory descriptors + // @todo pass all the input descriptors to getProperMemoryDescriptors and allow + // to ignore extra input descriptors if necessery + for (size_t i = 3; i < srcDescs.size(); i++) { + nodeConfig.inConfs[i] = srcDescs[i]; + } const int inPlace = canBeInPlace() ? 
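// Sketch (assumption) of the split_parts helper used by the tensor-parallel paths above: a
// dimension of length len is divided across n ranks, each rank getting len / n elements and
// the last rank absorbing the remainder.
#include <vector>

static std::vector<int> split_parts_sketch(int len, int n) {
    int average = len / n;
    std::vector<int> parts(n, average);
    parts.back() = len - average * (n - 1);  // last rank takes the leftover
    return parts;
}

// split_parts_sketch(10, 3) -> {3, 3, 4}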
0 : -1; nodeConfig.outConfs.emplace_back(nodeDescriptors.at(ARG_DST), BlockedMemoryDesc::FULL_MASK, inPlace); @@ -447,19 +534,21 @@ void FullyConnected::initSupportedPrimitiveDescriptors() { void FullyConnected::needSplitMemoryForTensorParallel() { if (tp_cfg.enable_tensor_parallel) { - auto src = getSrcMemoryAtPort(DATA_ID); - auto wgt = getSrcMemoryAtPort(WEIGHTS_ID); + auto src = getSrcMemoryAtPort(DATA); + auto wgt = getSrcMemoryAtPort(WEIGHTS); auto dst = getDstMemoryAtPort(0); // src - memory[ARG_SRC] = getSrcMemoryAtPort(DATA_ID); + memory[ARG_SRC] = getSrcMemoryAtPort(DATA); // wgt // split N direction - tp_cfg.cached_splited_weight = attrs.weightsNonTransposed ? split_vertical(context->getEngine(), std::move(wgt), 0, tp_cfg.w_rank, tp_cfg.w_size) - : split_horizontal(context->getEngine(), std::move(wgt), 0, tp_cfg.w_rank, tp_cfg.w_size); + tp_cfg.cached_splited_weight = + attrs.weightsNonTransposed + ? split_vertical(context->getEngine(), std::move(wgt), 0, tp_cfg.w_rank, tp_cfg.w_size) + : split_horizontal(context->getEngine(), std::move(wgt), 0, tp_cfg.w_rank, tp_cfg.w_size); memory[ARG_WEI] = tp_cfg.cached_splited_weight; // bias if (attrs.withBias) { - auto bias = getSrcMemoryAtPort(BIAS_ID); + auto bias = getSrcMemoryAtPort(BIAS); auto select_bias = split_horizontal(context->getEngine(), std::move(bias), 0, tp_cfg.w_rank, tp_cfg.w_size); tp_cfg.cached_splited_bias = std::move(select_bias); } else { @@ -468,7 +557,28 @@ void FullyConnected::needSplitMemoryForTensorParallel() { memory[ARG_BIAS] = tp_cfg.cached_splited_bias; // dst memory[ARG_DST] = getDstMemoryAtPort(0); - tp_cfg.cached_dst = split_horizontal(context->getEngine(), std::move(dst), -1, tp_cfg.w_rank, tp_cfg.w_size, false); + tp_cfg.cached_dst = + split_horizontal(context->getEngine(), std::move(dst), -1, tp_cfg.w_rank, tp_cfg.w_size, false); + + memory[ARG_DST | ARG_ATTR_SCALES] = + split_horizontal(context->getEngine(), memory[ARG_DST | ARG_ATTR_SCALES], 0, tp_cfg.w_rank, tp_cfg.w_size); + + auto scale_mem = std::const_pointer_cast(memory[ARG_WEI | ARG_ATTR_SCALES]); + memory[ARG_WEI | ARG_ATTR_SCALES] = + attrs.weightsNonTransposed + ? split_vertical(context->getEngine(), scale_mem, 0, tp_cfg.w_rank, tp_cfg.w_size) + : split_horizontal(context->getEngine(), scale_mem, 0, tp_cfg.w_rank, tp_cfg.w_size); + + auto zeropoint_mem = std::const_pointer_cast(memory[ARG_WEI | ARG_ATTR_ZERO_POINTS]); + auto element_num = zeropoint_mem->getSize() / zeropoint_mem->getPrecision().size(); + if (element_num == 1) { + tp_cfg.cached_zeropoint = zeropoint_mem; + } else { + tp_cfg.cached_zeropoint = + attrs.weightsNonTransposed + ? split_vertical(context->getEngine(), zeropoint_mem, 0, tp_cfg.w_rank, tp_cfg.w_size) + : split_horizontal(context->getEngine(), zeropoint_mem, 0, tp_cfg.w_rank, tp_cfg.w_size); + } } } @@ -477,7 +587,7 @@ void FullyConnected::needUpdateTensorParalelConfig() { // 1. weight shape is dynamic // 2. last dim can be splited. if (tp_cfg.enable_tensor_parallel) { - auto& shape = getSrcMemoryAtPort(WEIGHTS_ID)->getShape(); + auto& shape = getSrcMemoryAtPort(WEIGHTS)->getShape(); if (shape.isDynamic()) { tp_cfg.enable_tensor_parallel = false; } else if (shape.getDims()[0] < static_cast(tp_cfg.w_size)) { @@ -485,18 +595,22 @@ void FullyConnected::needUpdateTensorParalelConfig() { } } } + void FullyConnected::createPrimitive() { needUpdateTensorParalelConfig(); - memory[ARG_SRC] = getSrcMemoryAtPort(DATA_ID); - memory[ARG_WEI] = getSrcMemoryAtPort(WEIGHTS_ID); - memory[ARG_BIAS] = attrs.withBias ? 
getSrcMemoryAtPort(BIAS_ID) : MemoryDescUtils::makeEmptyMemory(context); + for (const auto& entry : m_atoi) { + const auto argumentId = entry.first; + const auto inputId = entry.second; + memory[argumentId] = getSrcMemoryAtPort(inputId); + } + memory[ARG_DST] = getDstMemoryAtPort(0); needSplitMemoryForTensorParallel(); // @todo should we preconfigure only for dynamic shapes? // Since for static shapes primitive is created in scope of compile_model() anyway - factory->preconfigure(memory); + executor = factory->make(memory); Node::createPrimitive(); } @@ -517,49 +631,6 @@ ov::element::Type FullyConnected::getRuntimePrecision() const { return getMaxPrecision(srcTypes); } -void FullyConnected::needUpdateScaleForTensorParallel() { - if (tp_cfg.enable_tensor_parallel && tp_cfg.cached_scale) { - attrs.decompressionMultiplyPtr = tp_cfg.cached_scale; - } -} - -void FullyConnected::needSplitScaleForTensorParallel(const MemoryCPtr& memory) { - if (tp_cfg.enable_tensor_parallel && !tp_cfg.cached_scale) { - auto scale_mem = std::const_pointer_cast(memory); - tp_cfg.cached_scale = attrs.weightsNonTransposed ? split_vertical(context->getEngine(), std::move(scale_mem), 0, tp_cfg.w_rank, tp_cfg.w_size) - : split_horizontal(context->getEngine(), std::move(scale_mem), 0, tp_cfg.w_rank, tp_cfg.w_size); - } -} - -void FullyConnected::needUpdateZeroPointForTensorParallel() { - if (tp_cfg.enable_tensor_parallel && tp_cfg.cached_zeropoint) { - attrs.decompressionSubtractPtr = tp_cfg.cached_zeropoint; - } -} - -void FullyConnected::needSplitZeroPointForTensorParallel(const MemoryCPtr& memory) { - if (tp_cfg.enable_tensor_parallel && !tp_cfg.cached_zeropoint) { - auto zeropoint_mem = std::const_pointer_cast(memory); - auto element_num = memory->getSize() / memory->getPrecision().size(); - if (element_num == 1) { - tp_cfg.cached_zeropoint = std::move(zeropoint_mem); - } else { - tp_cfg.cached_zeropoint = attrs.weightsNonTransposed ? 
split_vertical(context->getEngine(), zeropoint_mem, 0, tp_cfg.w_rank, tp_cfg.w_size) - : split_horizontal(context->getEngine(), zeropoint_mem, 0, tp_cfg.w_rank, tp_cfg.w_size); - } - } -} - -void FullyConnected::fuseDecompressionMultiply(const MemoryCPtr& memory) { - attrs.decompressionMultiplyPtr = memory; - needSplitScaleForTensorParallel(memory); -} - -void FullyConnected::fuseDecompressionSubtract(const MemoryCPtr& memory) { - attrs.decompressionSubtractPtr = memory; - needSplitZeroPointForTensorParallel(memory); -} - } // namespace node } // namespace intel_cpu } // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/fullyconnected.h b/src/plugins/intel_cpu/src/nodes/fullyconnected.h index be29342b851988..0b50d882c9e554 100644 --- a/src/plugins/intel_cpu/src/nodes/fullyconnected.h +++ b/src/plugins/intel_cpu/src/nodes/fullyconnected.h @@ -6,17 +6,18 @@ #include +#include #include #include #include +#include #include #include "cpu_memory.h" #include "nodes/executors/executor_factory.hpp" -#include "nodes/executors/memory_arguments.hpp" #include "nodes/executors/fullyconnected_config.hpp" +#include "nodes/executors/memory_arguments.hpp" #include "post_ops.hpp" -#include "openvino/runtime/threading/cpu_message.hpp" namespace ov { namespace intel_cpu { @@ -66,6 +67,15 @@ class FullyConnected : public Node { bool canFuse(const NodePtr& node) const override; static bool isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept; + static bool isSupportedCompressedOperation(const std::shared_ptr& op, + size_t IC, + size_t OC, + size_t G, + ov::element::Type inferencePrecision) noexcept; + + bool isExecutable() const override { + return !isInputTensorAtPortEmpty(0); + } void prepareParams() override; void executeDynamicImpl(dnnl::stream strm) override; @@ -81,11 +91,22 @@ class FullyConnected : public Node { void toNumaNodeImpl(int numaID) override; private: - static const size_t DATA_ID = 0; - static const size_t WEIGHTS_ID = 1; - static const size_t BIAS_ID = 2; + enum InputId : size_t { + DATA = 0, + WEIGHTS, + BIAS, + WEIGHT_SCALES, + WEIGHT_ZERO_POINTS, + INPUT_SCALES, + INPUT_ZERO_POINTS, + OUTPUT_SCALES, + OUTPUT_ZERO_POINTS, + }; + + static bool isConstantInput(const std::shared_ptr& op, InputId port); + + std::unordered_map m_atoi; // memory argument id to input id - ExecutorPtr createExecutor(); void fuseDecompressionConstant(const MemoryCPtr& memory, MemoryCPtr& decompressionValuesPtr); void initTensorParallelConfig(const GraphContext::CPtr context); @@ -94,16 +115,11 @@ class FullyConnected : public Node { void initTensorParallelSync(); void execTensorParallelSync(); void needSplitMemoryForTensorParallel(); - void needSplitScaleForTensorParallel(const MemoryCPtr& memory); - void needUpdateScaleForTensorParallel(); - void needSplitZeroPointForTensorParallel(const MemoryCPtr& memory); - void needUpdateZeroPointForTensorParallel(); - void needUpdateDQScaleForTensorParallel(std::vector& dequantizationScales); FCAttrs attrs; PostOps postOps; MemoryArgs memory; - ExecutorFactoryPtr factory; + ExecutorFactoryPtr factory; ExecutorPtr executor = nullptr; std::string errorPrefix; diff --git a/src/plugins/intel_cpu/src/nodes/gather.h b/src/plugins/intel_cpu/src/nodes/gather.h index 6ee097e9a1fbab..c20a56807b0165 100644 --- a/src/plugins/intel_cpu/src/nodes/gather.h +++ b/src/plugins/intel_cpu/src/nodes/gather.h @@ -5,12 +5,13 @@ #pragma once #include -#include "kernels/x64/gather_uni_kernel.hpp" #include #include #include +#include 
"kernels/x64/gather_uni_kernel.hpp" + namespace ov { namespace intel_cpu { namespace node { @@ -19,7 +20,7 @@ class Gather : public Node { public: Gather(const std::shared_ptr& op, const GraphContext::CPtr context); - void getSupportedDescriptors() override {}; + void getSupportedDescriptors() override{}; void initSupportedPrimitiveDescriptors() override; void createPrimitive() override; void execute(dnnl::stream strm) override; @@ -115,6 +116,6 @@ class Gather : public Node { std::shared_ptr jitKernel; }; -} // namespace node -} // namespace intel_cpu -} // namespace ov +} // namespace node +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/gather_elements.cpp b/src/plugins/intel_cpu/src/nodes/gather_elements.cpp index 8653bda8c483d3..d8f221dcebf34d 100644 --- a/src/plugins/intel_cpu/src/nodes/gather_elements.cpp +++ b/src/plugins/intel_cpu/src/nodes/gather_elements.cpp @@ -2,23 +2,25 @@ // SPDX-License-Identifier: Apache-2.0 // +#include "gather_elements.h" + #include -#include #include +#include + +#include "common/cpu_memcpy.h" #include "openvino/core/parallel.hpp" -#include "gather_elements.h" #include "openvino/opsets/opset1.hpp" #include "utils/general_utils.h" -#include "common/cpu_memcpy.h" namespace ov { namespace intel_cpu { namespace node { -bool GatherElements::isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept { +bool GatherElements::isSupportedOperation(const std::shared_ptr& op, + std::string& errorMessage) noexcept { try { - if (!one_of(op->get_type_info(), - ov::op::v6::GatherElements::get_type_info_static())) { + if (!one_of(op->get_type_info(), ov::op::v6::GatherElements::get_type_info_static())) { errorMessage = "Node is not an instance of the GatherElements operation from operation set v6."; return false; } @@ -88,8 +90,7 @@ void GatherElements::initSupportedPrimitiveDescriptors() { dataTypeSize_ = inDataPrecision.size(); - addSupportedPrimDesc({{LayoutType::ncsp, inDataPrecision}, - {LayoutType::ncsp, ov::element::i32}}, + addSupportedPrimDesc({{LayoutType::ncsp, inDataPrecision}, {LayoutType::ncsp, ov::element::i32}}, {{LayoutType::ncsp, inDataPrecision}}, impl_desc_type::ref_any); } @@ -100,9 +101,9 @@ void GatherElements::executeDynamicImpl(dnnl::stream strm) { template void GatherElements::directExecution() { - const auto *srcData = getSrcDataAtPortAs(dataIndex_); - const auto *indices = getSrcDataAtPortAs(indicesIndex_); - auto *dstData = getDstDataAtPortAs(0); + const auto* srcData = getSrcDataAtPortAs(dataIndex_); + const auto* indices = getSrcDataAtPortAs(indicesIndex_); + auto* dstData = getDstDataAtPortAs(0); const int outSize = getChildEdgeAt(0)->getMemory().getShape().getElementsCount(); auto threadBody = [&](const int ithr, const int nthr) { @@ -133,14 +134,14 @@ void GatherElements::directExecution() { void GatherElements::execute(dnnl::stream strm) { switch (dataTypeSize_) { - case sizeof(element_type_traits::value_type): - return directExecution::value_type>(); - case sizeof(element_type_traits::value_type): - return directExecution::value_type>(); - case sizeof(element_type_traits::value_type): - return directExecution::value_type>(); - default: - OPENVINO_THROW("Unsupported data type size"); + case sizeof(element_type_traits::value_type): + return directExecution::value_type>(); + case sizeof(element_type_traits::value_type): + return directExecution::value_type>(); + case sizeof(element_type_traits::value_type): + return directExecution::value_type>(); + default: + 
OPENVINO_THROW("Unsupported data type size"); } } @@ -148,6 +149,6 @@ bool GatherElements::created() const { return getType() == Type::GatherElements; } -} // namespace node -} // namespace intel_cpu -} // namespace ov +} // namespace node +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/gather_elements.h b/src/plugins/intel_cpu/src/nodes/gather_elements.h index 3c2282401f7431..b050cd4e523490 100644 --- a/src/plugins/intel_cpu/src/nodes/gather_elements.h +++ b/src/plugins/intel_cpu/src/nodes/gather_elements.h @@ -14,7 +14,7 @@ class GatherElements : public Node { public: GatherElements(const std::shared_ptr& op, const GraphContext::CPtr context); - void getSupportedDescriptors() override {}; + void getSupportedDescriptors() override{}; void initSupportedPrimitiveDescriptors() override; void execute(dnnl::stream strm) override; bool created() const override; @@ -40,6 +40,6 @@ class GatherElements : public Node { void directExecution(); }; -} // namespace node -} // namespace intel_cpu -} // namespace ov +} // namespace node +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/gather_nd.cpp b/src/plugins/intel_cpu/src/nodes/gather_nd.cpp index 8c81f9b770a687..e962839e571663 100644 --- a/src/plugins/intel_cpu/src/nodes/gather_nd.cpp +++ b/src/plugins/intel_cpu/src/nodes/gather_nd.cpp @@ -2,15 +2,17 @@ // SPDX-License-Identifier: Apache-2.0 // +#include "gather_nd.h" + #include -#include +#include #include +#include + +#include "common/cpu_memcpy.h" #include "dnnl_types.h" #include "openvino/core/parallel.hpp" -#include "gather_nd.h" -#include #include "utils/general_utils.h" -#include "common/cpu_memcpy.h" #define THROW_ERROR(...) OPENVINO_THROW("GatherND layer with name '", getName(), "' ", __VA_ARGS__) @@ -20,7 +22,9 @@ namespace node { bool GatherND::isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept { try { - if (!one_of(op->get_type_info(), ov::op::v5::GatherND::get_type_info_static(), ov::op::v8::GatherND::get_type_info_static())) { + if (!one_of(op->get_type_info(), + ov::op::v5::GatherND::get_type_info_static(), + ov::op::v8::GatherND::get_type_info_static())) { errorMessage = "Node is not an instance of the GatherND operation from operation set v5 and v8."; return false; } @@ -70,12 +74,16 @@ void GatherND::initSupportedPrimitiveDescriptors() { ov::element::Type indicesPrecision = getOriginalInputPrecisionAtPort(GATHERND_INDEXES); if (!one_of(indicesPrecision, - ov::element::i32, ov::element::i64, ov::element::i16, ov::element::u16, ov::element::i8, ov::element::u8)) { + ov::element::i32, + ov::element::i64, + ov::element::i16, + ov::element::u16, + ov::element::i8, + ov::element::u8)) { THROW_ERROR("has unsupported 'indices' input precision: ", indicesPrecision); } - addSupportedPrimDesc({{LayoutType::ncsp, inDataPrecision}, - {LayoutType::ncsp, ov::element::i32}}, + addSupportedPrimDesc({{LayoutType::ncsp, inDataPrecision}, {LayoutType::ncsp, ov::element::i32}}, {{LayoutType::ncsp, inDataPrecision}}, impl_desc_type::ref_any); } @@ -96,24 +104,33 @@ void GatherND::prepareParams() { attrs.srcDims = srcMemPtr->getStaticDims(); attrs.srcStrides = srcMemPtr->getDescWithType()->getStrides(); attrs.dstElementCount = dstMemPtr->getShape().getElementsCount(); - attrs.sliceRank = idxMemPtr->getStaticDims().back(); + attrs.sliceRank = idxMemPtr->getStaticDims().back(); execPtr = std::make_shared(attrs); } -GatherND::GatherNDExecutor::GatherNDExecutor(const GatherNDAttributes& attrs) 
: sliceRank(attrs.sliceRank), dataSize(attrs.dataSize) { - batchSize = std::accumulate(attrs.srcDims.begin(), attrs.srcDims.begin() + attrs.batchDims, size_t(1), std::multiplies()); - dataLength = std::accumulate(attrs.srcDims.begin() + sliceRank + attrs.batchDims, attrs.srcDims.end(), size_t(1), +GatherND::GatherNDExecutor::GatherNDExecutor(const GatherNDAttributes& attrs) + : sliceRank(attrs.sliceRank), + dataSize(attrs.dataSize) { + batchSize = std::accumulate(attrs.srcDims.begin(), + attrs.srcDims.begin() + attrs.batchDims, + size_t(1), + std::multiplies()); + dataLength = std::accumulate(attrs.srcDims.begin() + sliceRank + attrs.batchDims, + attrs.srcDims.end(), + size_t(1), std::multiplies()); cycles = attrs.dstElementCount / (dataLength * batchSize); workAmount = batchSize * cycles; - srcBatchStride = std::accumulate(attrs.srcDims.begin() + attrs.batchDims, attrs.srcDims.end(), size_t(1), + srcBatchStride = std::accumulate(attrs.srcDims.begin() + attrs.batchDims, + attrs.srcDims.end(), + size_t(1), std::multiplies()); idxBatchStride = cycles * sliceRank; dstBatchStride = cycles * dataLength; srcShifts.resize(attrs.sliceRank, 0); - for (size_t i = 0; i < attrs.sliceRank ; i++) + for (size_t i = 0; i < attrs.sliceRank; i++) srcShifts[i] = attrs.srcStrides[i + attrs.batchDims] * (dataLength > 1 ? dataSize : 1); // optimized implementation 'blocks' via memcpy @@ -128,25 +145,33 @@ void GatherND::execute(dnnl::stream strm) { if (!execPtr) THROW_ERROR("has not compiled executor."); - execPtr->exec(getSrcMemoryAtPort(GATHERND_DATA), - getSrcMemoryAtPort(GATHERND_INDEXES), - getDstMemoryAtPort(0)); + execPtr->exec(getSrcMemoryAtPort(GATHERND_DATA), getSrcMemoryAtPort(GATHERND_INDEXES), getDstMemoryAtPort(0)); } -void GatherND::GatherNDExecutor::exec(const MemoryPtr& srcMemPtr, const MemoryPtr& idxMemPtr, const MemoryPtr& dstMemPtr) { +void GatherND::GatherNDExecutor::exec(const MemoryPtr& srcMemPtr, + const MemoryPtr& idxMemPtr, + const MemoryPtr& dstMemPtr) { if (dataLength > 1) { gatherBlocks(srcMemPtr, idxMemPtr, dstMemPtr); return; } - GatherNDContext ctx { this, srcMemPtr, idxMemPtr, dstMemPtr }; - OV_SWITCH(intel_cpu, GatherNDEmitter, ctx, dataSize, - OV_CASE(sizeof(element_type_traits::value_type), element_type_traits::value_type), - OV_CASE(sizeof(element_type_traits::value_type), element_type_traits::value_type), - OV_CASE(sizeof(element_type_traits::value_type), element_type_traits::value_type)); + GatherNDContext ctx{this, srcMemPtr, idxMemPtr, dstMemPtr}; + OV_SWITCH(intel_cpu, + GatherNDEmitter, + ctx, + dataSize, + OV_CASE(sizeof(element_type_traits::value_type), + element_type_traits::value_type), + OV_CASE(sizeof(element_type_traits::value_type), + element_type_traits::value_type), + OV_CASE(sizeof(element_type_traits::value_type), + element_type_traits::value_type)); } -void GatherND::GatherNDExecutor::gatherBlocks(const MemoryPtr& srcMemPtr, const MemoryPtr& idxMemPtr, const MemoryPtr& dstMemPtr) { +void GatherND::GatherNDExecutor::gatherBlocks(const MemoryPtr& srcMemPtr, + const MemoryPtr& idxMemPtr, + const MemoryPtr& dstMemPtr) { const uint8_t* srcData = srcMemPtr->getDataAs(); const int32_t* indices = idxMemPtr->getDataAs(); uint8_t* dstData = dstMemPtr->getDataAs(); @@ -183,7 +208,9 @@ void GatherND::GatherNDExecutor::gatherBlocks(const MemoryPtr& srcMemPtr, const } template -void GatherND::GatherNDExecutor::gatherElementwise(const MemoryPtr& srcMemPtr, const MemoryPtr& idxMemPtr, const MemoryPtr& dstMemPtr) { +void GatherND::GatherNDExecutor::gatherElementwise(const 
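// --- Editor's note: worked example (not part of the diff) of the stride bookkeeping in
// --- GatherND::GatherNDExecutor for data shape {2, 3, 4, 5}, batch_dims = 1, slice_rank = 2:
// --- each index tuple addresses dims 1 and 2, and the trailing dim (5 elements) is copied whole.
#include <cstddef>
#include <functional>
#include <iostream>
#include <numeric>
#include <vector>

int main() {
    const std::vector<std::size_t> srcDims{2, 3, 4, 5};
    const std::size_t batchDims = 1;
    const std::size_t sliceRank = 2;

    const std::size_t batchSize = std::accumulate(srcDims.begin(), srcDims.begin() + batchDims,
                                                  std::size_t(1), std::multiplies<std::size_t>());      // 2
    const std::size_t dataLength = std::accumulate(srcDims.begin() + sliceRank + batchDims, srcDims.end(),
                                                   std::size_t(1), std::multiplies<std::size_t>());     // 5
    const std::size_t srcBatchStride = std::accumulate(srcDims.begin() + batchDims, srcDims.end(),
                                                       std::size_t(1), std::multiplies<std::size_t>()); // 3 * 4 * 5 = 60

    std::cout << batchSize << ' ' << dataLength << ' ' << srcBatchStride << '\n';
    return 0;
}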
MemoryPtr& srcMemPtr, + const MemoryPtr& idxMemPtr, + const MemoryPtr& dstMemPtr) { const dataType* srcData = srcMemPtr->getDataAs(); const int32_t* indices = idxMemPtr->getDataAs(); dataType* dstData = dstMemPtr->getDataAs(); @@ -227,6 +254,6 @@ bool GatherND::created() const { return getType() == Type::GatherND; } -} // namespace node -} // namespace intel_cpu -} // namespace ov +} // namespace node +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/gather_nd.h b/src/plugins/intel_cpu/src/nodes/gather_nd.h index ed643a2da08899..312cb465bf9e6c 100644 --- a/src/plugins/intel_cpu/src/nodes/gather_nd.h +++ b/src/plugins/intel_cpu/src/nodes/gather_nd.h @@ -14,7 +14,7 @@ class GatherND : public Node { public: GatherND(const std::shared_ptr& op, const GraphContext::CPtr context); - void getSupportedDescriptors() override {}; + void getSupportedDescriptors() override{}; void initSupportedPrimitiveDescriptors() override; void execute(dnnl::stream strm) override; bool created() const override; @@ -65,7 +65,7 @@ class GatherND : public Node { MemoryPtr dstMemPtr; }; - template + template struct GatherNDEmitter { void operator()(GatherNDContext& ctx) { ctx.executor->gatherElementwise(ctx.srcMemPtr, ctx.idxMemPtr, ctx.dstMemPtr); @@ -80,6 +80,6 @@ class GatherND : public Node { executorPtr execPtr = nullptr; }; -} // namespace node -} // namespace intel_cpu -} // namespace ov +} // namespace node +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/gather_tree.cpp b/src/plugins/intel_cpu/src/nodes/gather_tree.cpp index 5834cd1e1048ba..2ff9a1ccdb8f59 100644 --- a/src/plugins/intel_cpu/src/nodes/gather_tree.cpp +++ b/src/plugins/intel_cpu/src/nodes/gather_tree.cpp @@ -2,13 +2,14 @@ // SPDX-License-Identifier: Apache-2.0 // +#include "openvino/op/gather_tree.hpp" + +#include #include #include -#include -#include "openvino/op/gather_tree.hpp" -#include "openvino/core/parallel.hpp" #include "gather_tree.h" +#include "openvino/core/parallel.hpp" #include "utils/general_utils.h" namespace ov { @@ -59,11 +60,11 @@ void GatherTree::initSupportedPrimitiveDescriptors() { if (!one_of(precision, ov::element::f32, ov::element::i32)) precision = ov::element::f32; - if (getOriginalInputPrecisionAtPort(GATHER_TREE_PARENT_IDX) != precision || + if (getOriginalInputPrecisionAtPort(GATHER_TREE_PARENT_IDX) != precision || getOriginalInputPrecisionAtPort(GATHER_TREE_MAX_SEQ_LEN) != precision || - getOriginalInputPrecisionAtPort(GATHER_TREE_END_TOKEN) != precision || - getOriginalOutputPrecisionAtPort(0) != precision) { - OPENVINO_THROW(errorPrefix, " has incorrect input/output data precision. Must be the same."); + getOriginalInputPrecisionAtPort(GATHER_TREE_END_TOKEN) != precision || + getOriginalOutputPrecisionAtPort(0) != precision) { + OPENVINO_THROW(errorPrefix, " has incorrect input/output data precision. 
Must be the same."); } addSupportedPrimDesc({{LayoutType::ncsp, precision}, @@ -121,13 +122,15 @@ void GatherTree::executeDynamicImpl(dnnl::stream strm) { execute(strm); } -GatherTree::GatherTreeExecutor::GatherTreeExecutor(const VectorDims& stepIdxDims, const VectorDims& parentIdxDims, - const VectorDims& maxSeqLenDims, const VectorDims& dstDims) - : maxTime{static_cast(stepIdxDims[0])} - , batchSize{stepIdxDims[1]} - , beamWidth{stepIdxDims[2]} - , bbSize{batchSize * beamWidth} - , parentIdxSize{std::accumulate(parentIdxDims.cbegin(), parentIdxDims.cend(), 1lu, std::multiplies())} { +GatherTree::GatherTreeExecutor::GatherTreeExecutor(const VectorDims& stepIdxDims, + const VectorDims& parentIdxDims, + const VectorDims& maxSeqLenDims, + const VectorDims& dstDims) + : maxTime{static_cast(stepIdxDims[0])}, + batchSize{stepIdxDims[1]}, + beamWidth{stepIdxDims[2]}, + bbSize{batchSize * beamWidth}, + parentIdxSize{std::accumulate(parentIdxDims.cbegin(), parentIdxDims.cend(), 1lu, std::multiplies())} { if (maxTime != static_cast(parentIdxDims[0]) || maxTime != static_cast(dstDims[0]) || batchSize != parentIdxDims[1] || batchSize != dstDims[1] || batchSize != maxSeqLenDims[0] || beamWidth != parentIdxDims[2] || beamWidth != dstDims[2]) { @@ -136,14 +139,17 @@ GatherTree::GatherTreeExecutor::GatherTreeExecutor(const VectorDims& stepIdxDims } } -template -void GatherTree::GatherTreeExecutor::exec(const MemoryPtr& stepIdxMemPtr, const MemoryPtr& parentIdxMemPtr, - const MemoryPtr& maxSeqLenMemPtr, const MemoryPtr& endTokenMemPtr, const MemoryPtr& dstMemPtr) { - const auto *stepIdx = stepIdxMemPtr->getDataAs(); - const auto *parentIdx = parentIdxMemPtr->getDataAs(); - const auto *maxSeqLen = maxSeqLenMemPtr->getDataAs(); +template +void GatherTree::GatherTreeExecutor::exec(const MemoryPtr& stepIdxMemPtr, + const MemoryPtr& parentIdxMemPtr, + const MemoryPtr& maxSeqLenMemPtr, + const MemoryPtr& endTokenMemPtr, + const MemoryPtr& dstMemPtr) { + const auto* stepIdx = stepIdxMemPtr->getDataAs(); + const auto* parentIdx = parentIdxMemPtr->getDataAs(); + const auto* maxSeqLen = maxSeqLenMemPtr->getDataAs(); const auto endToken = (endTokenMemPtr->getDataAs())[0]; - auto *finalIdx = dstMemPtr->getDataAs(); + auto* finalIdx = dstMemPtr->getDataAs(); bool incorrectResult = false; parallel_for2d(batchSize, beamWidth, [&](size_t batch, size_t beam) { @@ -164,7 +170,7 @@ void GatherTree::GatherTreeExecutor::exec(const MemoryPtr& stepIdxMemPtr, const } bool finished = false; - auto *final = &finalIdx[batch * beamWidth + beam]; + auto* final = &finalIdx[batch * beamWidth + beam]; for (time = 0; time < maxSequenceInBeam; time++, final += bbSize) { if (finished) (*final) = endToken; @@ -184,6 +190,6 @@ bool GatherTree::created() const { return getType() == Type::GatherTree; } -} // namespace node -} // namespace intel_cpu -} // namespace ov +} // namespace node +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/gather_tree.h b/src/plugins/intel_cpu/src/nodes/gather_tree.h index 69d63f834b555d..9874fceb835ba5 100644 --- a/src/plugins/intel_cpu/src/nodes/gather_tree.h +++ b/src/plugins/intel_cpu/src/nodes/gather_tree.h @@ -14,7 +14,7 @@ class GatherTree : public Node { public: GatherTree(const std::shared_ptr& op, const GraphContext::CPtr context); - void getSupportedDescriptors() override {}; + void getSupportedDescriptors() override{}; void initSupportedPrimitiveDescriptors() override; void execute(dnnl::stream strm) override; bool created() const override; @@ -32,7 +32,7 @@ 
class GatherTree : public Node { const VectorDims& dstDims); ~GatherTreeExecutor() = default; - template + template void exec(const MemoryPtr& stepIdxMemPtr, const MemoryPtr& parentIdxMemPtr, const MemoryPtr& maxSeqLenMemPtr, @@ -60,6 +60,6 @@ class GatherTree : public Node { std::string errorPrefix; }; -} // namespace node -} // namespace intel_cpu -} // namespace ov +} // namespace node +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/generate_proposals.cpp b/src/plugins/intel_cpu/src/nodes/generate_proposals.cpp index ae32e1e4729096..0ed50c7b0d73a8 100644 --- a/src/plugins/intel_cpu/src/nodes/generate_proposals.cpp +++ b/src/plugins/intel_cpu/src/nodes/generate_proposals.cpp @@ -2,22 +2,22 @@ // SPDX-License-Identifier: Apache-2.0 // -#include +#include #include #include +#include #include -#include #include -#include +#include #if defined(HAVE_AVX2) -#include +# include #endif -#include "openvino/op/generate_proposals.hpp" -#include "openvino/core/parallel.hpp" #include "common/cpu_memcpy.h" #include "generate_proposals.h" +#include "openvino/core/parallel.hpp" +#include "openvino/op/generate_proposals.hpp" #include "shape_inference/shape_inference_internal_dyn.hpp" namespace ov { @@ -30,21 +30,29 @@ struct Indexer4d { int dim23_; int dim123_; - explicit Indexer4d(int dim0, int dim1, int dim2, int dim3): - dim3_(dim3), dim23_(dim2 * dim3), dim123_(dim1 * dim2 * dim3) { + explicit Indexer4d(int dim0, int dim1, int dim2, int dim3) + : dim3_(dim3), + dim23_(dim2 * dim3), + dim123_(dim1 * dim2 * dim3) { (void)dim0; } int operator()(int i, int j, int k, int n) const { - return i * dim123_ + j * dim23_ + k * dim3_ + n; + return i * dim123_ + j * dim23_ + k * dim3_ + n; } }; - -void refine_anchors(const float* deltas, const float* scores, const float* anchors, - float* proposals, const int anchors_num, const int bottom_H, - const int bottom_W, const float img_H, const float img_W, - const float min_box_H, const float min_box_W, +void refine_anchors(const float* deltas, + const float* scores, + const float* anchors, + float* proposals, + const int anchors_num, + const int bottom_H, + const int bottom_W, + const float img_H, + const float img_W, + const float min_box_H, + const float min_box_W, const float max_delta_log_wh, float coordinates_offset) { Indexer4d delta_idx(anchors_num, 4, bottom_H, bottom_W); @@ -111,18 +119,23 @@ void refine_anchors(const float* deltas, const float* scores, const float* ancho void unpack_boxes(const float* p_proposals, float* unpacked_boxes, int* is_dead, int pre_nms_topn) { parallel_for(pre_nms_topn, [&](size_t i) { - unpacked_boxes[0*pre_nms_topn + i] = p_proposals[6*i + 0]; - unpacked_boxes[1*pre_nms_topn + i] = p_proposals[6*i + 1]; - unpacked_boxes[2*pre_nms_topn + i] = p_proposals[6*i + 2]; - unpacked_boxes[3*pre_nms_topn + i] = p_proposals[6*i + 3]; - unpacked_boxes[4*pre_nms_topn + i] = p_proposals[6*i + 4]; - is_dead[i] = (p_proposals[6*i + 5] == 1.0) ? 0 : 1; + unpacked_boxes[0 * pre_nms_topn + i] = p_proposals[6 * i + 0]; + unpacked_boxes[1 * pre_nms_topn + i] = p_proposals[6 * i + 1]; + unpacked_boxes[2 * pre_nms_topn + i] = p_proposals[6 * i + 2]; + unpacked_boxes[3 * pre_nms_topn + i] = p_proposals[6 * i + 3]; + unpacked_boxes[4 * pre_nms_topn + i] = p_proposals[6 * i + 4]; + is_dead[i] = (p_proposals[6 * i + 5] == 1.0) ? 
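// --- Editor's note: minimal restatement (not part of the diff) of the Indexer4d helper used by
// --- refine_anchors(); dim0 only documents the outer extent, so it is dropped here for brevity.
#include <iostream>

struct Indexer4dSketch {
    int dim3_, dim23_, dim123_;
    Indexer4dSketch(int dim1, int dim2, int dim3)
        : dim3_(dim3), dim23_(dim2 * dim3), dim123_(dim1 * dim2 * dim3) {}
    int operator()(int i, int j, int k, int n) const {
        // Row-major offset into a [dim0, dim1, dim2, dim3] tensor.
        return i * dim123_ + j * dim23_ + k * dim3_ + n;
    }
};

int main() {
    Indexer4dSketch idx(4, 3, 5);           // shape [N, 4, 3, 5]; N is not needed for the offset
    std::cout << idx(1, 2, 0, 3) << '\n';   // 1*60 + 2*15 + 0*5 + 3 = 93
    return 0;
}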
0 : 1; }); } -void nms_cpu(const int num_boxes, int is_dead[], - const float* boxes, int index_out[], size_t* const num_out, - const int base_index, const float nms_thresh, const int max_num_out, +void nms_cpu(const int num_boxes, + int is_dead[], + const float* boxes, + int index_out[], + size_t* const num_out, + const int base_index, + const float nms_thresh, + const int max_num_out, float coordinates_offset) { const int num_proposals = num_boxes; size_t count = 0; @@ -133,9 +146,9 @@ void nms_cpu(const int num_boxes, int is_dead[], const float* y1 = boxes + 3 * num_proposals; #if defined(HAVE_AVX2) - __m256 vc_fone = _mm256_set1_ps(coordinates_offset); + __m256 vc_fone = _mm256_set1_ps(coordinates_offset); __m256i vc_ione = _mm256_set1_epi32(1); - __m256 vc_zero = _mm256_set1_ps(0.0f); + __m256 vc_zero = _mm256_set1_ps(0.0f); __m256 vc_nms_thresh = _mm256_set1_ps(nms_thresh); #endif @@ -156,13 +169,13 @@ void nms_cpu(const int num_boxes, int is_dead[], __m256 vx1i = _mm256_set1_ps(x1[box]); __m256 vy1i = _mm256_set1_ps(y1[box]); - __m256 vA_width = _mm256_sub_ps(vx1i, vx0i); + __m256 vA_width = _mm256_sub_ps(vx1i, vx0i); __m256 vA_height = _mm256_sub_ps(vy1i, vy0i); - __m256 vA_area = _mm256_mul_ps(_mm256_add_ps(vA_width, vc_fone), _mm256_add_ps(vA_height, vc_fone)); + __m256 vA_area = _mm256_mul_ps(_mm256_add_ps(vA_width, vc_fone), _mm256_add_ps(vA_height, vc_fone)); for (; tail <= num_boxes - 8; tail += 8) { - __m256i *pdst = reinterpret_cast<__m256i*>(is_dead + tail); - __m256i vdst = _mm256_loadu_si256(pdst); + __m256i* pdst = reinterpret_cast<__m256i*>(is_dead + tail); + __m256i vdst = _mm256_loadu_si256(pdst); __m256 vx0j = _mm256_loadu_ps(x0 + tail); __m256 vy0j = _mm256_loadu_ps(y0 + tail); @@ -174,13 +187,13 @@ void nms_cpu(const int num_boxes, int is_dead[], __m256 vx1 = _mm256_min_ps(vx1i, vx1j); __m256 vy1 = _mm256_min_ps(vy1i, vy1j); - __m256 vwidth = _mm256_add_ps(_mm256_sub_ps(vx1, vx0), vc_fone); + __m256 vwidth = _mm256_add_ps(_mm256_sub_ps(vx1, vx0), vc_fone); __m256 vheight = _mm256_add_ps(_mm256_sub_ps(vy1, vy0), vc_fone); __m256 varea = _mm256_mul_ps(_mm256_max_ps(vc_zero, vwidth), _mm256_max_ps(vc_zero, vheight)); - __m256 vB_width = _mm256_sub_ps(vx1j, vx0j); + __m256 vB_width = _mm256_sub_ps(vx1j, vx0j); __m256 vB_height = _mm256_sub_ps(vy1j, vy0j); - __m256 vB_area = _mm256_mul_ps(_mm256_add_ps(vB_width, vc_fone), _mm256_add_ps(vB_height, vc_fone)); + __m256 vB_area = _mm256_mul_ps(_mm256_add_ps(vB_width, vc_fone), _mm256_add_ps(vB_height, vc_fone)); __m256 vdivisor = _mm256_sub_ps(_mm256_add_ps(vA_area, vB_area), varea); __m256 vintersection_area = _mm256_div_ps(varea, vdivisor); @@ -221,9 +234,9 @@ void nms_cpu(const int num_boxes, int is_dead[], const float y1 = std::min(y1i, y1j); // intersection area - const float width = std::max(0.0f, x1 - x0 + coordinates_offset); - const float height = std::max(0.0f, y1 - y0 + coordinates_offset); - const float area = width * height; + const float width = std::max(0.0f, x1 - x0 + coordinates_offset); + const float height = std::max(0.0f, y1 - y0 + coordinates_offset); + const float area = width * height; // area of A, B const float A_area = (x1i - x0i + coordinates_offset) * (y1i - y0i + coordinates_offset); @@ -241,16 +254,20 @@ void nms_cpu(const int num_boxes, int is_dead[], *num_out = count; } - -void fill_output_blobs(const float* proposals, const int* roi_indices, - float* rois, float* scores, uint8_t* roi_num, - const int num_proposals, const size_t num_rois, const int post_nms_topn, +void fill_output_blobs(const 
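// --- Editor's note: scalar form (not part of the diff) of the IoU test that the nms_cpu tail loop
// --- performs and that the AVX2 branch vectorizes; coordinates_offset is 0 or 1 depending on
// --- whether box edges are treated as inclusive.
#include <algorithm>
#include <iostream>

static float iou(float x0i, float y0i, float x1i, float y1i,
                 float x0j, float y0j, float x1j, float y1j,
                 float coordinates_offset) {
    const float x0 = std::max(x0i, x0j);
    const float y0 = std::max(y0i, y0j);
    const float x1 = std::min(x1i, x1j);
    const float y1 = std::min(y1i, y1j);

    // intersection area
    const float width  = std::max(0.0f, x1 - x0 + coordinates_offset);
    const float height = std::max(0.0f, y1 - y0 + coordinates_offset);
    const float area   = width * height;

    // area of A, B
    const float A_area = (x1i - x0i + coordinates_offset) * (y1i - y0i + coordinates_offset);
    const float B_area = (x1j - x0j + coordinates_offset) * (y1j - y0j + coordinates_offset);

    return area / (A_area + B_area - area);
}

int main() {
    // Two unit boxes overlapping by half: IoU = 0.5 / 1.5 = 1/3; a box is suppressed
    // when this value exceeds nms_thresh.
    std::cout << iou(0.f, 0.f, 1.f, 1.f, 0.5f, 0.f, 1.5f, 1.f, 0.f) << '\n';
    return 0;
}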
float* proposals, + const int* roi_indices, + float* rois, + float* scores, + uint8_t* roi_num, + const int num_proposals, + const size_t num_rois, + const int post_nms_topn, ov::element::Type roi_num_type) { - const float *src_x0 = proposals + 0 * num_proposals; - const float *src_y0 = proposals + 1 * num_proposals; - const float *src_x1 = proposals + 2 * num_proposals; - const float *src_y1 = proposals + 3 * num_proposals; - const float *src_score = proposals + 4 * num_proposals; + const float* src_x0 = proposals + 0 * num_proposals; + const float* src_y0 = proposals + 1 * num_proposals; + const float* src_x1 = proposals + 2 * num_proposals; + const float* src_y1 = proposals + 3 * num_proposals; + const float* src_score = proposals + 4 * num_proposals; parallel_for(num_rois, [&](size_t i) { int index = roi_indices[i]; @@ -274,8 +291,8 @@ void fill_output_blobs(const float* proposals, const int* roi_indices, } // namespace -bool GenerateProposals::isSupportedOperation - (const std::shared_ptr& op, std::string& errorMessage) noexcept { +bool GenerateProposals::isSupportedOperation(const std::shared_ptr& op, + std::string& errorMessage) noexcept { try { if (!ov::as_type_ptr(op)) { errorMessage = "Node is not an instance of the Proposal from the operations set v0."; @@ -332,13 +349,13 @@ void GenerateProposals::execute(dnnl::stream strm) { } size_t anchor_dims_size = 1; - const auto &anchorDims = getParentEdgeAt(INPUT_ANCHORS)->getMemory().getStaticDims(); + const auto& anchorDims = getParentEdgeAt(INPUT_ANCHORS)->getMemory().getStaticDims(); for (size_t i = 0; i < anchorDims.size(); i++) { anchor_dims_size *= anchorDims[i]; } size_t deltas_dims_size = 1; - const auto &deltaDims = getParentEdgeAt(INPUT_DELTAS)->getMemory().getStaticDims(); + const auto& deltaDims = getParentEdgeAt(INPUT_DELTAS)->getMemory().getStaticDims(); for (size_t i = 1; i < deltaDims.size(); i++) { deltas_dims_size *= deltaDims[i]; } @@ -346,7 +363,7 @@ void GenerateProposals::execute(dnnl::stream strm) { OPENVINO_THROW("'Anchors' blob size for GenerateProposals is incompatible with 'deltas' blob size!"); size_t score_dims_size = 1; - const auto &scoreDims = getParentEdgeAt(INPUT_SCORES)->getMemory().getStaticDims(); + const auto& scoreDims = getParentEdgeAt(INPUT_SCORES)->getMemory().getStaticDims(); for (size_t i = 1; i < scoreDims.size(); i++) { score_dims_size *= scoreDims[i]; } @@ -354,16 +371,16 @@ void GenerateProposals::execute(dnnl::stream strm) { OPENVINO_THROW("'Deltas' blob size for GenerateProposals is incompatible with 'scores' blob size!"); size_t im_info_dims_size = 1; - const auto &infoDims = getParentEdgeAt(INPUT_IM_INFO)->getMemory().getStaticDims(); + const auto& infoDims = getParentEdgeAt(INPUT_IM_INFO)->getMemory().getStaticDims(); for (size_t i = 1; i < infoDims.size(); i++) { im_info_dims_size *= infoDims[i]; } // Prepare memory - const float *p_deltas_item = getSrcDataAtPortAs(INPUT_DELTAS); - const float *p_scores_item = getSrcDataAtPortAs(INPUT_SCORES); - const float *p_anchors_item = getSrcDataAtPortAs(INPUT_ANCHORS); - const float *p_img_info_cpu = getSrcDataAtPortAs(INPUT_IM_INFO); + const float* p_deltas_item = getSrcDataAtPortAs(INPUT_DELTAS); + const float* p_scores_item = getSrcDataAtPortAs(INPUT_SCORES); + const float* p_anchors_item = getSrcDataAtPortAs(INPUT_ANCHORS); + const float* p_img_info_cpu = getSrcDataAtPortAs(INPUT_IM_INFO); const int anchors_num = scoreDims[1]; @@ -422,27 +439,50 @@ void GenerateProposals::execute(dnnl::stream strm) { const float min_box_H = min_size_ * 
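// --- Editor's note: small illustration (not part of the diff) of the std::partial_sort call in
// --- GenerateProposals::execute(): only the best pre_nms_topn proposals need to be ordered by
// --- score before NMS; the remainder can stay unsorted.
#include <algorithm>
#include <cstddef>
#include <iostream>
#include <vector>

struct Box { float score; };

int main() {
    std::vector<Box> proposals{{0.2f}, {0.9f}, {0.5f}, {0.7f}};
    const std::size_t pre_nms_topn = 2;

    std::partial_sort(proposals.begin(), proposals.begin() + pre_nms_topn, proposals.end(),
                      [](const Box& a, const Box& b) { return a.score > b.score; });

    for (std::size_t i = 0; i < pre_nms_topn; ++i)
        std::cout << proposals[i].score << ' ';   // 0.9 0.7
    std::cout << '\n';
    return 0;
}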
scale_h; const float min_box_W = min_size_ * scale_w; - refine_anchors(p_deltas_item, p_scores_item, p_anchors_item, - reinterpret_cast(&proposals_[0]), anchors_num, bottom_H, - bottom_W, img_H, img_W, - min_box_H, min_box_W, + refine_anchors(p_deltas_item, + p_scores_item, + p_anchors_item, + reinterpret_cast(&proposals_[0]), + anchors_num, + bottom_H, + bottom_W, + img_H, + img_W, + min_box_H, + min_box_W, static_cast(std::log(1000. / 16.)), coordinates_offset_); - std::partial_sort(proposals_.begin(), proposals_.begin() + pre_nms_topn, proposals_.end(), - [](const ProposalBox &struct1, const ProposalBox &struct2) { + std::partial_sort(proposals_.begin(), + proposals_.begin() + pre_nms_topn, + proposals_.end(), + [](const ProposalBox& struct1, const ProposalBox& struct2) { return (struct1.score > struct2.score); }); - unpack_boxes(reinterpret_cast(&proposals_[0]), &unpacked_boxes[0], &is_dead[0], pre_nms_topn); - nms_cpu(pre_nms_topn, &is_dead[0], &unpacked_boxes[0], &roi_indices_[0], &num_rois, 0, - nms_thresh_, post_nms_topn_, coordinates_offset_); + unpack_boxes(reinterpret_cast(&proposals_[0]), &unpacked_boxes[0], &is_dead[0], pre_nms_topn); + nms_cpu(pre_nms_topn, + &is_dead[0], + &unpacked_boxes[0], + &roi_indices_[0], + &num_rois, + 0, + nms_thresh_, + post_nms_topn_, + coordinates_offset_); size_t new_num_rois = total_num_rois + num_rois; roi_item.resize(new_num_rois * 4); score_item.resize(new_num_rois); - fill_output_blobs(&unpacked_boxes[0], &roi_indices_[0], &roi_item[total_num_rois * 4], &score_item[total_num_rois], - p_roi_num, pre_nms_topn, num_rois, post_nms_topn_, roi_num_type); + fill_output_blobs(&unpacked_boxes[0], + &roi_indices_[0], + &roi_item[total_num_rois * 4], + &score_item[total_num_rois], + p_roi_num, + pre_nms_topn, + num_rois, + post_nms_topn_, + roi_num_type); p_deltas_item += deltas_dims_size; p_scores_item += score_dims_size; p_img_info_cpu += im_info_dims_size; @@ -451,13 +491,13 @@ void GenerateProposals::execute(dnnl::stream strm) { } // copy to out memory redefineOutputMemory({VectorDims{total_num_rois, 4}, VectorDims{total_num_rois}, VectorDims{batch_size}}); - float *p_roi_item = getDstDataAtPortAs(OUTPUT_ROIS); - float *p_roi_score_item = getDstDataAtPortAs(OUTPUT_SCORES); + float* p_roi_item = getDstDataAtPortAs(OUTPUT_ROIS); + float* p_roi_score_item = getDstDataAtPortAs(OUTPUT_SCORES); uint8_t* p_roi_num_item = getDstDataAtPortAs(OUTPUT_ROI_NUM); memcpy(p_roi_item, &roi_item[0], roi_item.size() * sizeof(float)); memcpy(p_roi_score_item, &score_item[0], score_item.size() * sizeof(float)); memcpy(p_roi_num_item, &roi_num[0], getDstMemoryAtPort(OUTPUT_ROI_NUM)->getSize()); - } catch (const std::exception &e) { + } catch (const std::exception& e) { std::string errorMsg = e.what(); OPENVINO_THROW(errorMsg); } @@ -475,6 +515,6 @@ bool GenerateProposals::needPrepareParams() const { return false; } -} // namespace node -} // namespace intel_cpu -} // namespace ov +} // namespace node +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/generate_proposals.h b/src/plugins/intel_cpu/src/nodes/generate_proposals.h index 5438f30011d986..666338eed3d4aa 100644 --- a/src/plugins/intel_cpu/src/nodes/generate_proposals.h +++ b/src/plugins/intel_cpu/src/nodes/generate_proposals.h @@ -14,7 +14,7 @@ class GenerateProposals : public Node { public: GenerateProposals(const std::shared_ptr& op, const GraphContext::CPtr context); - void getSupportedDescriptors() override {}; + void getSupportedDescriptors() override{}; void 
initSupportedPrimitiveDescriptors() override; void execute(dnnl::stream strm) override; bool created() const override; @@ -35,13 +35,13 @@ class GenerateProposals : public Node { // scores, shape [rois_num] // roi_num, shape [N] - const int INPUT_IM_INFO {0}; - const int INPUT_ANCHORS {1}; - const int INPUT_DELTAS {2}; - const int INPUT_SCORES {3}; - const int OUTPUT_ROIS {0}; - const int OUTPUT_SCORES {1}; - const int OUTPUT_ROI_NUM {2}; + const int INPUT_IM_INFO{0}; + const int INPUT_ANCHORS{1}; + const int INPUT_DELTAS{2}; + const int INPUT_SCORES{3}; + const int OUTPUT_ROIS{0}; + const int OUTPUT_SCORES{1}; + const int OUTPUT_ROI_NUM{2}; float min_size_ = 0.f; int pre_nms_topn_ = 0; @@ -52,6 +52,6 @@ class GenerateProposals : public Node { std::vector roi_indices_; }; -} // namespace node +} // namespace node } // namespace intel_cpu } // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/grid_sample.cpp b/src/plugins/intel_cpu/src/nodes/grid_sample.cpp index c8b73360539b68..9f346a2db14dac 100644 --- a/src/plugins/intel_cpu/src/nodes/grid_sample.cpp +++ b/src/plugins/intel_cpu/src/nodes/grid_sample.cpp @@ -3,15 +3,16 @@ // #include "grid_sample.hpp" -#include "openvino/op/grid_sample.hpp" + #include "openvino/core/parallel.hpp" +#include "openvino/op/grid_sample.hpp" using namespace ov::intel_cpu; using namespace ov::intel_cpu::node; #if defined(OPENVINO_ARCH_X86_64) using namespace dnnl::impl::cpu; -#endif // OPENVINO_ARCH_X86_64 +#endif // OPENVINO_ARCH_X86_64 #define THROW_ERROR(...) OPENVINO_THROW(getTypeStr(), " node with name '", getName(), "' ", __VA_ARGS__) @@ -28,7 +29,7 @@ bool GridSample::isSupportedOperation(const std::shared_ptr& op, } #else return false; -#endif // OPENVINO_ARCH_X86_64 +#endif // OPENVINO_ARCH_X86_64 } catch (...) 
{ return false; } @@ -61,30 +62,30 @@ GridSample::GridSample(const std::shared_ptr& op, const GraphContext:: const auto& attributes = ov::as_type_ptr(op)->get_attributes(); alignCorners = attributes.align_corners; switch (attributes.mode) { - case op::v9::GridSample::InterpolationMode::BILINEAR: - interpolationMode = GridSampleInterpolationMode::BILINEAR; - break; - case op::v9::GridSample::InterpolationMode::BICUBIC: - interpolationMode = GridSampleInterpolationMode::BICUBIC; - break; - case op::v9::GridSample::InterpolationMode::NEAREST: - interpolationMode = GridSampleInterpolationMode::NEAREST; - break; - default: - THROW_CPU_NODE_ERR("supports only BILINEAR, BICUBIC, NEAREST interpolation modes."); + case op::v9::GridSample::InterpolationMode::BILINEAR: + interpolationMode = GridSampleInterpolationMode::BILINEAR; + break; + case op::v9::GridSample::InterpolationMode::BICUBIC: + interpolationMode = GridSampleInterpolationMode::BICUBIC; + break; + case op::v9::GridSample::InterpolationMode::NEAREST: + interpolationMode = GridSampleInterpolationMode::NEAREST; + break; + default: + THROW_CPU_NODE_ERR("supports only BILINEAR, BICUBIC, NEAREST interpolation modes."); } switch (attributes.padding_mode) { - case op::v9::GridSample::PaddingMode::ZEROS: - paddingMode = GridSamplePaddingMode::ZEROS; - break; - case op::v9::GridSample::PaddingMode::BORDER: - paddingMode = GridSamplePaddingMode::BORDER; - break; - case op::v9::GridSample::PaddingMode::REFLECTION: - paddingMode = GridSamplePaddingMode::REFLECTION; - break; - default: - THROW_CPU_NODE_ERR("supports only BORDER, REFLECTION, ZEROS paddings modes."); + case op::v9::GridSample::PaddingMode::ZEROS: + paddingMode = GridSamplePaddingMode::ZEROS; + break; + case op::v9::GridSample::PaddingMode::BORDER: + paddingMode = GridSamplePaddingMode::BORDER; + break; + case op::v9::GridSample::PaddingMode::REFLECTION: + paddingMode = GridSamplePaddingMode::REFLECTION; + break; + default: + THROW_CPU_NODE_ERR("supports only BORDER, REFLECTION, ZEROS paddings modes."); } } @@ -107,8 +108,7 @@ void GridSample::initSupportedPrimitiveDescriptors() { } // 95905 - to add nspc layout support. - addSupportedPrimDesc({{LayoutType::ncsp, dataPrecision}, - {LayoutType::ncsp, gridPrecision}}, + addSupportedPrimDesc({{LayoutType::ncsp, dataPrecision}, {LayoutType::ncsp, gridPrecision}}, {{LayoutType::ncsp, dataPrecision}}, implType); } @@ -116,25 +116,26 @@ void GridSample::initSupportedPrimitiveDescriptors() { void GridSample::createPrimitive() { kernel::GridSampleKernelConfParams jcp; - jcp.inDataPrc = dataPrecision; - jcp.gridPrc = gridPrecision; + jcp.inDataPrc = dataPrecision; + jcp.gridPrc = gridPrecision; jcp.dynamicShapes = isDynamicNode(); - jcp.alignCorners = alignCorners; + jcp.alignCorners = alignCorners; jcp.interpolationMode = interpolationMode; - jcp.paddingMode = paddingMode; + jcp.paddingMode = paddingMode; const auto& srcDataDims = getInputShapeAtPort(IN_DATA).getDims(); if (!jcp.dynamicShapes) { - jcp.batchNum = srcDataDims[0]; - jcp.cannelNum = srcDataDims[1]; - jcp.dynamicBatch = false; + jcp.batchNum = srcDataDims[0]; + jcp.cannelNum = srcDataDims[1]; + jcp.dynamicBatch = false; jcp.dynamicChannel = false; - jcp.srcBatchStepB = std::accumulate(srcDataDims.begin() + 1, srcDataDims.end(), dataTypeSize, std::multiplies()); + jcp.srcBatchStepB = + std::accumulate(srcDataDims.begin() + 1, srcDataDims.end(), dataTypeSize, std::multiplies()); } else { - jcp.dynamicBatch = srcDataDims[0] == Shape::UNDEFINED_DIM; - jcp.batchNum = jcp.dynamicBatch ? 
1lu : srcDataDims[0]; + jcp.dynamicBatch = srcDataDims[0] == Shape::UNDEFINED_DIM; + jcp.batchNum = jcp.dynamicBatch ? 1lu : srcDataDims[0]; jcp.dynamicChannel = srcDataDims[1] == Shape::UNDEFINED_DIM; - jcp.cannelNum = jcp.dynamicChannel ? 1lu : srcDataDims[1]; + jcp.cannelNum = jcp.dynamicChannel ? 1lu : srcDataDims[1]; } if (x64::mayiuse(x64::avx512_core)) { @@ -195,7 +196,7 @@ void GridSample::prepareParams() { const uint64_t dataElPerVec = jitKernel->getDataElPerVec(); const auto& srcDataShape = dataMemPtr->getStaticDims(); - const auto& dstShape = dstMemPtr->getStaticDims(); + const auto& dstShape = dstMemPtr->getStaticDims(); const uint64_t totalWork = dstShape[2] * dstShape[3]; const uint64_t wpt = ((totalWork / dataElPerVec) / m_threads_num + 1) * dataElPerVec; @@ -210,26 +211,27 @@ void GridSample::prepareParams() { return; } - p.batchNum = srcDataShape[0]; - p.channelsNum = srcDataShape[1]; + p.batchNum = srcDataShape[0]; + p.channelsNum = srcDataShape[1]; p.srcHeightF[0] = srcDataShape[2]; - p.srcWidthF[0] = srcDataShape[3]; + p.srcWidthF[0] = srcDataShape[3]; p.gridStartB = dstStart * 2 * gridTypeSize; - p.dstStartB = dstStart * dataTypeSize; + p.dstStartB = dstStart * dataTypeSize; - p.srcBatchStepB = std::accumulate(srcDataShape.begin() + 1, srcDataShape.end(), dataTypeSize, std::multiplies()); + p.srcBatchStepB = + std::accumulate(srcDataShape.begin() + 1, srcDataShape.end(), dataTypeSize, std::multiplies()); p.gridBatchStepB = (dstShape[2] * dstShape[3] - p.workAmount) * 2 * gridTypeSize; - p.dstBatchStepB = (dstShape[1] * dstShape[2] * dstShape[3] - p.workAmount) * dataTypeSize; + p.dstBatchStepB = (dstShape[1] * dstShape[2] * dstShape[3] - p.workAmount) * dataTypeSize; p.srcChannelStepB = srcDataShape[2] * srcDataShape[3] * dataTypeSize; p.dstChannelStepB = dstShape[2] * dstShape[3] * dataTypeSize; p.dataTypeSize[0] = dataTypeSize; p.srcHeightSub1F[0] = p.srcHeightF[0] - 1.f; - p.srcWidthSub1F[0] = p.srcWidthF[0] - 1.f; + p.srcWidthSub1F[0] = p.srcWidthF[0] - 1.f; p.srcHeightMul2F[0] = p.srcHeightF[0] * 2.f; - p.srcWidthMul2F[0] = p.srcWidthF[0] * 2.f; + p.srcWidthMul2F[0] = p.srcWidthF[0] * 2.f; if (interpolationMode == GridSampleInterpolationMode::BICUBIC && srcDataShape[3] >= 4) { p.srcWidthB[0] = (srcDataShape[3] - 3) * dataTypeSize; } else { @@ -237,24 +239,24 @@ void GridSample::prepareParams() { } if (alignCorners) { p.srcHeightMul2Sub1F[0] = p.srcHeightF[0] == 1.f ? 1.f : p.srcHeightSub1F[0] * 2.f; - p.srcWidthMul2Sub1F[0] = p.srcWidthF[0] == 1.f ? 1.f : p.srcWidthSub1F[0] * 2.f; - p.wDenormCoefF[0] = (p.srcWidthF[0] - 1.f) / 2.f; + p.srcWidthMul2Sub1F[0] = p.srcWidthF[0] == 1.f ? 
1.f : p.srcWidthSub1F[0] * 2.f; + p.wDenormCoefF[0] = (p.srcWidthF[0] - 1.f) / 2.f; p.hDenormCoefF[0] = (p.srcHeightF[0] - 1.f) / 2.f; } else { p.srcHeightMul2Sub1F[0] = p.srcHeightMul2F[0] - 1.f; - p.srcWidthMul2Sub1F[0] = p.srcWidthMul2F[0] - 1.f; + p.srcWidthMul2Sub1F[0] = p.srcWidthMul2F[0] - 1.f; } if (!x64::mayiuse(x64::avx512_core)) { - std::fill(p.srcHeightF.begin(), p.srcHeightF.end(), p.srcHeightF[0]); - std::fill(p.srcWidthF.begin(), p.srcWidthF.end(), p.srcWidthF[0]); - std::fill(p.dataTypeSize.begin(), p.dataTypeSize.end(), p.dataTypeSize[0]); - std::fill(p.srcHeightSub1F.begin(), p.srcHeightSub1F.end(), p.srcHeightSub1F[0]); - std::fill(p.srcWidthSub1F.begin(), p.srcWidthSub1F.end(), p.srcWidthSub1F[0]); - std::fill(p.srcHeightMul2F.begin(), p.srcHeightMul2F.end(), p.srcHeightMul2F[0]); - std::fill(p.srcWidthMul2F.begin(), p.srcWidthMul2F.end(), p.srcWidthMul2F[0]); - std::fill(p.srcWidthB.begin(), p.srcWidthB.end(), p.srcWidthB[0]); + std::fill(p.srcHeightF.begin(), p.srcHeightF.end(), p.srcHeightF[0]); + std::fill(p.srcWidthF.begin(), p.srcWidthF.end(), p.srcWidthF[0]); + std::fill(p.dataTypeSize.begin(), p.dataTypeSize.end(), p.dataTypeSize[0]); + std::fill(p.srcHeightSub1F.begin(), p.srcHeightSub1F.end(), p.srcHeightSub1F[0]); + std::fill(p.srcWidthSub1F.begin(), p.srcWidthSub1F.end(), p.srcWidthSub1F[0]); + std::fill(p.srcHeightMul2F.begin(), p.srcHeightMul2F.end(), p.srcHeightMul2F[0]); + std::fill(p.srcWidthMul2F.begin(), p.srcWidthMul2F.end(), p.srcWidthMul2F[0]); + std::fill(p.srcWidthB.begin(), p.srcWidthB.end(), p.srcWidthB[0]); std::fill(p.srcHeightMul2Sub1F.begin(), p.srcHeightMul2Sub1F.end(), p.srcHeightMul2Sub1F[0]); - std::fill(p.srcWidthMul2Sub1F.begin(), p.srcWidthMul2Sub1F.end(), p.srcWidthMul2Sub1F[0]); + std::fill(p.srcWidthMul2Sub1F.begin(), p.srcWidthMul2Sub1F.end(), p.srcWidthMul2Sub1F[0]); if (alignCorners) { std::fill(p.wDenormCoefF.begin(), p.wDenormCoefF.end(), p.wDenormCoefF[0]); std::fill(p.hDenormCoefF.begin(), p.hDenormCoefF.end(), p.hDenormCoefF[0]); @@ -264,9 +266,9 @@ void GridSample::prepareParams() { } void GridSample::execute(dnnl::stream strm) { - const void* srcData = getSrcDataAtPort(IN_DATA); + const void* srcData = getSrcDataAtPort(IN_DATA); const uint8_t* gridData = getSrcDataAtPortAs(IN_GRID); - uint8_t* dstData = getDstDataAtPortAs(0); + uint8_t* dstData = getDstDataAtPortAs(0); auto threadBody = [&](const int ithr, const int nthr) { const auto& p = execParamsPerThread[ithr]; @@ -275,30 +277,30 @@ void GridSample::execute(dnnl::stream strm) { return; } - arg.src = srcData; - arg.grid = gridData + p.gridStartB; - arg.dst = dstData + p.dstStartB; - arg.batchNum = p.batchNum; - arg.channelsNum = p.channelsNum; - arg.srcHeightF = p.srcHeightF.data(); - arg.srcWidthF = p.srcWidthF.data(); - arg.srcWidthB = p.srcWidthB.data(); - arg.srcChannelStepB = p.srcChannelStepB; - arg.dstChannelStepB = p.dstChannelStepB; - arg.srcBatchStepB = p.srcBatchStepB; - arg.gridBatchStepB = p.gridBatchStepB; - arg.dstBatchStepB = p.dstBatchStepB; - arg.srcHeightSub1F = p.srcHeightSub1F.data(); - arg.srcWidthSub1F = p.srcWidthSub1F.data(); - arg.srcWidthMul2F = p.srcWidthMul2F.data(); - arg.srcHeightMul2F = p.srcHeightMul2F.data(); + arg.src = srcData; + arg.grid = gridData + p.gridStartB; + arg.dst = dstData + p.dstStartB; + arg.batchNum = p.batchNum; + arg.channelsNum = p.channelsNum; + arg.srcHeightF = p.srcHeightF.data(); + arg.srcWidthF = p.srcWidthF.data(); + arg.srcWidthB = p.srcWidthB.data(); + arg.srcChannelStepB = p.srcChannelStepB; + 
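// --- Editor's note: sketch (not part of the diff) of how a normalized GridSample coordinate in
// --- [-1, 1] maps to a pixel coordinate; the kernel precomputes (W - 1) / 2 as wDenormCoefF when
// --- align_corners is set. The non-align_corners branch follows the usual GridSample convention
// --- and is an assumption here, since the diff only shows the precomputed constants.
#include <iostream>

static float denormalize(float x, int W, bool align_corners) {
    return align_corners ? (x + 1.f) * (W - 1) / 2.f
                         : ((x + 1.f) * W - 1.f) / 2.f;
}

int main() {
    std::cout << denormalize(-1.f, 10, true)  << ' '    // 0    (centre of the leftmost pixel)
              << denormalize( 1.f, 10, true)  << ' '    // 9    (centre of the rightmost pixel)
              << denormalize(-1.f, 10, false) << '\n';  // -0.5 (outer edge of the leftmost pixel)
    return 0;
}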
arg.dstChannelStepB = p.dstChannelStepB; + arg.srcBatchStepB = p.srcBatchStepB; + arg.gridBatchStepB = p.gridBatchStepB; + arg.dstBatchStepB = p.dstBatchStepB; + arg.srcHeightSub1F = p.srcHeightSub1F.data(); + arg.srcWidthSub1F = p.srcWidthSub1F.data(); + arg.srcWidthMul2F = p.srcWidthMul2F.data(); + arg.srcHeightMul2F = p.srcHeightMul2F.data(); arg.srcHeightMul2Sub1F = p.srcHeightMul2Sub1F.data(); - arg.srcWidthMul2Sub1F = p.srcWidthMul2Sub1F.data(); - arg.wDenormCoefF = p.wDenormCoefF.data(); - arg.hDenormCoefF = p.hDenormCoefF.data(); - arg.dataTypeSize = p.dataTypeSize.data(); - arg.buffer = p.buffer.data(); - arg.workAmount = p.workAmount; + arg.srcWidthMul2Sub1F = p.srcWidthMul2Sub1F.data(); + arg.wDenormCoefF = p.wDenormCoefF.data(); + arg.hDenormCoefF = p.hDenormCoefF.data(); + arg.dataTypeSize = p.dataTypeSize.data(); + arg.buffer = p.buffer.data(); + arg.workAmount = p.workAmount; (*jitKernel)(&arg); }; @@ -314,4 +316,4 @@ bool GridSample::created() const { return getType() == Type::GridSample; } -#endif // OPENVINO_ARCH_X86_64 +#endif // OPENVINO_ARCH_X86_64 diff --git a/src/plugins/intel_cpu/src/nodes/grid_sample.hpp b/src/plugins/intel_cpu/src/nodes/grid_sample.hpp index b4468d58be9b52..eb4fd38b64c878 100644 --- a/src/plugins/intel_cpu/src/nodes/grid_sample.hpp +++ b/src/plugins/intel_cpu/src/nodes/grid_sample.hpp @@ -5,6 +5,7 @@ #pragma once #include + #include "kernels/x64/grid_sample.hpp" namespace ov { @@ -16,35 +17,35 @@ class GridSample : public Node { GridSample(const std::shared_ptr& op, const GraphContext::CPtr context); static bool isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept; - void getSupportedDescriptors() override {}; + void getSupportedDescriptors() override{}; void initSupportedPrimitiveDescriptors() override; void createPrimitive() override; void execute(dnnl::stream strm) override; bool created() const override; struct threadExecParams { - uint64_t batchNum = 1lu; + uint64_t batchNum = 1lu; uint64_t channelsNum = 1lu; - std::vector srcHeightF{ 1.f }; - std::vector srcWidthF{ 1.f }; - std::vector srcWidthB{ 1lu }; - std::vector dataTypeSize{ 1lu }; - std::vector srcHeightMul2F{ 1.f }; - std::vector srcWidthMul2F{ 1.f }; - std::vector srcHeightMul2Sub1F{ 1.f }; - std::vector srcWidthMul2Sub1F{ 1.f }; - std::vector srcHeightSub1F{ 1.f }; - std::vector srcWidthSub1F{ 1.f }; - std::vector wDenormCoefF{ 1.f }; - std::vector hDenormCoefF{ 1.f }; - uint64_t gridStartB = 0lu; - uint64_t dstStartB = 0lu; + std::vector srcHeightF{1.f}; + std::vector srcWidthF{1.f}; + std::vector srcWidthB{1lu}; + std::vector dataTypeSize{1lu}; + std::vector srcHeightMul2F{1.f}; + std::vector srcWidthMul2F{1.f}; + std::vector srcHeightMul2Sub1F{1.f}; + std::vector srcWidthMul2Sub1F{1.f}; + std::vector srcHeightSub1F{1.f}; + std::vector srcWidthSub1F{1.f}; + std::vector wDenormCoefF{1.f}; + std::vector hDenormCoefF{1.f}; + uint64_t gridStartB = 0lu; + uint64_t dstStartB = 0lu; uint64_t srcChannelStepB = 0lu; uint64_t dstChannelStepB = 0lu; - uint64_t srcBatchStepB = 0lu; - uint64_t gridBatchStepB = 0lu; - uint64_t dstBatchStepB = 0lu; - uint64_t workAmount = 0lu; + uint64_t srcBatchStepB = 0lu; + uint64_t gridBatchStepB = 0lu; + uint64_t dstBatchStepB = 0lu; + uint64_t workAmount = 0lu; std::vector buffer; }; @@ -71,6 +72,6 @@ class GridSample : public Node { std::shared_ptr jitKernel; }; -} // namespace node -} // namespace intel_cpu -} // namespace ov +} // namespace node +} // namespace intel_cpu +} // namespace ov diff --git 
a/src/plugins/intel_cpu/src/nodes/grn.cpp b/src/plugins/intel_cpu/src/nodes/grn.cpp index 10de2ef2286f0f..374452812eaf3a 100644 --- a/src/plugins/intel_cpu/src/nodes/grn.cpp +++ b/src/plugins/intel_cpu/src/nodes/grn.cpp @@ -2,11 +2,12 @@ // SPDX-License-Identifier: Apache-2.0 // +#include "grn.h" + #include -#include "openvino/opsets/opset1.hpp" #include "openvino/core/parallel.hpp" -#include "grn.h" +#include "openvino/opsets/opset1.hpp" namespace ov { namespace intel_cpu { @@ -97,11 +98,12 @@ void GRN::execute(dnnl::stream strm) { parallel_for3d(N, H, W, [&](int b, int h, int w) { double variance = 0; for (int c = 0; c < C; c++) { - variance += std::pow(src_data[b*C*H*W + c*H*W + h*W + w], 2); + variance += std::pow(src_data[b * C * H * W + c * H * W + h * W + w], 2); } variance = std::pow(variance + bias, 0.5f); for (int c = 0; c < C; c++) { - dst_data[b*C*H*W + c*H*W + h*W + w] = src_data[b*C*H*W + c*H*W + h*W + w] / static_cast(variance); + dst_data[b * C * H * W + c * H * W + h * W + w] = + src_data[b * C * H * W + c * H * W + h * W + w] / static_cast(variance); } }); } @@ -110,6 +112,6 @@ bool GRN::created() const { return getType() == Type::GRN; } -} // namespace node -} // namespace intel_cpu -} // namespace ov +} // namespace node +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/grn.h b/src/plugins/intel_cpu/src/nodes/grn.h index 52e77318e2132f..17eac4e81b9d6c 100644 --- a/src/plugins/intel_cpu/src/nodes/grn.h +++ b/src/plugins/intel_cpu/src/nodes/grn.h @@ -14,7 +14,7 @@ class GRN : public Node { public: GRN(const std::shared_ptr& op, const GraphContext::CPtr context); - void getSupportedDescriptors() override {}; + void getSupportedDescriptors() override{}; void initSupportedPrimitiveDescriptors() override; void execute(dnnl::stream strm) override; bool created() const override; @@ -34,6 +34,6 @@ class GRN : public Node { std::string errorPrefix; }; -} // namespace node -} // namespace intel_cpu -} // namespace ov +} // namespace node +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/if.cpp b/src/plugins/intel_cpu/src/nodes/if.cpp index 1b6102ff954689..88e2c84970d874 100644 --- a/src/plugins/intel_cpu/src/nodes/if.cpp +++ b/src/plugins/intel_cpu/src/nodes/if.cpp @@ -4,22 +4,22 @@ #include "if.h" -#include "openvino/op/if.hpp" +#include +#include #include "common/cpu_memcpy.h" -#include "shape_inference/shape_inference_internal_dyn.hpp" #include "nodes/common/cpu_convert.h" +#include "openvino/op/if.hpp" +#include "shape_inference/shape_inference_internal_dyn.hpp" #include "transformations/utils/utils.hpp" -#include -#include - namespace ov { namespace intel_cpu { namespace node { -If::PortMapHelper::PortMapHelper(const MemoryPtr &from, const std::deque& to, - const dnnl::engine& eng) : srcMemPtr(from), dstMemPtrs(to) { +If::PortMapHelper::PortMapHelper(const MemoryPtr& from, const std::deque& to, const dnnl::engine& eng) + : srcMemPtr(from), + dstMemPtrs(to) { size = 0; if (srcMemPtr->getDesc().isDefined()) size = srcMemPtr->getShape().getElementsCount(); @@ -43,7 +43,7 @@ void If::PortMapHelper::execute(dnnl::stream& strm) { } void If::PortMapHelper::redefineTo() { - const auto &currDesc = dstMemPtrs.front()->getDesc(); + const auto& currDesc = dstMemPtrs.front()->getDesc(); if (currDesc.getShape().isDynamic() || currDesc.getShape().getStaticDims() != srcMemPtr->getStaticDims()) { // TODO : check the entire dstMemPtrs usage considering the proper memory sharing auto newShape = 
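// --- Editor's note: scalar restatement (not part of the diff) of the normalization done in
// --- GRN::execute(): y[b,c,h,w] = x[b,c,h,w] / sqrt(sum_c x[b,c,h,w]^2 + bias).
#include <cmath>
#include <iostream>
#include <vector>

int main() {
    const std::vector<float> x{3.f, 4.f};   // the channel values at one (b, h, w) position
    const float bias = 0.f;

    float norm = bias;
    for (float v : x)
        norm += v * v;                      // 9 + 16 = 25
    norm = std::sqrt(norm);                 // 5

    for (float v : x)
        std::cout << v / norm << ' ';       // 0.6 0.8
    std::cout << '\n';
    return 0;
}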
srcMemPtr->getStaticDims(); @@ -60,7 +60,7 @@ bool If::isSupportedOperation(const std::shared_ptr& op, std::st try { if (!one_of(op->get_type_info(), ov::op::v8::If::get_type_info_static())) { errorMessage = "Not supported If operation version " + std::string(op->get_type_info().version_id) + - " with name '" + op->get_friendly_name() + "'. Node If supports only opset8 version."; + " with name '" + op->get_friendly_name() + "'. Node If supports only opset8 version."; return false; } } catch (...) { @@ -69,8 +69,9 @@ bool If::isSupportedOperation(const std::shared_ptr& op, std::st return true; } -If::If(const std::shared_ptr& op, const GraphContext::CPtr context) : - Node(op, context, InternalDynShapeInferFactory()), ovOp(op) { +If::If(const std::shared_ptr& op, const GraphContext::CPtr context) + : Node(op, context, InternalDynShapeInferFactory()), + ovOp(op) { std::string errorMessage; if (!isSupportedOperation(op, errorMessage)) { OPENVINO_THROW_NOT_IMPLEMENTED(errorMessage); @@ -85,11 +86,9 @@ void If::getSupportedDescriptors() { subGraphThen.CreateGraph(thenBody, context); subGraphElse.CreateGraph(elseBody, context); - const auto& inMapThen = subGraphThen.GetInputNodesMap(); for (const auto& param : ifOp->get_then_body()->get_parameters()) { - auto inNode = inMapThen.find(ifOp->get_then_body()->get_parameter_index(param)); - if (inNode != inMapThen.end()) { - inputMemThen.push_back(getToMemories(inNode->second.get(), 0)); + if (auto inNode = subGraphThen.getInputNodeByIndex(ifOp->get_then_body()->get_parameter_index(param))) { + inputMemThen.push_back(getToMemories(inNode.get(), 0)); } else { OPENVINO_THROW("Then body of node If with name ", getName(), @@ -98,11 +97,9 @@ void If::getSupportedDescriptors() { } } - const auto& inMapElse = subGraphElse.GetInputNodesMap(); for (const auto& param : ifOp->get_else_body()->get_parameters()) { - auto inNode = inMapElse.find(ifOp->get_else_body()->get_parameter_index(param)); - if (inNode != inMapElse.end()) { - inputMemElse.push_back(getToMemories(inNode->second.get(), 0)); + if (auto inNode = subGraphElse.getInputNodeByIndex(ifOp->get_else_body()->get_parameter_index(param))) { + inputMemElse.push_back(getToMemories(inNode.get(), 0)); } else { OPENVINO_THROW("Else body of node If with name ", getName(), @@ -111,49 +108,51 @@ void If::getSupportedDescriptors() { } } - const auto &outMapThen = subGraphThen.GetOutputNodesMap(); for (const auto& out : ifOp->get_then_body()->get_results()) { - auto outNode = outMapThen.find(ifOp->get_then_body()->get_result_index(out)); - if (outNode != outMapThen.end()) { - auto outMem = outNode->second->getSrcMemoryAtPort(0); + if (auto outNode = subGraphThen.getOutputNodeByIndex(ifOp->get_then_body()->get_result_index(out))) { + auto outMem = outNode->getSrcMemoryAtPort(0); outputMemThen.push_back(outMem); } else { - OPENVINO_THROW("Then body of node If with name ", getName(), " does not have output with name: ", out->get_friendly_name()); + OPENVINO_THROW("Then body of node If with name ", + getName(), + " does not have output with name: ", + out->get_friendly_name()); } } - const auto &outMapElse = subGraphElse.GetOutputNodesMap(); for (const auto& out : ifOp->get_else_body()->get_results()) { - auto outNode = outMapElse.find(ifOp->get_else_body()->get_result_index(out)); - if (outNode != outMapElse.end()) { - auto outMem = outNode->second->getSrcMemoryAtPort(0); + if (auto outNode = subGraphElse.getOutputNodeByIndex(ifOp->get_else_body()->get_result_index(out))) { + auto outMem = 
outNode->getSrcMemoryAtPort(0); outputMemElse.push_back(outMem); } else { - OPENVINO_THROW("Else body of node If with name ", getName(), " does not have output with name: ", out->get_friendly_name()); + OPENVINO_THROW("Else body of node If with name ", + getName(), + " does not have output with name: ", + out->get_friendly_name()); } } // Port map: outputs for (const auto& desc : ifOp->get_output_descriptions(0)) { auto body_output_idx = desc->m_body_value_index; - thenOutputPortMap.emplace_back(PortMap { - static_cast(desc->m_output_index), static_cast(body_output_idx)}); + thenOutputPortMap.emplace_back( + PortMap{static_cast(desc->m_output_index), static_cast(body_output_idx)}); } for (const auto& desc : ifOp->get_output_descriptions(1)) { auto body_output_idx = desc->m_body_value_index; - elseOutputPortMap.emplace_back(PortMap { - static_cast(desc->m_output_index), static_cast(body_output_idx)}); + elseOutputPortMap.emplace_back( + PortMap{static_cast(desc->m_output_index), static_cast(body_output_idx)}); } for (const auto& desc : ifOp->get_input_descriptions(0)) { auto body_input_index = desc->m_body_parameter_index; - thenInputPortMap.emplace_back(PortMap { - static_cast(desc->m_input_index), static_cast(body_input_index)}); + thenInputPortMap.emplace_back( + PortMap{static_cast(desc->m_input_index), static_cast(body_input_index)}); } for (const auto& desc : ifOp->get_input_descriptions(1)) { auto body_input_index = desc->m_body_parameter_index; - elseInputPortMap.emplace_back(PortMap { - static_cast(desc->m_input_index), static_cast(body_input_index)}); + elseInputPortMap.emplace_back( + PortMap{static_cast(desc->m_input_index), static_cast(body_input_index)}); } } @@ -166,16 +165,17 @@ void If::initSupportedPrimitiveDescriptors() { config.outConfs.reserve(getChildEdges().size()); for (size_t i = 0; i < inputShapes.size(); i++) { - PortConfig dataConf {}; + PortConfig dataConf{}; auto descCreator = BlockedDescCreator::getCommonCreators().at(LayoutType::ncsp); dataConf.setMemDesc(descCreator->createSharedDesc(getOriginalInputPrecisionAtPort(i), getInputShapeAtPort(i))); config.inConfs.emplace_back(dataConf); } for (size_t i = 0; i < outputShapes.size(); i++) { - PortConfig dataConf {}; + PortConfig dataConf{}; auto descCreator = BlockedDescCreator::getCommonCreators().at(LayoutType::ncsp); - dataConf.setMemDesc(descCreator->createSharedDesc(getOriginalOutputPrecisionAtPort(i), getOutputShapeAtPort(i))); + dataConf.setMemDesc( + descCreator->createSharedDesc(getOriginalOutputPrecisionAtPort(i), getOutputShapeAtPort(i))); config.outConfs.push_back(dataConf); } @@ -195,9 +195,9 @@ void If::createPrimitive() { } void If::prepareBeforeMappers(const bool isThen, const dnnl::engine& eng) { - auto &inputPortMap = isThen ? thenInputPortMap : elseInputPortMap; - auto &inputMems = isThen ? inputMemThen : inputMemElse; - auto &beforeMappers = isThen ? beforeThenMappers : beforeElseMappers; + auto& inputPortMap = isThen ? thenInputPortMap : elseInputPortMap; + auto& inputMems = isThen ? inputMemThen : inputMemElse; + auto& beforeMappers = isThen ? beforeThenMappers : beforeElseMappers; for (auto& map_rule : inputPortMap) { auto fromMem = getSrcMemoryAtPort(map_rule.from); auto& toMems = inputMems[map_rule.to]; @@ -216,12 +216,12 @@ void If::prepareBeforeMappers(const bool isThen, const dnnl::engine& eng) { } void If::prepareAfterMappers(const bool isThen, const dnnl::engine& eng) { - auto &outputPortMap = isThen ? thenOutputPortMap : elseOutputPortMap; - auto &outputMems = isThen ? 
outputMemThen : outputMemElse; - auto &afterMappers = isThen ? afterThenMappers : afterElseMappers; + auto& outputPortMap = isThen ? thenOutputPortMap : elseOutputPortMap; + auto& outputMems = isThen ? outputMemThen : outputMemElse; + auto& afterMappers = isThen ? afterThenMappers : afterElseMappers; for (auto& map_rule : outputPortMap) { auto toMems = getToMemories(this, map_rule.from); - auto &fromMem = outputMems[map_rule.to]; + auto& fromMem = outputMems[map_rule.to]; // Check precision between If node input/output and it's subgrapsh input/output. for (const auto& toMem : toMems) { if (fromMem->getDesc().getPrecision() != toMem->getDesc().getPrecision()) { @@ -250,11 +250,11 @@ void If::execute(dnnl::stream strm) { auto& afterMappers = condition ? afterThenMappers : afterElseMappers; auto& subGraph = condition ? subGraphThen : subGraphElse; - for (auto &mapper : beforeMappers) + for (auto& mapper : beforeMappers) mapper->execute(strm); subGraph.ResetInferCount(); subGraph.Infer(); - for (auto &mapper : afterMappers) + for (auto& mapper : afterMappers) mapper->execute(strm); } @@ -266,6 +266,6 @@ bool If::created() const { return getType() == Type::If; } -} // namespace node -} // namespace intel_cpu -} // namespace ov +} // namespace node +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/if.h b/src/plugins/intel_cpu/src/nodes/if.h index f858c92b0b2651..a2babb45b6c803 100644 --- a/src/plugins/intel_cpu/src/nodes/if.h +++ b/src/plugins/intel_cpu/src/nodes/if.h @@ -4,8 +4,8 @@ #pragma once -#include #include +#include #include #include @@ -25,12 +25,18 @@ class If : public Node { void createPrimitive() override; bool created() const override; void execute(dnnl::stream strm) override; - bool isExecutable() const override { return true; } + bool isExecutable() const override { + return true; + } protected: void executeDynamicImpl(dnnl::stream strm) override; - bool needPrepareParams() const override { return false; }; - bool needShapeInfer() const override { return false; } + bool needPrepareParams() const override { + return false; + }; + bool needShapeInfer() const override { + return false; + } private: void prepareBeforeMappers(const bool isThen, const dnnl::engine& eng); @@ -64,21 +70,14 @@ class If : public Node { std::vector> inputMemThen, inputMemElse; std::deque outputMemThen, outputMemElse; - std::vector> - beforeThenMappers, - beforeElseMappers, - afterThenMappers, + std::vector> beforeThenMappers, beforeElseMappers, afterThenMappers, afterElseMappers; - std::vector - thenInputPortMap, - thenOutputPortMap, - elseInputPortMap, - elseOutputPortMap; + std::vector thenInputPortMap, thenOutputPortMap, elseInputPortMap, elseOutputPortMap; const std::shared_ptr ovOp; }; -} // namespace node -} // namespace intel_cpu -} // namespace ov +} // namespace node +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/input.cpp b/src/plugins/intel_cpu/src/nodes/input.cpp index 1f650bd8c5de17..4bb2f714b284fd 100644 --- a/src/plugins/intel_cpu/src/nodes/input.cpp +++ b/src/plugins/intel_cpu/src/nodes/input.cpp @@ -5,8 +5,11 @@ #include "input.h" #include "cpu/x64/jit_generator.hpp" +#include "memory_desc/cpu_memory_desc_utils.h" #include "nodes/node_config.h" #include "openvino/core/parallel.hpp" +#include "openvino/core/shape.hpp" +#include "openvino/core/type/element_type.hpp" #include "shape_inference/shape_inference_pass_through.hpp" using namespace dnnl; @@ -35,16 +38,14 @@ struct jit_has_subnormals_base : public 
jit_generator { } fn_t get() { - return jit_ker() || create_kernel() == dnnl::impl::status::success - ? (fn_t)jit_ker() - : nullptr; + return jit_ker() || create_kernel() == dnnl::impl::status::success ? (fn_t)jit_ker() : nullptr; } protected: - void foreach(const Xbyak::Reg64& idx, - size_t step, - const Xbyak::Reg64& end, - std::function && fn) { + void foreach (const Xbyak::Reg64& idx, + size_t step, + const Xbyak::Reg64& end, + std::function && fn) { Label loop, exit; L(loop); @@ -58,75 +59,76 @@ struct jit_has_subnormals_base : public jit_generator { L(exit); } - void copy_floats(const Xbyak::Reg64& dst, - const Xbyak::Reg64& src, - const Xbyak::Reg64& size) { + void copy_floats(const Xbyak::Reg64& dst, const Xbyak::Reg64& src, const Xbyak::Reg64& size) { push(rsi); push(r15); xor_(rsi, rsi); - foreach(rsi, 1, size, [&, this](const Xbyak::Reg64& idx) { + foreach (rsi, 1, size, [&, this](const Xbyak::Reg64& idx) { mov(r15d, dword[src + idx * sizeof(float)]); mov(dword[dst + idx * sizeof(float)], r15d); - }); + }) + ; pop(r15); pop(rsi); } - void check_subnormals(const Xbyak::Reg64& src, const Xbyak::Ymm &exponent_mask, const Xbyak::Ymm &mantissa_mask, const Xbyak::Ymm &zero) { + void check_subnormals(const Xbyak::Reg64& src, + const Xbyak::Ymm& exponent_mask, + const Xbyak::Ymm& mantissa_mask, + const Xbyak::Ymm& zero) { auto a = ymm1; auto b = ymm2; auto c = ymm3; - vmovdqu(a, yword[src]); // load 8 floats - vpand(b, a, mantissa_mask); // b = a & 00000000011111111111111111111111 - vpcmpeqd(b, b, zero); // if (b == 0) b = 1 else b = 0 - vpand(c, a, exponent_mask); // c = a & 01111111100000000000000000000000 - vpcmpeqd(c, c, zero); // if (c == 0) c = 1 else c = 0 - vptest(b, c); // if ((!b & c) == 0) CF = 1 else CF = 0 + vmovdqu(a, yword[src]); // load 8 floats + vpand(b, a, mantissa_mask); // b = a & 00000000011111111111111111111111 + vpcmpeqd(b, b, zero); // if (b == 0) b = 1 else b = 0 + vpand(c, a, exponent_mask); // c = a & 01111111100000000000000000000000 + vpcmpeqd(c, c, zero); // if (c == 0) c = 1 else c = 0 + vptest(b, c); // if ((!b & c) == 0) CF = 1 else CF = 0 } - void check_subnormals(const Xbyak::Reg64& src, const Xbyak::Xmm &exponent_mask, const Xbyak::Xmm &mantissa_mask, const Xbyak::Xmm &zero) { + void check_subnormals(const Xbyak::Reg64& src, + const Xbyak::Xmm& exponent_mask, + const Xbyak::Xmm& mantissa_mask, + const Xbyak::Xmm& zero) { auto a = xmm1; auto b = xmm2; auto c = xmm3; - uni_vmovdqu(a, xword[src]); // load 4 floats - uni_vmovdqu(b, a); // b = a - uni_vmovdqu(c, a); // c = a - uni_vpand(b, b, mantissa_mask); // b = a & 00000000011111111111111111111111 - uni_vpcmpeqd(b, b, zero); // if (b == 0) b = 1 else b = 0 - uni_vpand(c, c, exponent_mask); // c = a & 01111111100000000000000000000000 - uni_vpcmpeqd(c, c, zero); // if (c == 0) c = 1 else c = 0 - uni_vtestps(b, c); // if ((!b & c) == 0) CF = 1 else CF = 0 + uni_vmovdqu(a, xword[src]); // load 4 floats + uni_vmovdqu(b, a); // b = a + uni_vmovdqu(c, a); // c = a + uni_vpand(b, b, mantissa_mask); // b = a & 00000000011111111111111111111111 + uni_vpcmpeqd(b, b, zero); // if (b == 0) b = 1 else b = 0 + uni_vpand(c, c, exponent_mask); // c = a & 01111111100000000000000000000000 + uni_vpcmpeqd(c, c, zero); // if (c == 0) c = 1 else c = 0 + uni_vtestps(b, c); // if ((!b & c) == 0) CF = 1 else CF = 0 } protected: Label exit, has_subnormals, no_subnormals; - const Reg64 ®_src = rax; - const Reg64 ®_dst = rbx; - const Reg64 ®_sz = rdx; - const Reg64 ®_idx = rsi; - const Reg64 ®_mask_addr = r15; + const Reg64& 
reg_src = rax; + const Reg64& reg_dst = rbx; + const Reg64& reg_sz = rdx; + const Reg64& reg_idx = rsi; + const Reg64& reg_mask_addr = r15; static const uint32_t exponent_mask_data[8]; static const uint32_t mantissa_mask_data[8]; }; -const uint32_t jit_has_subnormals_base::exponent_mask_data[8] = { - 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000, - 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000 -}; +const uint32_t jit_has_subnormals_base::exponent_mask_data[8] = + {0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000}; -const uint32_t jit_has_subnormals_base::mantissa_mask_data[8] = { - 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff, - 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff -}; +const uint32_t jit_has_subnormals_base::mantissa_mask_data[8] = + {0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff}; -template +template struct jit_has_subnormals : public jit_has_subnormals_base { using Vmm = typename dnnl::impl::utils::conditional::type; @@ -135,7 +137,7 @@ struct jit_has_subnormals : public jit_has_subnormals_base { const Vmm rmm6 = Vmm(6); const int length = isa == sse41 ? 4 : 8; - void generate() override final { // NOLINT + void generate() override final { // NOLINT size_t const vlen = length; const int sh_bits = std::ilogb(vlen); @@ -162,11 +164,12 @@ struct jit_has_subnormals : public jit_has_subnormals_base { mov(r8, reg_sz); shr(r8, sh_bits); - foreach(reg_idx, 1, r8, [&, this](const Xbyak::Reg64& idx) { + foreach (reg_idx, 1, r8, [&, this](const Xbyak::Reg64& idx) { check_subnormals(reg_src, exponent_mask, mantissa_mask, zero); jnc(has_subnormals); add(reg_src, sizeof(float) * vlen); - }); + }) + ; // Tail shl(reg_idx, sh_bits); @@ -213,11 +216,11 @@ jit_has_subnormals_base::fn_t jit_has_subnormals_function() { return nullptr; } -} // namespace +} // namespace #endif Input::Input(const std::shared_ptr& op, const GraphContext::CPtr context) - : Node(op, context, PassThroughShapeInferFactory()) { + : Node(op, context, PassThroughShapeInferFactory()) { if (!one_of(op->get_type_info(), op::v0::Parameter::get_type_info_static(), op::v0::Constant::get_type_info_static(), @@ -228,9 +231,9 @@ Input::Input(const std::shared_ptr& op, const GraphContext::CPtr conte op->get_type_name(), " with name ", op->get_friendly_name()); - constOp = ov::as_type_ptr(op); - if (constOp) { + if (auto constOp = ov::as_type_ptr(op)) { constant = ConstantType::Const; + m_constOp = constOp; cloneBlobIfRequired(); } else { constant = ConstantType::StrictNoConst; @@ -238,8 +241,14 @@ Input::Input(const std::shared_ptr& op, const GraphContext::CPtr conte } void Input::cloneBlobIfRequired() { - Shape shape(constOp->get_shape().empty() ? ov::Shape(1, 1) : constOp->get_shape()); - const auto prec = constOp->get_element_type(); + const auto prec = m_constOp->get_element_type(); + + if (prec == ov::element::undefined && shape_size(m_constOp->get_shape()) == 0) { + memoryPtr = MemoryDescUtils::makeEmptyMemory(context); + return; + } + + Shape shape(m_constOp->get_shape().empty() ? ov::Shape(1, 1) : m_constOp->get_shape()); const size_t size = shape.getElementsCount(); CpuBlockedMemoryDesc memDesc(prec, shape); @@ -251,28 +260,29 @@ void Input::cloneBlobIfRequired() { needFlushDenormalsToZero = false; } - auto cloneBlob = [&, this] () { + auto cloneBlob = [&, this]() { MemoryPtr memory; // CVS-74980 // oneDNN always allocate 1byte for element type with bitWidth < 8 (u4,u1...) 
// but ngraph Constant uses actual bitWidth for data storage allocation // in that case we make a copy to avoid overflow - if (constOp->get_byte_size() >= memDesc.getCurrentMemSize()) { - if (constOp->get_element_type() == element::string) { - memory = std::make_shared(getEngine(), memDesc, constOp->get_data_ptr()); + if (m_constOp->get_byte_size() >= memDesc.getCurrentMemSize()) { + if (m_constOp->get_element_type() == element::string) { + memory = + std::make_shared(getEngine(), memDesc, m_constOp->get_data_ptr()); } else { - memory = std::make_shared(getEngine(), memDesc, constOp->get_data_ptr()); + memory = std::make_shared(getEngine(), memDesc, m_constOp->get_data_ptr()); } } else { - if (constOp->get_element_type() == element::string) { + if (m_constOp->get_element_type() == element::string) { memory = std::make_shared(getEngine(), memDesc); - auto src = constOp->get_data_ptr(); + auto src = m_constOp->get_data_ptr(); auto dst = memory->getDataAs(); std::copy(src, src + size, dst); } else { memory = std::make_shared(getEngine(), memDesc); - memcpy(memory->getData(), constOp->get_data_ptr(), constOp->get_byte_size()); + memcpy(memory->getData(), m_constOp->get_data_ptr(), m_constOp->get_byte_size()); } } @@ -287,22 +297,22 @@ void Input::cloneBlobIfRequired() { return ptr; }; - auto isBlobAligned = [&] () { - bool blobAlignedOnSSE = true; + auto isBlobAligned = [](const std::shared_ptr& constant) { #if defined(OPENVINO_ARCH_X86) || defined(OPENVINO_ARCH_X86_64) // Majority of arithmetic and data processing instructions in legacy SSE isa requires // the memory address in the operands must be aligned on 16-byte boundary. To ensure // safely reusing ngraph const blob memory, need to check address alignment. - const void *ptr = constOp->get_data_ptr(); - blobAlignedOnSSE = mayiuse(cpu_isa_t::avx2) || ((reinterpret_cast(ptr) & 15) == 0); + const void* ptr = constant->get_data_ptr(); + return mayiuse(cpu_isa_t::avx2) || ((reinterpret_cast(ptr) & 15) == 0); +#else + return true; #endif - return blobAlignedOnSSE; }; // The presence of subnormals is better to determined at IR read time. 
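    // A scalar reference for the test that the jit_has_subnormals kernel above encodes with
    // vpand/vpcmpeqd/vptest and the 0x7f800000 / 0x007fffff masks, and that the hasSubnormals
    // lambda below drives over the constant blob: a single-precision value is subnormal when
    // all of its exponent bits are zero while its mantissa bits are not. The helper below is
    // an illustrative sketch only; its name does not appear in the sources.
    #include <cstdint>

    inline bool is_subnormal_f32(uint32_t bits) {
        const uint32_t exponent_mask = 0x7f800000u;  // 8 exponent bits
        const uint32_t mantissa_mask = 0x007fffffu;  // 23 mantissa bits
        return (bits & exponent_mask) == 0u && (bits & mantissa_mask) != 0u;
    }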
- auto hasSubnormals = [&] () { + auto hasSubnormals = [&]() { if (prec == ov::element::f32) { - uint32_t const *u32data = constOp->get_data_ptr(); + uint32_t const* u32data = m_constOp->get_data_ptr(); if (!size) return false; @@ -316,11 +326,9 @@ void Input::cloneBlobIfRequired() { parallel_for(iterations_num, [&](int n) { auto ptr = u32data + n * batch_size; - const jit_has_subnormals_base::args_t args = { - reinterpret_cast(ptr), - std::min(batch_size, (size_t)(u32data + size - ptr)), - false - }; + const jit_has_subnormals_base::args_t args = {reinterpret_cast(ptr), + std::min(batch_size, (size_t)(u32data + size - ptr)), + false}; fn(&args); @@ -343,12 +351,10 @@ void Input::cloneBlobIfRequired() { return false; }; - auto blobKey = [&] () { + auto blobKey = [&]() { char ptr[32]; - snprintf(ptr, sizeof ptr, "%p", constOp->get_data_ptr()); - return getName() - + "_" + std::to_string(size * prec.size()) - + "_" + ptr; + snprintf(ptr, sizeof ptr, "%p", m_constOp->get_data_ptr()); + return getName() + "_" + std::to_string(size * prec.size()) + "_" + ptr; }; const auto weightCache = context->getWeightsCache(); @@ -356,39 +362,37 @@ void Input::cloneBlobIfRequired() { prec != element::string && // IRs already have all subnormals flushed to zero, but in // read_model scenario with directly loaded original model still can have subnormals - isBlobAligned() && (!needFlushDenormalsToZero || !hasSubnormals()) && + isBlobAligned(m_constOp) && (!needFlushDenormalsToZero || !hasSubnormals()) && // Blob should be cloned in cache only if original weights are stored on other numa node. // This is possible only in multistream case on multisocket machine. - // TODO: don't clone blob for multisocket + multistream case if current stream is run on the numa node where original weights are stored. + // TODO: don't clone blob for multisocket + multistream case if current stream is run on the numa node where + // original weights are stored. (!weightCache || context->getNumNumaNodes() == 1 || context->getCPUStreamExecutor()->get_streams_num() == 1); - memoryPtr = clone_is_not_needed ? std::make_shared(getEngine(), memDesc, constOp->get_data_ptr()) + + memoryPtr = clone_is_not_needed ? std::make_shared(getEngine(), memDesc, m_constOp->get_data_ptr()) : std::const_pointer_cast( weightCache ? 
*weightCache->findOrCreate(blobKey(), cloneBlob) : cloneBlob()); } -static std::vector createInputShapes(const Shape& shape, - const Type type) { +static std::vector createInputShapes(const Shape& shape, const Type type) { if (type == Type::Output) return {shape}; return {}; } -static std::vector createOutputShapes(const Shape& shape, - const Type type) { +static std::vector createOutputShapes(const Shape& shape, const Type type) { if (type == Type::Input) return {shape}; return {}; } -static std::vector createInputPrecisions(const ov::element::Type& prc, - const Type type) { +static std::vector createInputPrecisions(const ov::element::Type& prc, const Type type) { if (type == Type::Output) return {prc}; return {}; } -static std::vector createOutputPrecisions(const ov::element::Type& prc, - const Type type) { +static std::vector createOutputPrecisions(const ov::element::Type& prc, const Type type) { if (type == Type::Input) return {prc}; return {}; @@ -418,17 +422,13 @@ Input::Input(MemoryDescPtr memDesc, const std::string& name, const std::string& extMemDesc = memDesc; } -Input::Input(const std::shared_ptr& op, - const GraphContext::CPtr context, - InputConfig config) +Input::Input(const std::shared_ptr& op, const GraphContext::CPtr context, InputConfig config) : Input(op, context) { extMemDesc = config.desc; m_isInPlace = config.inPlace; } -Input::Input(const std::shared_ptr& op, - const GraphContext::CPtr context, - OutputConfig config) +Input::Input(const std::shared_ptr& op, const GraphContext::CPtr context, OutputConfig config) : Input(op, context) { extMemDesc = config.desc; m_useParentMemoryDescForOutput = config.useParentMemoryDescForOutput; @@ -489,17 +489,23 @@ void Input::createPrimitive() { for (size_t i = 0; i < getChildEdges().size(); i++) { auto dstMemPtr = getDstMemoryAtPort(i); if (!dstMemPtr) - THROW_CPU_NODE_ERR("has null memory object at port ", i, - " to node ", getChildEdgeAt(i)->getChild()->getName(), "."); + THROW_CPU_NODE_ERR("has null memory object at port ", + i, + " to node ", + getChildEdgeAt(i)->getChild()->getName(), + "."); } for (size_t i = 0; i < getParentEdges().size(); i++) { auto srcMemPtr = getSrcMemoryAtPort(i); if (!srcMemPtr) - THROW_CPU_NODE_ERR("has null memory object at port ", i, - " from node ", getParentEdgeAt(i)->getParent()->getName(), "."); + THROW_CPU_NODE_ERR("has null memory object at port ", + i, + " from node ", + getParentEdgeAt(i)->getParent()->getName(), + "."); } - const NodeDesc *selected_pd = getSelectedPrimitiveDescriptor(); + const NodeDesc* selected_pd = getSelectedPrimitiveDescriptor(); if (selected_pd == nullptr) THROW_CPU_NODE_ERR("doesn't have selected primitive descriptor."); } @@ -525,9 +531,7 @@ void Input::initSupportedPdDefault() { inPortConfs.push_back({LayoutType::ncsp, precision}); } - addSupportedPrimDesc(inPortConfs, - outPortConfs, - impl_desc_type::unknown); + addSupportedPrimDesc(inPortConfs, outPortConfs, impl_desc_type::unknown); } void Input::initSupportedPdFromMemDesc() { @@ -543,6 +547,6 @@ void Input::initSupportedPdFromMemDesc() { supportedPrimitiveDescriptors.emplace_back(std::move(config), impl_desc_type::unknown); } -} // namespace node -} // namespace intel_cpu -} // namespace ov +} // namespace node +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/input.h b/src/plugins/intel_cpu/src/nodes/input.h index 4d7febb17ad4b7..6d1f4c27238540 100644 --- a/src/plugins/intel_cpu/src/nodes/input.h +++ b/src/plugins/intel_cpu/src/nodes/input.h @@ -5,6 +5,7 @@ #pragma 
once #include + #include namespace ov { @@ -42,13 +43,9 @@ class Input : public Node { Input(MemoryDescPtr memDesc, const std::string& name, const std::string& type, const GraphContext::CPtr context); - Input(const std::shared_ptr& op, - const GraphContext::CPtr context, - InputConfig config); + Input(const std::shared_ptr& op, const GraphContext::CPtr context, InputConfig config); - Input(const std::shared_ptr& op, - const GraphContext::CPtr context, - OutputConfig config); + Input(const std::shared_ptr& op, const GraphContext::CPtr context, OutputConfig config); void getSupportedDescriptors() override; void initSupportedPrimitiveDescriptors() override; @@ -66,8 +63,12 @@ class Input : public Node { return false; } - bool needShapeInfer() const override { return false; } - bool needPrepareParams() const override { return false; } + bool needShapeInfer() const override { + return false; + } + bool needPrepareParams() const override { + return false; + } private: void cloneBlobIfRequired(); @@ -75,7 +76,7 @@ class Input : public Node { void initSupportedPdFromMemDesc(); private: - std::shared_ptr constOp; + std::shared_ptr m_constOp; MemoryCPtr memoryPtr; bool isMeanImage = false; MemoryDescPtr extMemDesc = nullptr; @@ -83,6 +84,6 @@ class Input : public Node { bool m_isInPlace = false; }; -} // namespace node -} // namespace intel_cpu -} // namespace ov +} // namespace node +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/interaction.cpp b/src/plugins/intel_cpu/src/nodes/interaction.cpp index 5ec48e7a263272..905724c3bc829a 100644 --- a/src/plugins/intel_cpu/src/nodes/interaction.cpp +++ b/src/plugins/intel_cpu/src/nodes/interaction.cpp @@ -4,7 +4,10 @@ #include "interaction.h" -#include "transformations/cpu_opset/x64/op/interaction.hpp" +#include +#include +#include + #include "common/bfloat16.hpp" #include "common/cpu_memcpy.h" #include "cpu/x64/cpu_isa_traits.hpp" @@ -16,10 +19,7 @@ #include "memory_desc/dnnl_blocked_memory_desc.h" #include "nodes/common/cpu_convert.h" #include "onednn/dnnl.h" - -#include -#include -#include +#include "transformations/cpu_opset/x64/op/interaction.hpp" using namespace dnnl::impl::cpu::x64; using namespace Xbyak; @@ -36,7 +36,9 @@ template struct jit_move_scale_kernel : public jit_uni_move_scale_kernel, public jit_generator { DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_move_scale_kernel) - explicit jit_move_scale_kernel(const jit_move_scale_compile_params& jcp) : jit_uni_move_scale_kernel(jcp), jit_generator(jit_name()) { + explicit jit_move_scale_kernel(const jit_move_scale_compile_params& jcp) + : jit_uni_move_scale_kernel(jcp), + jit_generator(jit_name()) { runtime_prc = jcp_.src_prc == ov::element::bf16 ? 
ov::element::bf16 : ov::element::f32; if (jcp_.dst_prc == ov::element::i8 || jcp_.dst_prc == ov::element::u8) runtime_prc = ov::element::f32; @@ -50,12 +52,13 @@ struct jit_move_scale_kernel : public jit_uni_move_scale_kernel, public jit_gene } private: - using Vmm = typename dnnl::impl::utils::conditional3::type; + using Vmm = + typename dnnl::impl::utils::conditional3::type; void generate() override { this->preamble(); -#define GET_OFF(field) offsetof(jit_move_scale_call_args, field) +# define GET_OFF(field) offsetof(jit_move_scale_call_args, field) mov(reg_in, ptr[reg_params + GET_OFF(p_in)]); mov(reg_out, ptr[reg_params + GET_OFF(p_out)]); mov(reg_work_amount, jcp_.input_size); @@ -107,7 +110,7 @@ struct jit_move_scale_kernel : public jit_uni_move_scale_kernel, public jit_gene if (jcp_.with_scales) { if (!jcp_.broadcast_scales) { load(vmm_scales, reg_scales, ov::element::f32, ov::element::f32, step, false); - add(reg_scales, sizeof(float) * step); + add(reg_scales, sizeof(float) * step); } uni_vmulps(vmm_in, vmm_in, vmm_scales); } @@ -119,25 +122,39 @@ struct jit_move_scale_kernel : public jit_uni_move_scale_kernel, public jit_gene add(reg_out_aux, jcp_.dst_prc.size() * step); } } -#undef GET_OFF - - inline void load(const Vmm& vmm_dst, const Xbyak::Reg64& reg_src, ov::element::Type src_prc, ov::element::Type dst_prc, const int& elt_num, bool fill) { +# undef GET_OFF + + inline void load(const Vmm& vmm_dst, + const Xbyak::Reg64& reg_src, + ov::element::Type src_prc, + ov::element::Type dst_prc, + const int& elt_num, + bool fill) { const auto seed = load_emitter_params(src_prc, dst_prc, elt_num, fill, "float_min").hash(); if (!emitters[seed]) { - emitters[seed].reset(new jit_load_emitter(this, isa, src_prc, dst_prc, elt_num, src_prc, fill, "float_min")); + emitters[seed].reset( + new jit_load_emitter(this, isa, src_prc, dst_prc, elt_num, src_prc, fill, "float_min")); } - emitters[seed]->emit_code({static_cast(reg_src.getIdx()), 0}, {static_cast(vmm_dst.getIdx())}, - pool_aux_vmm_idxs, pool_aux_gpr_idxs); + emitters[seed]->emit_code({static_cast(reg_src.getIdx()), 0}, + {static_cast(vmm_dst.getIdx())}, + pool_aux_vmm_idxs, + pool_aux_gpr_idxs); } - inline void store(const Xbyak::Reg64& reg_dst, const Vmm& vmm_src, ov::element::Type src_prc, ov::element::Type dst_prc, const int& elt_num) { + inline void store(const Xbyak::Reg64& reg_dst, + const Vmm& vmm_src, + ov::element::Type src_prc, + ov::element::Type dst_prc, + const int& elt_num) { const auto seed = store_emitter_params(src_prc, dst_prc, elt_num).hash(); if (!emitters[seed]) { emitters[seed].reset(new jit_store_emitter(this, isa, src_prc, dst_prc, elt_num)); } - emitters[seed]->emit_code({static_cast(vmm_src.getIdx())}, {static_cast(reg_dst.getIdx())}, - pool_aux_vmm_idxs, pool_aux_gpr_idxs); + emitters[seed]->emit_code({static_cast(vmm_src.getIdx())}, + {static_cast(reg_dst.getIdx())}, + pool_aux_vmm_idxs, + pool_aux_gpr_idxs); } size_t vec_size; @@ -155,13 +172,14 @@ struct jit_move_scale_kernel : public jit_uni_move_scale_kernel, public jit_gene Reg64 reg_work_amount = r14; Reg64 reg_params = abi_param1; - const std::vector pool_aux_gpr_idxs = { static_cast(rsi.getIdx()), static_cast(rbp.getIdx()) }; - const std::vector pool_aux_vmm_idxs = { static_cast(xmm_tmp.getIdx()) }; + const std::vector pool_aux_gpr_idxs = {static_cast(rsi.getIdx()), + static_cast(rbp.getIdx())}; + const std::vector pool_aux_vmm_idxs = {static_cast(xmm_tmp.getIdx())}; std::unordered_map> emitters; }; -#endif // OPENVINO_ARCH_X86_64 +#endif // 
OPENVINO_ARCH_X86_64 Interaction::Interaction(const std::shared_ptr& op, const GraphContext::CPtr context) : Node(op, context, NgraphShapeInferFactory(op)) { @@ -174,7 +192,7 @@ Interaction::Interaction(const std::shared_ptr& op, const GraphContext const std::vector& scales = interaction->get_output_scales(); if (!scales.empty()) { fqScales = scales; - outputDataType = interaction->get_output_element_type(0); + outputDataType = interaction->get_output_element_type(0); } } @@ -194,23 +212,12 @@ void Interaction::initSupportedPrimitiveDescriptors() { // initialize input ports std::vector inPortConfigs; for (size_t i = 0; i < getParentEdges().size(); ++i) { - inPortConfigs.emplace_back( - LayoutType::ncsp, - dataPrecision, - getInputShapeAtPort(i), - false, -1); + inPortConfigs.emplace_back(LayoutType::ncsp, dataPrecision, getInputShapeAtPort(i), false, -1); } // initialize output port std::vector outPortConfigs = { - PortConfigurator { - LayoutType::ncsp, - outputDataType, - getOutputShapeAtPort(0), - false, - -1 - } - }; - //add descriptor + PortConfigurator{LayoutType::ncsp, outputDataType, getOutputShapeAtPort(0), false, -1}}; + // add descriptor addSupportedPrimDesc(inPortConfigs, outPortConfigs, impl_desc_type::ref_any); } @@ -221,8 +228,7 @@ static inline void cat(uint8_t* out, size_t elemSize) { size_t offset = 0; for (size_t j = 0; j < feature_sizes.size(); j++) { - cpu_memcpy(out + offset * elemSize, in[j] + bs * feature_sizes[j] * elemSize, - feature_sizes[j] * elemSize); + cpu_memcpy(out + offset * elemSize, in[j] + bs * feature_sizes[j] * elemSize, feature_sizes[j] * elemSize); offset += feature_sizes[j]; } } @@ -303,8 +309,7 @@ void Interaction::prepareParams() { auto matmul_pd = matmul::primitive_desc(getEngine(), src_md, weights_md, dst_md, matmul_attr); prim = matmul(matmul_pd); featureSizes.assign(inputSizes, featureSize); - auto initMemoryPtr = [&](const ov::element::Type& prc, const intel_cpu::Shape& shape, - MemoryPtr& ptr) { + auto initMemoryPtr = [&](const ov::element::Type& prc, const intel_cpu::Shape& shape, MemoryPtr& ptr) { ptr = std::make_shared(getEngine(), intel_cpu::DnnlBlockedMemoryDesc(prc, shape)); }; initMemoryPtr(dataPrecision, intel_cpu::Shape{inputSizes, featureSize}, inputMemPtr); @@ -336,7 +341,7 @@ void Interaction::prepareParams() { moveFeatureKernel.reset(new jit_move_scale_kernel(jcp)); moveInteractKernel.reset(new jit_move_scale_kernel(interJcp)); } -#endif // OPENVINO_ARCH_X86_64 +#endif // OPENVINO_ARCH_X86_64 if (moveFeatureKernel && moveInteractKernel) { moveFeatureKernel->create_ker(); @@ -360,8 +365,7 @@ bool Interaction::isExecutable() const { return true; } -bool Interaction::isSupportedOperation(const std::shared_ptr& op, - std::string& errorMessage) noexcept { +bool Interaction::isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept { try { const auto interaction = std::dynamic_pointer_cast(op); if (!interaction) { @@ -374,7 +378,6 @@ bool Interaction::isSupportedOperation(const std::shared_ptr& op return true; } - -} // namespace node -} // namespace intel_cpu -} // namespace ov +} // namespace node +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/interaction.h b/src/plugins/intel_cpu/src/nodes/interaction.h index 448484a2512dd1..794ea0af24a87c 100644 --- a/src/plugins/intel_cpu/src/nodes/interaction.h +++ b/src/plugins/intel_cpu/src/nodes/interaction.h @@ -19,31 +19,31 @@ struct jit_move_scale_compile_params { }; struct jit_move_scale_call_args { - const void *p_in; - 
void *p_out; - const void *p_scales; + const void* p_in; + void* p_out; + const void* p_scales; }; struct jit_uni_move_scale_kernel { - void (*ker_)(const jit_move_scale_call_args*); + void (*ker_)(const jit_move_scale_call_args*); - void operator()(const jit_move_scale_call_args* call_args) { - assert(ker_); - ker_(call_args); - } + void operator()(const jit_move_scale_call_args* call_args) { + assert(ker_); + ker_(call_args); + } - explicit jit_uni_move_scale_kernel(const jit_move_scale_compile_params& jcp) : ker_(nullptr), jcp_(jcp) {} - virtual ~jit_uni_move_scale_kernel() {} + explicit jit_uni_move_scale_kernel(const jit_move_scale_compile_params& jcp) : ker_(nullptr), jcp_(jcp) {} + virtual ~jit_uni_move_scale_kernel() {} - virtual void create_ker() = 0; + virtual void create_ker() = 0; - jit_move_scale_compile_params jcp_; + jit_move_scale_compile_params jcp_; }; class Interaction : public Node { public: Interaction(const std::shared_ptr& op, const GraphContext::CPtr context); - void getSupportedDescriptors() override {}; + void getSupportedDescriptors() override{}; void initSupportedPrimitiveDescriptors() override; void execute(dnnl::stream strm) override; bool created() const override; @@ -74,6 +74,6 @@ class Interaction : public Node { std::unique_ptr moveInteractKernel; }; -} // namespace node -} // namespace intel_cpu -} // namespace ov +} // namespace node +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/interpolate.cpp b/src/plugins/intel_cpu/src/nodes/interpolate.cpp index 7eed5c1df9789b..beb53cb89a831e 100644 --- a/src/plugins/intel_cpu/src/nodes/interpolate.cpp +++ b/src/plugins/intel_cpu/src/nodes/interpolate.cpp @@ -4,6 +4,10 @@ #include "interpolate.h" +#include +#include +#include + #include "common/cpu_memcpy.h" #include "cpu/x64/injectors/jit_uni_depthwise_injector.hpp" #include "cpu/x64/injectors/jit_uni_eltwise_injector.hpp" @@ -21,16 +25,11 @@ #include "openvino/opsets/opset11.hpp" #include "openvino/opsets/opset4.hpp" #include "shape_inference/shape_inference.hpp" -#include "shape_inference/shape_inference_ngraph.hpp" #include "shape_inference/static_shape.hpp" #include "utils/bfloat16.hpp" #include "utils/cpu_utils.hpp" #include "utils/ngraph_utils.hpp" -#include -#include -#include - using namespace dnnl; using namespace dnnl::impl; @@ -39,7 +38,6 @@ using namespace dnnl::impl::cpu::x64; using namespace dnnl::impl::utils; using namespace Xbyak; - #define GET_OFF(field) offsetof(jit_interpolate_call_args, field) namespace ov { @@ -56,8 +54,9 @@ template struct jit_uni_interpolate_kernel_f32 : public jit_uni_interpolate_kernel, public jit_generator { DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_uni_interpolate_kernel_f32) - explicit jit_uni_interpolate_kernel_f32(jit_interpolate_config_params jcp, const dnnl_primitive_attr &attr) - : jit_uni_interpolate_kernel(jcp, attr), jit_generator(jit_name()) {} + explicit jit_uni_interpolate_kernel_f32(jit_interpolate_config_params jcp, const dnnl_primitive_attr& attr) + : jit_uni_interpolate_kernel(jcp, attr), + jit_generator(jit_name()) {} void create_ker() override { jit_generator::create_kernel(); @@ -70,23 +69,24 @@ struct jit_uni_interpolate_kernel_f32 : public jit_uni_interpolate_kernel, publi store_pool_gpr_idxs = {static_cast(reg_tmp_64.getIdx())}; store_pool_vec_idxs = {static_cast(vmm_zero.getIdx())}; - const auto &p = attr_.post_ops_; + const auto& p = attr_.post_ops_; for (int i = 0; i < p.len(); i++) { - auto &post_op = p.entry_[i]; + auto& post_op = p.entry_[i]; if 
(post_op.is_eltwise()) { - eltwise_injectors.push_back(std::make_shared>( - this, - post_op.eltwise.alg, - post_op.eltwise.alpha, - post_op.eltwise.beta, - 1.f)); + eltwise_injectors.push_back(std::make_shared>(this, + post_op.eltwise.alg, + post_op.eltwise.alpha, + post_op.eltwise.beta, + 1.f)); } else if (post_op.is_depthwise()) { - depthwise_injectors.push_back(std::make_shared>( - this, - post_op)); + depthwise_injectors.push_back(std::make_shared>(this, post_op)); } else if (post_op.is_quantization()) { - quantization_injectors.push_back(std::make_shared>( - this, post_op, vmm_d_weights, vmm_d_bias, reg_d_weights, reg_d_bias)); + quantization_injectors.push_back(std::make_shared>(this, + post_op, + vmm_d_weights, + vmm_d_bias, + reg_d_weights, + reg_d_bias)); } } @@ -99,81 +99,82 @@ struct jit_uni_interpolate_kernel_f32 : public jit_uni_interpolate_kernel, publi uni_vpxor(vmm_zero, vmm_zero, vmm_zero); switch (jcp_.mode) { - case InterpolateMode::nearest: { - mov(reg_dst, ptr[reg_params + GET_OFF(dst)]); - mov(reg_src, ptr[reg_params + GET_OFF(src_ptr[0])]); - mov(reg_index, ptr[reg_params + GET_OFF(index)]); - mov(reg_work_amount, ptr[reg_params + GET_OFF(work_amount)]); - - switch (jcp_.layout) { - case InterpolateLayoutType::planar: { - nn_planar(); - break; - } - case InterpolateLayoutType::block: { - nn_blk(); - break; - } - case InterpolateLayoutType::by_channel: { - nn_by_channel(); - break; - } - default: - assert(!"unsupported memory layout for interpolate layer with nearest neighbor mode."); - } + case InterpolateMode::nearest: { + mov(reg_dst, ptr[reg_params + GET_OFF(dst)]); + mov(reg_src, ptr[reg_params + GET_OFF(src_ptr[0])]); + mov(reg_index, ptr[reg_params + GET_OFF(index)]); + mov(reg_work_amount, ptr[reg_params + GET_OFF(work_amount)]); + + switch (jcp_.layout) { + case InterpolateLayoutType::planar: { + nn_planar(); break; } - case InterpolateMode::linear_onnx: { - switch (jcp_.layout) { - case InterpolateLayoutType::planar: { - linear_onnx_planar(); - break; - } - case InterpolateLayoutType::block: - case InterpolateLayoutType::by_channel: { - linear_onnx_c_gathered(); - break; - } - default: - assert(!"unsupported memory layout for interpolate layer with linear_onnx mode."); - } + case InterpolateLayoutType::block: { + nn_blk(); break; } - case InterpolateMode::cubic: { - switch (jcp_.layout) { - case InterpolateLayoutType::planar: { - cubic_planar(); - break; - } - case InterpolateLayoutType::block: - case InterpolateLayoutType::by_channel: { - cubic_c_gathered(); - break; - } - default: - assert(!"unsupported memory layout for interpolate layer with cubic mode."); - } + case InterpolateLayoutType::by_channel: { + nn_by_channel(); break; } - case InterpolateMode::bilinear_pillow: - case InterpolateMode::bicubic_pillow: { - switch (jcp_.layout) { - case InterpolateLayoutType::by_channel: { - pillow_by_channel(); - break; - } - default: - assert(!"unsupported memory layout for interpolate layer with bilinear_pillow and bicubic_pillow modes."); - } + default: + assert(!"unsupported memory layout for interpolate layer with nearest neighbor mode."); + } + break; + } + case InterpolateMode::linear_onnx: { + switch (jcp_.layout) { + case InterpolateLayoutType::planar: { + linear_onnx_planar(); break; } - case InterpolateMode::linear: { - assert(!"unsupported mode for interpolate layer with JITTED implimentation."); + case InterpolateLayoutType::block: + case InterpolateLayoutType::by_channel: { + linear_onnx_c_gathered(); break; } - default: { - assert(!"unsupported 
mode for interpolate layer."); + default: + assert(!"unsupported memory layout for interpolate layer with linear_onnx mode."); } + break; + } + case InterpolateMode::cubic: { + switch (jcp_.layout) { + case InterpolateLayoutType::planar: { + cubic_planar(); + break; + } + case InterpolateLayoutType::block: + case InterpolateLayoutType::by_channel: { + cubic_c_gathered(); + break; + } + default: + assert(!"unsupported memory layout for interpolate layer with cubic mode."); + } + break; + } + case InterpolateMode::bilinear_pillow: + case InterpolateMode::bicubic_pillow: { + switch (jcp_.layout) { + case InterpolateLayoutType::by_channel: { + pillow_by_channel(); + break; + } + default: + assert( + !"unsupported memory layout for interpolate layer with bilinear_pillow and bicubic_pillow modes."); + } + break; + } + case InterpolateMode::linear: { + assert(!"unsupported mode for interpolate layer with JITTED implimentation."); + break; + } + default: { + assert(!"unsupported mode for interpolate layer."); + } } this->postamble(); @@ -187,8 +188,8 @@ struct jit_uni_interpolate_kernel_f32 : public jit_uni_interpolate_kernel, publi } private: - using Vmm = typename conditional3::type; + using Vmm = + typename conditional3::type; const int vlen = cpu_isa_traits::vlen; const int vector_step = vlen / sizeof(float); @@ -217,7 +218,7 @@ struct jit_uni_interpolate_kernel_f32 : public jit_uni_interpolate_kernel, publi // for cubic planar Xbyak::Reg64 reg_tbl_y = rsi; Xbyak::Reg64 reg_tbl_x = rbp; - Xbyak::Reg64 reg_table = rdx; // do not need reg_index_offset in this mode, so use rdx + Xbyak::Reg64 reg_table = rdx; // do not need reg_index_offset in this mode, so use rdx Vmm vmm_val = Vmm(1); Vmm vmm_index = Vmm(0); @@ -293,14 +294,21 @@ struct jit_uni_interpolate_kernel_f32 : public jit_uni_interpolate_kernel, publi emit_load(reg_src, vmm_src, ov::element::f32, ov::element::f32, elt_num, offset); } - inline void emit_load(Xbyak::Reg64 reg_src, Vmm vmm_src, ov::element::Type src_prc, ov::element::Type dst_prc, const int elt_num, const int offset = 0) { + inline void emit_load(Xbyak::Reg64 reg_src, + Vmm vmm_src, + ov::element::Type src_prc, + ov::element::Type dst_prc, + const int elt_num, + const int offset = 0) { const auto seed = load_emitter_params(src_prc, dst_prc, elt_num).hash(); if (!emitters[seed]) { emitters[seed].reset(new jit_load_emitter(this, isa, src_prc, dst_prc, elt_num)); } emitters[seed]->emit_code({static_cast(reg_src.getIdx()), static_cast(offset)}, - {static_cast(vmm_src.getIdx())}, {}, {load_pool_gpr_idxs}); + {static_cast(vmm_src.getIdx())}, + {}, + {load_pool_gpr_idxs}); } inline void store(Vmm vmm_dst, Xbyak::Reg64 reg_dst, const int elt_num, const int offset = 0) { @@ -310,12 +318,15 @@ struct jit_uni_interpolate_kernel_f32 : public jit_uni_interpolate_kernel, publi } // for cases when Store emitter need 2 aux vmm we can use vmm_dst as second aux vmm - std::vector local_store_pool_vec_idxs = { static_cast(vmm_dst.getIdx()) }; - local_store_pool_vec_idxs.insert(local_store_pool_vec_idxs.begin(), store_pool_vec_idxs.begin(), store_pool_vec_idxs.end()); + std::vector local_store_pool_vec_idxs = {static_cast(vmm_dst.getIdx())}; + local_store_pool_vec_idxs.insert(local_store_pool_vec_idxs.begin(), + store_pool_vec_idxs.begin(), + store_pool_vec_idxs.end()); emitters[seed]->emit_code({static_cast(vmm_dst.getIdx())}, {static_cast(reg_dst.getIdx()), static_cast(offset)}, - {local_store_pool_vec_idxs}, {store_pool_gpr_idxs}); + {local_store_pool_vec_idxs}, + {store_pool_gpr_idxs}); } 
// kernel for OH * OW * C @@ -398,9 +409,10 @@ struct jit_uni_interpolate_kernel_f32 : public jit_uni_interpolate_kernel, publi } // if int, round if (!isFloatCompatible(jcp_.src_prc)) { - uni_vroundps(vmm_dst, vmm_dst, 0x0); // Round near + uni_vroundps(vmm_dst, vmm_dst, 0x0); // Round near } - // src_prc, dst_prc and buf ov::element::Type is the same, otherwise need another store with buf(src) precision + // src_prc, dst_prc and buf ov::element::Type is the same, otherwise need another store with + // buf(src) precision store(vmm_dst, reg_dst_aux, vector_step); add(reg_dst_aux, vector_step * jcp_.src_data_size); // advance 8/16 faciliate next block @@ -416,7 +428,7 @@ struct jit_uni_interpolate_kernel_f32 : public jit_uni_interpolate_kernel, publi uni_vfmadd231ps(vmm_dst, vmm_val, vmm_weight); } if (!isFloatCompatible(jcp_.src_prc)) { - uni_vroundps(vmm_dst, vmm_dst, 0x0); // Round near + uni_vroundps(vmm_dst, vmm_dst, 0x0); // Round near } store(vmm_dst, reg_dst_aux, tail_num); add(reg_dst_aux, tail_num * jcp_.src_data_size); @@ -448,7 +460,7 @@ struct jit_uni_interpolate_kernel_f32 : public jit_uni_interpolate_kernel, publi uni_vfmadd231ps(vmm_dst, vmm_val, vmm_weight); } if (!isFloatCompatible(jcp_.src_prc)) { - uni_vroundps(vmm_dst, vmm_dst, 0x0); // Round near + uni_vroundps(vmm_dst, vmm_dst, 0x0); // Round near } store(vmm_dst, reg_dst, vector_step); add(reg_dst, vector_step * jcp_.dst_data_size); @@ -464,7 +476,7 @@ struct jit_uni_interpolate_kernel_f32 : public jit_uni_interpolate_kernel, publi uni_vfmadd231ps(vmm_dst, vmm_val, vmm_weight); } if (!isFloatCompatible(jcp_.src_prc)) { - uni_vroundps(vmm_dst, vmm_dst, 0x0); // Round near + uni_vroundps(vmm_dst, vmm_dst, 0x0); // Round near } store(vmm_dst, reg_dst, tail_num); add(reg_dst, tail_num * jcp_.dst_data_size); @@ -496,7 +508,7 @@ struct jit_uni_interpolate_kernel_f32 : public jit_uni_interpolate_kernel, publi cmp(reg_work_amount_oh, 1); jl(out_loop_end, T_NEAR); - //reset work_amount to OW + // reset work_amount to OW mov(reg_work_amount, jcp_.OW); Xbyak::Reg64 reg_src_h = rsi; @@ -513,7 +525,7 @@ struct jit_uni_interpolate_kernel_f32 : public jit_uni_interpolate_kernel, publi Xbyak::Label nn_tail_loop_label; Xbyak::Label nn_tail_loop_end_label; - L(nn_loop_label); // inner loop + L(nn_loop_label); // inner loop { cmp(reg_work_amount, vector_step); jl(nn_loop_end_label, T_NEAR); @@ -553,9 +565,9 @@ struct jit_uni_interpolate_kernel_f32 : public jit_uni_interpolate_kernel, publi jmp(nn_tail_loop_label, T_NEAR); } - L(nn_tail_loop_end_label); // inner loop end + L(nn_tail_loop_end_label); // inner loop end - //increment index_h to next row + // increment index_h to next row add(reg_index_h, jcp_.indices_size); sub(reg_work_amount_oh, 1); @@ -621,7 +633,7 @@ struct jit_uni_interpolate_kernel_f32 : public jit_uni_interpolate_kernel, publi cmp(reg_work_amount_out, 1); jl(out_loop_end, T_NEAR); - //inner loop for C + // inner loop for C Xbyak::Label nn_loop_label; Xbyak::Label nn_loop_end_label; Xbyak::Label nn_tail_loop_label; @@ -717,10 +729,12 @@ struct jit_uni_interpolate_kernel_f32 : public jit_uni_interpolate_kernel, publi mov(reg_work_amount, ptr[reg_params + GET_OFF(work_amount)]); int blk = (isa == cpu::x64::sse41) ? (2 * vector_step) : vector_step; - int dst_stride = (jcp_.layout == InterpolateLayoutType::by_channel) ? (vector_step * jcp_.dst_data_size) : - (blk * jcp_.OW * jcp_.OH * jcp_.OD * jcp_.dst_data_size); - int src_stride = (jcp_.layout == InterpolateLayoutType::by_channel) ? 
(vector_step * jcp_.src_data_size) : - (blk * jcp_.IW * jcp_.IH * jcp_.ID * jcp_.src_data_size); + int dst_stride = (jcp_.layout == InterpolateLayoutType::by_channel) + ? (vector_step * jcp_.dst_data_size) + : (blk * jcp_.OW * jcp_.OH * jcp_.OD * jcp_.dst_data_size); + int src_stride = (jcp_.layout == InterpolateLayoutType::by_channel) + ? (vector_step * jcp_.src_data_size) + : (blk * jcp_.IW * jcp_.IH * jcp_.ID * jcp_.src_data_size); Xbyak::Label main_loop_label; Xbyak::Label main_loop_end_label; @@ -758,8 +772,10 @@ struct jit_uni_interpolate_kernel_f32 : public jit_uni_interpolate_kernel, publi // 2d for end depth linear_onnx_worker_2d(); // 3th dimension - uni_vmulps(vmm_valTR, vmm_valTR, vmm_weightE); // end_value * end_weight - uni_vfmadd231ps(vmm_valTR, vmm_d_bias, vmm_weightF); // start_value * start_weight + end_value * end_weight + uni_vmulps(vmm_valTR, vmm_valTR, vmm_weightE); // end_value * end_weight + uni_vfmadd231ps(vmm_valTR, + vmm_d_bias, + vmm_weightF); // start_value * start_weight + end_value * end_weight } if (attr_.post_ops_.len() != 0) { @@ -789,8 +805,10 @@ struct jit_uni_interpolate_kernel_f32 : public jit_uni_interpolate_kernel, publi // 2d for end depth linear_onnx_worker_2d(); // 3th dimension - uni_vmulps(vmm_valTR, vmm_valTR, vmm_weightE); // end_value * end_weight - uni_vfmadd231ps(vmm_valTR, vmm_d_bias, vmm_weightF); // start_value * start_weight + end_value * end_weight + uni_vmulps(vmm_valTR, vmm_valTR, vmm_weightE); // end_value * end_weight + uni_vfmadd231ps(vmm_valTR, + vmm_d_bias, + vmm_weightF); // start_value * start_weight + end_value * end_weight } if (attr_.post_ops_.len() != 0) { @@ -814,9 +832,9 @@ struct jit_uni_interpolate_kernel_f32 : public jit_uni_interpolate_kernel, publi add(reg_src_aux7, src_stride); } if (jcp_.layout == InterpolateLayoutType::by_channel) { - sub(reg_work_amount, vector_step); // work_amount is c + sub(reg_work_amount, vector_step); // work_amount is c } else { - sub(reg_work_amount, 1); // work_amount = div_up(c, blk), no tails + sub(reg_work_amount, 1); // work_amount = div_up(c, blk), no tails } jmp(main_loop_label, T_NEAR); @@ -844,8 +862,10 @@ struct jit_uni_interpolate_kernel_f32 : public jit_uni_interpolate_kernel, publi // 2d for end depth linear_onnx_worker_2d(); // 3th dimension - uni_vmulps(vmm_valTR, vmm_valTR, vmm_weightE); // end_value * end_weight - uni_vfmadd231ps(vmm_valTR, vmm_d_bias, vmm_weightF); // start_value * start_weight + end_value * end_weight + uni_vmulps(vmm_valTR, vmm_valTR, vmm_weightE); // end_value * end_weight + uni_vfmadd231ps(vmm_valTR, + vmm_d_bias, + vmm_weightF); // start_value * start_weight + end_value * end_weight } if (attr_.post_ops_.len() != 0) { @@ -930,8 +950,10 @@ struct jit_uni_interpolate_kernel_f32 : public jit_uni_interpolate_kernel, publi load_weights(reg_src_aux, vmm_weightE, vector_step, 5 * weight_stride); load_weights(reg_src_aux, vmm_weightF, vector_step, 4 * weight_stride); - uni_vmulps(vmm_valTR, vmm_valTR, vmm_weightE); // end_value * end_weight - uni_vfmadd231ps(vmm_valTR, vmm_d_bias, vmm_weightF); // start_value * start_weight + end_value * end_weight + uni_vmulps(vmm_valTR, vmm_valTR, vmm_weightE); // end_value * end_weight + uni_vfmadd231ps(vmm_valTR, + vmm_d_bias, + vmm_weightF); // start_value * start_weight + end_value * end_weight } if (attr_.post_ops_.len() != 0) { @@ -1014,8 +1036,10 @@ struct jit_uni_interpolate_kernel_f32 : public jit_uni_interpolate_kernel, publi load_weights(reg_src_aux, vmm_weightE, scalar_step, 5 * weight_stride); 
load_weights(reg_src_aux, vmm_weightF, scalar_step, 4 * weight_stride); - uni_vmulps(vmm_valTR, vmm_valTR, vmm_weightE); // end_value * end_weight - uni_vfmadd231ps(vmm_valTR, vmm_d_bias, vmm_weightF); // start_value * start_weight + end_value * end_weight + uni_vmulps(vmm_valTR, vmm_valTR, vmm_weightE); // end_value * end_weight + uni_vfmadd231ps(vmm_valTR, + vmm_d_bias, + vmm_weightF); // start_value * start_weight + end_value * end_weight } if (attr_.post_ops_.len() != 0) { @@ -1090,7 +1114,7 @@ struct jit_uni_interpolate_kernel_f32 : public jit_uni_interpolate_kernel, publi cubic_c_gathered_matrix(false); if (attr_.post_ops_.len() != 0) { - apply_post_ops(jcp_.dst_prc, false); // vmm_val is default dst value to post_ops and store + apply_post_ops(jcp_.dst_prc, false); // vmm_val is default dst value to post_ops and store add(reg_oc_off, vector_step * sizeof(float)); } store(vmm_val, reg_dst, vector_step); @@ -1118,7 +1142,7 @@ struct jit_uni_interpolate_kernel_f32 : public jit_uni_interpolate_kernel, publi int src_stride = vector_step * jcp_.src_data_size; add(reg_dst, dst_stride); add(reg_src, src_stride); - sub(reg_work_amount, vector_step); // work_amount is c + sub(reg_work_amount, vector_step); // work_amount is c } else { int dst_stride = blk * jcp_.OW * jcp_.OH * jcp_.dst_data_size; int src_stride = blk * jcp_.IW * jcp_.IH * jcp_.src_data_size; @@ -1143,7 +1167,7 @@ struct jit_uni_interpolate_kernel_f32 : public jit_uni_interpolate_kernel, publi cubic_c_gathered_matrix(true); if (attr_.post_ops_.len() != 0) { - apply_post_ops(jcp_.dst_prc, false); // vmm_val is default dst value + apply_post_ops(jcp_.dst_prc, false); // vmm_val is default dst value add(reg_oc_off, scalar_step * sizeof(float)); } store(vmm_val, reg_dst, scalar_step); @@ -1152,7 +1176,7 @@ struct jit_uni_interpolate_kernel_f32 : public jit_uni_interpolate_kernel, publi int src_stride = scalar_step * jcp_.src_data_size; add(reg_dst, dst_stride); add(reg_src, src_stride); - sub(reg_work_amount, scalar_step); // work_amount is c + sub(reg_work_amount, scalar_step); // work_amount is c jmp(tail_loop_label, T_NEAR); } @@ -1243,7 +1267,9 @@ struct jit_uni_interpolate_kernel_f32 : public jit_uni_interpolate_kernel, publi // build weightX used in y0-y3 // weight format: w0_0 w1_0 w2_0 w3_0 w0_1 w1_1 w2_1 w3_1 ... 
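    // A scalar sketch of the interleaved layout the weight-format comment above describes:
    // four cubic X weights stored back to back per output column (w0_x w1_x w2_x w3_x, then
    // the next column). The function name and the fixed four-tap count are illustrative,
    // matching the w0..w3 naming in the comment.
    inline void read_cubic_weights_x(const float* weight_x_table, int out_x, float (&w)[4]) {
        for (int k = 0; k < 4; ++k) {
            w[k] = weight_x_table[out_x * 4 + k];  // w0_x .. w3_x for this output column
        }
    }
    // Each gather below fetches one of these four weights for a whole vector of output
    // columns at once, stepping the base offset by sizeof(float) to move from w0 to w3.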
uni_vpcmpeqd(vmm_mask, vmm_mask, vmm_mask); - vgatherdps(vmm_weightX0, ptr[reg_weight_x + vmm_val * grid_len], vmm_mask); // 4 in vmm_val for weight_size, another 4 for grid_len + vgatherdps(vmm_weightX0, + ptr[reg_weight_x + vmm_val * grid_len], + vmm_mask); // 4 in vmm_val for weight_size, another 4 for grid_len uni_vpcmpeqd(vmm_mask, vmm_mask, vmm_mask); // shift weight_size then gather second weight @@ -1327,8 +1353,20 @@ struct jit_uni_interpolate_kernel_f32 : public jit_uni_interpolate_kernel, publi // gather weightX by input idx, used in y0-y3 gather_i32_indices(vmm_weightX0, reg_weight_x, 0, vmm_val, grid_len, ov::element::f32, true); gather_i32_indices(vmm_weightX1, reg_weight_x, sizeof(float), vmm_val, grid_len, ov::element::f32, true); - gather_i32_indices(vmm_weightX2, reg_weight_x, 2 * sizeof(float), vmm_val, grid_len, ov::element::f32, true); - gather_i32_indices(vmm_weightX3, reg_weight_x, 3 * sizeof(float), vmm_val, grid_len, ov::element::f32, true); + gather_i32_indices(vmm_weightX2, + reg_weight_x, + 2 * sizeof(float), + vmm_val, + grid_len, + ov::element::f32, + true); + gather_i32_indices(vmm_weightX3, + reg_weight_x, + 3 * sizeof(float), + vmm_val, + grid_len, + ov::element::f32, + true); // vmm_val is now relieved and used for dst_value uni_vpxor(vmm_val, vmm_val, vmm_val); @@ -1355,7 +1393,13 @@ struct jit_uni_interpolate_kernel_f32 : public jit_uni_interpolate_kernel, publi vpminsd(vmm_index_y_itr, vmm_index_y_itr, cubic_planar_table_val(1)); vpmaxsd(vmm_index_y_itr, vmm_index_y_itr, vmm_zero); // weight y2 - gather_i32_indices(vmm_weightY, reg_weight_y, 2 * sizeof(float), vmm_tbl_y, grid_len, ov::element::f32, true); + gather_i32_indices(vmm_weightY, + reg_weight_y, + 2 * sizeof(float), + vmm_tbl_y, + grid_len, + ov::element::f32, + true); cubic_planar_line(true); // y3 @@ -1365,7 +1409,13 @@ struct jit_uni_interpolate_kernel_f32 : public jit_uni_interpolate_kernel, publi vpminsd(vmm_index_y_itr, vmm_index_y_itr, cubic_planar_table_val(1)); vpmaxsd(vmm_index_y_itr, vmm_index_y_itr, vmm_zero); // weight y3 - gather_i32_indices(vmm_weightY, reg_weight_y, 3 * sizeof(float), vmm_tbl_y, grid_len, ov::element::f32, true); + gather_i32_indices(vmm_weightY, + reg_weight_y, + 3 * sizeof(float), + vmm_tbl_y, + grid_len, + ov::element::f32, + true); cubic_planar_line(true); if (attr_.post_ops_.len() != 0) { @@ -1454,8 +1504,13 @@ struct jit_uni_interpolate_kernel_f32 : public jit_uni_interpolate_kernel, publi } // always gather to Vmm, compute with Vmm, store with Xmm if scalar_step - inline void gather_i32_indices(Vmm vmm_src, const Xbyak::Reg64 &base, int offset, Vmm vmm_indices, int scale, - ov::element::Type src_prc, bool is_scalar) { + inline void gather_i32_indices(Vmm vmm_src, + const Xbyak::Reg64& base, + int offset, + Vmm vmm_indices, + int scale, + ov::element::Type src_prc, + bool is_scalar) { Xbyak::Address table_idx = ptr[base + offset + vmm_indices * scale]; if ((isa == cpu::x64::avx512_core) && !is_scalar) { // [0-15] bit of int to mask @@ -1484,8 +1539,8 @@ struct jit_uni_interpolate_kernel_f32 : public jit_uni_interpolate_kernel, publi int repeats = is_scalar ? 
1 : vlen / sizeof(float); for (int i = 0; i < repeats; ++i) { - mov(reg_tmp_64.cvt32(), ptr[rsp + i * sizeof(int)]); // sizeof(int) index_size - table_idx = ptr[base + offset + reg_tmp_64 * scale]; // scale: sizeof(float) value_size + mov(reg_tmp_64.cvt32(), ptr[rsp + i * sizeof(int)]); // sizeof(int) index_size + table_idx = ptr[base + offset + reg_tmp_64 * scale]; // scale: sizeof(float) value_size mov(reg_tmp_64.cvt32(), table_idx); mov(ptr[rsp + i * sizeof(int)], reg_tmp_64.cvt32()); } @@ -1498,9 +1553,10 @@ struct jit_uni_interpolate_kernel_f32 : public jit_uni_interpolate_kernel, publi } } - // is_broadcast for broadcasting param for depth_wise and quantize(channel-sensitive post-ops), for fusion with plain layout. + // is_broadcast for broadcasting param for depth_wise and quantize(channel-sensitive post-ops), for fusion with + // plain layout. void apply_post_ops(ov::element::Type dst_prc, bool is_broadcast) { - const auto &p = attr_.post_ops_; + const auto& p = attr_.post_ops_; int eltwise_inj_idx = 0; int depthwise_inj_idx = 0; int quantization_inj_idx = 0; @@ -1515,8 +1571,11 @@ struct jit_uni_interpolate_kernel_f32 : public jit_uni_interpolate_kernel, publi add(reg_d_weights, reg_oc_off); // weight and bias is padded. scalar as vector. - depthwise_injectors[depthwise_inj_idx]->compute_vector_range( - vmm_val.getIdx(), vmm_val.getIdx() + 1, reg_d_weights, reg_d_weights, is_broadcast); + depthwise_injectors[depthwise_inj_idx]->compute_vector_range(vmm_val.getIdx(), + vmm_val.getIdx() + 1, + reg_d_weights, + reg_d_weights, + is_broadcast); post_ops_data_offset += depthwise_injectors[depthwise_inj_idx]->memoryStep(); depthwise_inj_idx++; @@ -1526,15 +1585,25 @@ struct jit_uni_interpolate_kernel_f32 : public jit_uni_interpolate_kernel, publi int s_idx = vmm_val.getIdx(); - quantization_injectors[quantization_inj_idx]->init_crop_ptrs(reg_post_ops_data + post_ops_data_offset, reg_oc_off); + quantization_injectors[quantization_inj_idx]->init_crop_ptrs(reg_post_ops_data + post_ops_data_offset, + reg_oc_off); quantization_injectors[quantization_inj_idx]->compute_crop(s_idx, s_idx + 1, 0, 0, is_broadcast); - quantization_injectors[quantization_inj_idx]->init_input_scale_shift_ptrs(reg_post_ops_data + post_ops_data_offset, reg_oc_off); - quantization_injectors[quantization_inj_idx]->compute_input_scale_shift(s_idx, s_idx + 1, 0, do_rounding, 0, is_broadcast); + quantization_injectors[quantization_inj_idx]->init_input_scale_shift_ptrs( + reg_post_ops_data + post_ops_data_offset, + reg_oc_off); + quantization_injectors[quantization_inj_idx] + ->compute_input_scale_shift(s_idx, s_idx + 1, 0, do_rounding, 0, is_broadcast); if (do_dequantization) { - quantization_injectors[quantization_inj_idx]->init_output_scale_shift_ptrs(reg_post_ops_data + post_ops_data_offset, reg_oc_off); - quantization_injectors[quantization_inj_idx]->compute_output_scale_shift(s_idx, s_idx + 1, 0, 0, is_broadcast); + quantization_injectors[quantization_inj_idx]->init_output_scale_shift_ptrs( + reg_post_ops_data + post_ops_data_offset, + reg_oc_off); + quantization_injectors[quantization_inj_idx]->compute_output_scale_shift(s_idx, + s_idx + 1, + 0, + 0, + is_broadcast); } post_ops_data_offset += quantization_injectors[quantization_inj_idx]->memoryStep(); @@ -1544,7 +1613,7 @@ struct jit_uni_interpolate_kernel_f32 : public jit_uni_interpolate_kernel, publi } }; -#endif // OPENVINO_ARCH_X86_64 +#endif // OPENVINO_ARCH_X86_64 namespace { struct InterpolateKey { @@ -1586,7 +1655,7 @@ size_t InterpolateKey::hash() const { 
return seed; } -bool InterpolateKey::operator==(const InterpolateKey &rhs) const { +bool InterpolateKey::operator==(const InterpolateKey& rhs) const { if (nodeAttrs.mode != rhs.nodeAttrs.mode) return false; if (nodeAttrs.coordTransMode != rhs.nodeAttrs.coordTransMode) @@ -1620,7 +1689,7 @@ bool InterpolateKey::operator==(const InterpolateKey &rhs) const { return true; } -} // namespace +} // namespace // shapeND: n c d h w // blockND: ncdhw cdhw dhw hw w 1 @@ -1629,7 +1698,7 @@ inline VectorDims getBlockND(const VectorDims& shape) { int shapeRank = shape.size(); VectorDims blockND(shapeRank + 1, 1); for (int i = shapeRank - 1; i >= 0; i--) { - blockND[i] = shape[i] * blockND[i+1]; + blockND[i] = shape[i] * blockND[i + 1]; } return blockND; } @@ -1665,32 +1734,47 @@ using ngInterpShapeCalcMode = ov::op::v4::Interpolate::ShapeCalcMode; bool Interpolate::isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept { try { if (const auto interp = std::dynamic_pointer_cast(op)) { - const auto &interpAttr = interp->get_attrs(); - const auto &interpMode = interpAttr.mode; - if (!one_of(interpMode, ngInterpMode::NEAREST, ngInterpMode::LINEAR, ngInterpMode::LINEAR_ONNX, ngInterpMode::CUBIC)) { + const auto& interpAttr = interp->get_attrs(); + const auto& interpMode = interpAttr.mode; + if (!one_of(interpMode, + ngInterpMode::NEAREST, + ngInterpMode::LINEAR, + ngInterpMode::LINEAR_ONNX, + ngInterpMode::CUBIC)) { errorMessage = "Interpolate-4 does not support interpolate mode: " + ov::as_string(interpMode); return false; } - const auto &interpCoordTransMode = interpAttr.coordinate_transformation_mode; - if (!one_of(interpCoordTransMode, ngInterpCoordTransf::HALF_PIXEL, ngInterpCoordTransf::PYTORCH_HALF_PIXEL, ngInterpCoordTransf::ASYMMETRIC, - ngInterpCoordTransf::TF_HALF_PIXEL_FOR_NN, ngInterpCoordTransf::ALIGN_CORNERS)) { - errorMessage = "Interpolate-4 does not support coordinate transformation mode: " + ov::as_string(interpCoordTransMode); + const auto& interpCoordTransMode = interpAttr.coordinate_transformation_mode; + if (!one_of(interpCoordTransMode, + ngInterpCoordTransf::HALF_PIXEL, + ngInterpCoordTransf::PYTORCH_HALF_PIXEL, + ngInterpCoordTransf::ASYMMETRIC, + ngInterpCoordTransf::TF_HALF_PIXEL_FOR_NN, + ngInterpCoordTransf::ALIGN_CORNERS)) { + errorMessage = "Interpolate-4 does not support coordinate transformation mode: " + + ov::as_string(interpCoordTransMode); return false; } if (interpMode == ngInterpMode::NEAREST) { - const auto &interpNearestMode = interpAttr.nearest_mode; - if (!one_of(interpNearestMode, ngInterpNearMode::ROUND_PREFER_FLOOR, ngInterpNearMode::ROUND_PREFER_CEIL, ngInterpNearMode::FLOOR, - ngInterpNearMode::CEIL, ngInterpNearMode::SIMPLE)) { - errorMessage = "Interpolate-4 does not support nearest round mode: " + ov::as_string(interpNearestMode); + const auto& interpNearestMode = interpAttr.nearest_mode; + if (!one_of(interpNearestMode, + ngInterpNearMode::ROUND_PREFER_FLOOR, + ngInterpNearMode::ROUND_PREFER_CEIL, + ngInterpNearMode::FLOOR, + ngInterpNearMode::CEIL, + ngInterpNearMode::SIMPLE)) { + errorMessage = + "Interpolate-4 does not support nearest round mode: " + ov::as_string(interpNearestMode); return false; } } - const auto &interpShapeCalcMode = interpAttr.shape_calculation_mode; + const auto& interpShapeCalcMode = interpAttr.shape_calculation_mode; if (!one_of(interpShapeCalcMode, ngInterpShapeCalcMode::SCALES, ngInterpShapeCalcMode::SIZES)) { - errorMessage = "Interpolate-4 does not support shape_calculation_mode: " + 
ov::as_string(interpShapeCalcMode); + errorMessage = + "Interpolate-4 does not support shape_calculation_mode: " + ov::as_string(interpShapeCalcMode); return false; } @@ -1701,7 +1785,8 @@ bool Interpolate::isSupportedOperation(const std::shared_ptr& op } if (dataRank == 5 && interpMode == ngInterpMode::CUBIC) { - errorMessage = "Interpolate-4 doesn't support input tensor with rank: " + std::to_string(dataRank) + " for 'cubic' mode "; + errorMessage = "Interpolate-4 doesn't support input tensor with rank: " + std::to_string(dataRank) + + " for 'cubic' mode "; return false; } @@ -1711,21 +1796,22 @@ bool Interpolate::isSupportedOperation(const std::shared_ptr& op return false; } - if (interp->get_input_size() > 3 && - std::dynamic_pointer_cast(interp->get_input_node_shared_ptr(AXES_ID)) == nullptr) { + if (interp->get_input_size() > 3 && std::dynamic_pointer_cast( + interp->get_input_node_shared_ptr(AXES_ID)) == nullptr) { errorMessage = "Only const 'axes' input is supported in Interpolate-4"; return false; } } else if (const auto interp = std::dynamic_pointer_cast(op)) { - const auto &interpAttr = interp->get_attrs(); - const auto &interpMode = interpAttr.mode; + const auto& interpAttr = interp->get_attrs(); + const auto& interpMode = interpAttr.mode; if (!one_of(interpMode, ngInterpMode::BILINEAR_PILLOW, ngInterpMode::BICUBIC_PILLOW)) { errorMessage = "Interpolate-11 does not support interpolate mode: " + ov::as_string(interpMode); return false; } - const auto &interpShapeCalcMode = interpAttr.shape_calculation_mode; + const auto& interpShapeCalcMode = interpAttr.shape_calculation_mode; if (!one_of(interpShapeCalcMode, ngInterpShapeCalcMode::SCALES, ngInterpShapeCalcMode::SIZES)) { - errorMessage = "Interpolate-11 does not support shape_calculation_mode: " + ov::as_string(interpShapeCalcMode); + errorMessage = + "Interpolate-11 does not support shape_calculation_mode: " + ov::as_string(interpShapeCalcMode); return false; } const size_t dataRank = interp->get_input_partial_shape(DATA_ID).rank().get_length(); @@ -1735,12 +1821,12 @@ bool Interpolate::isSupportedOperation(const std::shared_ptr& op return false; } if (!isDynamicNgraphNode(op) && - !ov::is_type(op->get_input_node_ptr(SIZE_OR_SCALE_ID_V11))) { + !ov::is_type(op->get_input_node_ptr(SIZE_OR_SCALE_ID_V11))) { errorMessage = "Only const 'scales_or_sizes' input is supported for static shapes in Interpolate-11"; return false; } - if (interp->get_input_size() > 2 && - std::dynamic_pointer_cast(interp->get_input_node_shared_ptr(AXES_ID_V11)) == nullptr) { + if (interp->get_input_size() > 2 && std::dynamic_pointer_cast( + interp->get_input_node_shared_ptr(AXES_ID_V11)) == nullptr) { errorMessage = "Only const 'axes' input is supported in Interpolate-11"; return false; } @@ -1763,19 +1849,14 @@ class InterpolateShapeInferFactory : public ShapeInferFactory { public: InterpolateShapeInferFactory(std::shared_ptr op) : m_op(op) {} ShapeInferPtr makeShapeInfer() const override { - IShapeInfer::port_mask_t port_mask = 0x00; if (auto interp4 = ov::as_type_ptr(m_op)) { - const auto &attr = interp4->get_attrs(); - - if (attr.shape_calculation_mode == ngInterpShapeCalcMode::SCALES) { - port_mask = PortMask(Interpolate::SCALES_ID, Interpolate::AXES_ID); - } else if (attr.shape_calculation_mode == ngInterpShapeCalcMode::SIZES) { - port_mask = PortMask(Interpolate::TARGET_SHAPE_ID, Interpolate::AXES_ID); - } else { - OPENVINO_ASSERT(false, "Unsupported interpolate shape calculation mode"); - } + const auto& attr = interp4->get_attrs(); + const auto 
is_supported_mode = (attr.shape_calculation_mode == ngInterpShapeCalcMode::SCALES) || + (attr.shape_calculation_mode == ngInterpShapeCalcMode::SIZES); + OPENVINO_ASSERT(is_supported_mode, "Unsupported interpolate shape calculation mode"); + return make_shape_inference(m_op); } else if (auto interp11 = ov::as_type_ptr(m_op)) { - port_mask = PortMask(Interpolate::SIZE_OR_SCALE_ID_V11, Interpolate::AXES_ID_V11); + return make_shape_inference(m_op); } else { OPENVINO_THROW("Shape infer factory cannot be created for ", m_op->get_type_name(), @@ -1783,16 +1864,15 @@ class InterpolateShapeInferFactory : public ShapeInferFactory { m_op->get_friendly_name(), ", only versions 4 and 11 are supported."); } - return std::make_shared(make_shape_inference(m_op), port_mask); } private: std::shared_ptr m_op; }; -} // namespace +} // namespace Interpolate::Interpolate(const std::shared_ptr& op, const GraphContext::CPtr context) - : Node(op, context, InterpolateShapeInferFactory(op)) { + : Node(op, context, InterpolateShapeInferFactory(op)) { std::string errorMessage; if (isSupportedOperation(op, errorMessage)) { errorPrefix = "Interpolate node with name '" + getName() + "'"; @@ -1806,9 +1886,9 @@ Interpolate::Interpolate(const std::shared_ptr& op, const GraphContext OPENVINO_THROW(errorPrefix, " has incorrect number of output edges"); isAxesSpecified = numInputs != 3; - const auto &interpAttr = interp->get_attrs(); + const auto& interpAttr = interp->get_attrs(); - const auto &interpMode = interpAttr.mode; + const auto& interpMode = interpAttr.mode; if (interpMode == ngInterpMode::NEAREST) { interpAttrs.mode = InterpolateMode::nearest; } else if (interpMode == ngInterpMode::LINEAR) { @@ -1825,7 +1905,7 @@ Interpolate::Interpolate(const std::shared_ptr& op, const GraphContext OPENVINO_THROW(errorPrefix, " has unsupported interpolate mode"); } - const auto &interpCoordTransMode = interpAttr.coordinate_transformation_mode; + const auto& interpCoordTransMode = interpAttr.coordinate_transformation_mode; if (interpCoordTransMode == ngInterpCoordTransf::HALF_PIXEL) { interpAttrs.coordTransMode = InterpolateCoordTransMode::half_pixel; } else if (interpCoordTransMode == ngInterpCoordTransf::PYTORCH_HALF_PIXEL) { @@ -1841,7 +1921,7 @@ Interpolate::Interpolate(const std::shared_ptr& op, const GraphContext } if (interpAttrs.mode == InterpolateMode::nearest) { - const auto &interpNearestMode = interpAttr.nearest_mode; + const auto& interpNearestMode = interpAttr.nearest_mode; if (interpNearestMode == ngInterpNearMode::ROUND_PREFER_FLOOR) { interpAttrs.nearestMode = InterpolateNearestMode::round_prefer_floor; } else if (interpNearestMode == ngInterpNearMode::ROUND_PREFER_CEIL) { @@ -1860,7 +1940,7 @@ Interpolate::Interpolate(const std::shared_ptr& op, const GraphContext } interpAttrs.antialias = interpAttr.antialias; - const auto &interpShapeCalcMode = interpAttr.shape_calculation_mode; + const auto& interpShapeCalcMode = interpAttr.shape_calculation_mode; if (interpShapeCalcMode == ngInterpShapeCalcMode::SCALES) { interpAttrs.shapeCalcMode = InterpolateShapeCalcMode::scales; } else if (interpShapeCalcMode == ngInterpShapeCalcMode::SIZES) { @@ -1885,14 +1965,16 @@ Interpolate::Interpolate(const std::shared_ptr& op, const GraphContext interpAttrs.padEnd[i] = static_cast(interpAttr.pads_end[i]); } - const auto scalesNode = std::dynamic_pointer_cast(interp->get_input_node_shared_ptr(SCALES_ID)); + const auto scalesNode = + std::dynamic_pointer_cast(interp->get_input_node_shared_ptr(SCALES_ID)); if (scalesNode) { scales = 
scalesNode->cast_vector(); isScaleConstant = true; } if (isAxesSpecified) { - axes = std::dynamic_pointer_cast(interp->get_input_node_shared_ptr(AXES_ID))->cast_vector(); + axes = std::dynamic_pointer_cast(interp->get_input_node_shared_ptr(AXES_ID)) + ->cast_vector(); } else { axes.resize(dataRank); for (int i = 0; i < static_cast(dataRank); i++) { @@ -1908,13 +1990,13 @@ Interpolate::Interpolate(const std::shared_ptr& op, const GraphContext OPENVINO_THROW(errorPrefix, " has incorrect number of output edges"); isAxesSpecified = numInputs != 2; - const auto &interpAttr = interp->get_attrs(); - const auto &interpMode = interpAttr.mode; + const auto& interpAttr = interp->get_attrs(); + const auto& interpMode = interpAttr.mode; if (interpMode == ngInterpMode::BILINEAR_PILLOW) { interpAttrs.mode = InterpolateMode::bilinear_pillow; } else if (interpMode == ngInterpMode::BICUBIC_PILLOW) { interpAttrs.mode = InterpolateMode::bicubic_pillow; - interpAttrs.cubeCoeff = static_cast(interpAttr.cube_coeff); // fixed to be -0.5 + interpAttrs.cubeCoeff = static_cast(interpAttr.cube_coeff); // fixed to be -0.5 } else { OPENVINO_THROW(errorPrefix, " has unsupported interpolate mode"); } @@ -1923,10 +2005,11 @@ Interpolate::Interpolate(const std::shared_ptr& op, const GraphContext interpAttrs.coordTransMode = InterpolateCoordTransMode::tf_half_pixel_for_nn; interpAttrs.antialias = interpAttr.antialias; - const auto &interpShapeCalcMode = interpAttr.shape_calculation_mode; + const auto& interpShapeCalcMode = interpAttr.shape_calculation_mode; if (interpShapeCalcMode == ngInterpShapeCalcMode::SCALES) { interpAttrs.shapeCalcMode = InterpolateShapeCalcMode::scales; - const auto scalesNode = std::dynamic_pointer_cast(interp->get_input_node_shared_ptr(SIZE_OR_SCALE_ID_V11)); + const auto scalesNode = std::dynamic_pointer_cast( + interp->get_input_node_shared_ptr(SIZE_OR_SCALE_ID_V11)); if (scalesNode) { scales = scalesNode->cast_vector(); isScaleConstant = true; @@ -1954,7 +2037,9 @@ Interpolate::Interpolate(const std::shared_ptr& op, const GraphContext } if (isAxesSpecified) { - axes = std::dynamic_pointer_cast(interp->get_input_node_shared_ptr(AXES_ID_V11))->cast_vector(); + axes = std::dynamic_pointer_cast( + interp->get_input_node_shared_ptr(AXES_ID_V11)) + ->cast_vector(); if (dataRank == 4 && axes.size() == 2 && axes[0] == 1 && axes[1] == 2 && mayiuse(cpu::x64::sse41)) { NCHWAsNHWC = true; axes[0] = 2; @@ -1993,7 +2078,7 @@ void Interpolate::getSupportedDescriptors() { break; } } - //correct pad + // correct pad if (hasPad) { NCHWAsNHWC = false; auto correctPad = [&](std::vector pad, int rank) { @@ -2071,15 +2156,21 @@ void Interpolate::initSupportedPrimitiveDescriptors() { } } auto& creatorsMap = BlockedDescCreator::getCommonCreators(); - auto pushDesc = [&](LayoutType dataFormat, impl_desc_type implDetail, bool is_version11, bool useAclExecutor = false) { - config.inConfs[DATA_ID].setMemDesc(creatorsMap.at(dataFormat)->createSharedDesc(inputPrecision, getInputShapeAtPort(DATA_ID))); + auto pushDesc = [&](LayoutType dataFormat, + impl_desc_type implDetail, + bool is_version11, + bool useAclExecutor = false) { + config.inConfs[DATA_ID].setMemDesc( + creatorsMap.at(dataFormat)->createSharedDesc(inputPrecision, getInputShapeAtPort(DATA_ID))); if (is_version11) { if (interpAttrs.shapeCalcMode == InterpolateShapeCalcMode::sizes) { config.inConfs[SIZE_OR_SCALE_ID_V11].setMemDesc( - creatorsMap.at(LayoutType::ncsp)->createSharedDesc(targetShapeType, getInputShapeAtPort(SIZE_OR_SCALE_ID_V11))); + 
creatorsMap.at(LayoutType::ncsp) + ->createSharedDesc(targetShapeType, getInputShapeAtPort(SIZE_OR_SCALE_ID_V11))); } else { config.inConfs[SIZE_OR_SCALE_ID_V11].setMemDesc( - creatorsMap.at(LayoutType::ncsp)->createSharedDesc(scalesType, getInputShapeAtPort(SIZE_OR_SCALE_ID_V11))); + creatorsMap.at(LayoutType::ncsp) + ->createSharedDesc(scalesType, getInputShapeAtPort(SIZE_OR_SCALE_ID_V11))); } if (isAxesSpecified) @@ -2087,14 +2178,18 @@ void Interpolate::initSupportedPrimitiveDescriptors() { creatorsMap.at(LayoutType::ncsp)->createSharedDesc(axesType, getInputShapeAtPort(AXES_ID_V11))); } else { config.inConfs[TARGET_SHAPE_ID].setMemDesc( - creatorsMap.at(LayoutType::ncsp)->createSharedDesc(targetShapeType, getInputShapeAtPort(TARGET_SHAPE_ID))); - config.inConfs[get_scale_id()].setMemDesc(creatorsMap.at(LayoutType::ncsp)->createSharedDesc(scalesType, getInputShapeAtPort(get_scale_id()))); + creatorsMap.at(LayoutType::ncsp) + ->createSharedDesc(targetShapeType, getInputShapeAtPort(TARGET_SHAPE_ID))); + config.inConfs[get_scale_id()].setMemDesc( + creatorsMap.at(LayoutType::ncsp)->createSharedDesc(scalesType, getInputShapeAtPort(get_scale_id()))); if (isAxesSpecified) - config.inConfs[get_axis_id()].setMemDesc(creatorsMap.at(LayoutType::ncsp)->createSharedDesc(axesType, getInputShapeAtPort(get_axis_id()))); + config.inConfs[get_axis_id()].setMemDesc( + creatorsMap.at(LayoutType::ncsp)->createSharedDesc(axesType, getInputShapeAtPort(get_axis_id()))); } - config.outConfs[0].setMemDesc(creatorsMap.at(dataFormat)->createSharedDesc(outputPrecision, getOutputShapeAtPort(0))); + config.outConfs[0].setMemDesc( + creatorsMap.at(dataFormat)->createSharedDesc(outputPrecision, getOutputShapeAtPort(0))); if (useAclExecutor) { std::vector srcMemoryDescs; @@ -2106,8 +2201,11 @@ void Interpolate::initSupportedPrimitiveDescriptors() { dstMemoryDescs.push_back(config.outConfs[i].getMemDesc()); } - auto factory = std::make_shared(interpAttrs, srcMemoryDescs, dstMemoryDescs, - std::make_shared(context, getImplPriority())); + auto factory = std::make_shared( + interpAttrs, + srcMemoryDescs, + dstMemoryDescs, + std::make_shared(context, getImplPriority())); if (!factory->isEmpty()) { supportedPrimitiveDescriptors.push_back({config, implDetail, factory}); } @@ -2116,14 +2214,14 @@ void Interpolate::initSupportedPrimitiveDescriptors() { } }; if (is_version11) { -#if defined (OV_CPU_WITH_ACL) +#if defined(OV_CPU_WITH_ACL) interpAttrs.hasPad = hasPad; pushDesc(LayoutType::nspc, undef, true, true); pushDesc(LayoutType::ncsp, undef, true, true); canUseAclExecutor = !supportedPrimitiveDescriptors.empty(); if (canUseAclExecutor) return; - //fallback to f32 if ref is used + // fallback to f32 if ref is used inputPrecision = outputPrecision = ov::element::f32; #endif @@ -2147,17 +2245,17 @@ void Interpolate::initSupportedPrimitiveDescriptors() { } pushDesc(LayoutType::ncsp, ref, true); } else { - const auto &dataMinDims = getInputShapeAtPort(DATA_ID).getMinDims(); + const auto& dataMinDims = getInputShapeAtPort(DATA_ID).getMinDims(); bool isBlkApplied = dataRank > 1 && dataMinDims[1] != Shape::UNDEFINED_DIM && dataMinDims[1] > 1; -#if defined (OV_CPU_WITH_ACL) +#if defined(OV_CPU_WITH_ACL) interpAttrs.hasPad = hasPad; pushDesc(LayoutType::nspc, undef, false, true); pushDesc(LayoutType::ncsp, undef, false, true); canUseAclExecutor = !supportedPrimitiveDescriptors.empty(); if (canUseAclExecutor) return; - //fallback to f32 if ref is used + // fallback to f32 if ref is used inputPrecision = outputPrecision = 
ov::element::f32; #endif @@ -2202,7 +2300,7 @@ bool Interpolate::needShapeInfer() const { if (lastScales.empty()) { return true; } - const float *scales = getSrcDataAtPortAs(get_scale_id()); + const float* scales = getSrcDataAtPortAs(get_scale_id()); for (size_t i = 0; i < lastScales.size(); i++) { if (lastScales[i] != scales[i]) { return true; @@ -2212,7 +2310,7 @@ bool Interpolate::needShapeInfer() const { if (lastSizes.empty()) { return true; } - const int32_t *sizes = getSrcDataAtPortAs(TARGET_SHAPE_ID); + const int32_t* sizes = getSrcDataAtPortAs(TARGET_SHAPE_ID); for (size_t i = 0; i < lastSizes.size(); i++) { if (sizes[i] != lastSizes[i]) { return true; @@ -2226,12 +2324,12 @@ void Interpolate::executeDynamicImpl(dnnl::stream strm) { execute(strm); const size_t port = interpAttrs.shapeCalcMode == InterpolateShapeCalcMode::sizes ? TARGET_SHAPE_ID : get_scale_id(); - const auto &memory = getParentEdgeAt(port)->getMemory(); + const auto& memory = getParentEdgeAt(port)->getMemory(); if (interpAttrs.shapeCalcMode == InterpolateShapeCalcMode::scales) { - const float *scales = memory.getDataAs(); + const float* scales = memory.getDataAs(); lastScales.assign(scales, scales + memory.getDesc().getShape().getElementsCount()); } else { - const int32_t *sizes = memory.getDataAs(); + const int32_t* sizes = memory.getDataAs(); lastSizes.assign(sizes, sizes + memory.getDesc().getShape().getElementsCount()); } } @@ -2284,19 +2382,19 @@ void Interpolate::prepareParams() { OPENVINO_THROW(errorPrefix, " has undefined axes memory"); } - const NodeDesc *selected_pd = getSelectedPrimitiveDescriptor(); + const NodeDesc* selected_pd = getSelectedPrimitiveDescriptor(); if (selected_pd == nullptr) OPENVINO_THROW(errorPrefix, " did not set preferable primitive descriptor"); - const auto &srcDimsOrign = srcMemPtr->getStaticDims(); - const auto &dstDimsOrign = dstMemPtr->getStaticDims(); + const auto& srcDimsOrign = srcMemPtr->getStaticDims(); + const auto& dstDimsOrign = dstMemPtr->getStaticDims(); VectorDims srcDims = srcDimsOrign; VectorDims dstDims = dstDimsOrign; // layoutAlignment if (NCHWAsNHWC && srcMemPtr->getDesc().hasLayoutType(LayoutType::ncsp)) { - auto logicalShapeAlign = [] (VectorDims& Dims) { + auto logicalShapeAlign = [](VectorDims& Dims) { size_t C = Dims[3]; Dims[3] = Dims[2]; Dims[2] = Dims[1]; @@ -2315,7 +2413,8 @@ void Interpolate::prepareParams() { } } - std::vector dataScales = getScales(getPaddedInputShape(srcDims, interpAttrs.padBegin, interpAttrs.padEnd), dstDims); + std::vector dataScales = + getScales(getPaddedInputShape(srcDims, interpAttrs.padBegin, interpAttrs.padEnd), dstDims); if (!NCHWAsNHWC && (getOutputShapeAtPort(0).getRank() > 2 && (dataScales[0] != 1.f || dataScales[1] != 1.f))) { OPENVINO_THROW("Interpolate layer only supports resize on spatial dimensions(depth, height and width)"); } @@ -2331,7 +2430,10 @@ void Interpolate::prepareParams() { dstMemoryDescs.push_back(getDstMemoryAtPort(0)->getDescPtr()); auto selectedPD = getSelectedPrimitiveDescriptor(); - aclExecPtr = selectedPD->getExecutorFactoryAs()->makeExecutor(interpAttrs, srcMemoryDescs, dstMemoryDescs, {}); + aclExecPtr = selectedPD->getExecutorFactoryAs()->makeExecutor(interpAttrs, + srcMemoryDescs, + dstMemoryDescs, + {}); selectedPD->setImplementationType(aclExecPtr->getImplType()); return; @@ -2343,26 +2445,25 @@ void Interpolate::prepareParams() { auto buildExecutor = [&](const InterpolateKey& key) -> std::shared_ptr { std::shared_ptr executor; if ((key.nodeAttrs.mode == InterpolateMode::nearest || 
key.nodeAttrs.mode == InterpolateMode::linear_onnx || - key.nodeAttrs.mode == InterpolateMode::cubic) && + key.nodeAttrs.mode == InterpolateMode::cubic) && ((key.nodeAttrs.layout != InterpolateLayoutType::planar && mayiuse(cpu::x64::sse41)) || - (mayiuse(cpu::x64::avx2) && key.nodeAttrs.inPrc == ov::element::f32))) { + (mayiuse(cpu::x64::avx2) && key.nodeAttrs.inPrc == ov::element::f32))) { executor = std::make_shared(key.nodeAttrs, - key.srcDims, - key.dstDims, - key.dataScales, - key.attr); - } else if ((key.nodeAttrs.mode == InterpolateMode::bilinear_pillow || key.nodeAttrs.mode == InterpolateMode::bicubic_pillow) && - (key.nodeAttrs.layout == InterpolateLayoutType::by_channel)) { + key.srcDims, + key.dstDims, + key.dataScales, + key.attr); + } else if ((key.nodeAttrs.mode == InterpolateMode::bilinear_pillow || + key.nodeAttrs.mode == InterpolateMode::bicubic_pillow) && + (key.nodeAttrs.layout == InterpolateLayoutType::by_channel)) { executor = std::make_shared(key.nodeAttrs, - key.srcDims, - key.dstDims, - key.dataScales, - key.attr); + key.srcDims, + key.dstDims, + key.dataScales, + key.attr); } else { - executor = std::make_shared(key.nodeAttrs, - key.srcDims, - key.dstDims, - key.dataScales); + executor = + std::make_shared(key.nodeAttrs, key.srcDims, key.dstDims, key.dataScales); } return executor; }; @@ -2409,18 +2510,18 @@ static inline float triangleCoeff(float x) { return (std::max)(0.0f, 1 - std::abs(x)); } -void Interpolate::setPostOps(dnnl::primitive_attr &attr, const VectorDims &dims) { +void Interpolate::setPostOps(dnnl::primitive_attr& attr, const VectorDims& dims) { dnnl::post_ops ops; postOpsDataPtrs.clear(); - for (auto &node : fusedWith) { - auto* fakeQuantizeNode = dynamic_cast(node.get()); + for (auto& node : fusedWith) { + auto* fakeQuantizeNode = dynamic_cast(node.get()); if (fakeQuantizeNode) { fakeQuantizeNode->appendPostOps(ops, {}, postOpsDataPtrs); continue; } - auto* eltwiseNode = dynamic_cast(node.get()); + auto* eltwiseNode = dynamic_cast(node.get()); if (eltwiseNode) { eltwiseNode->appendPostOps(ops, dims, postOpsDataPtrs); continue; @@ -2436,9 +2537,9 @@ void Interpolate::setPostOps(dnnl::primitive_attr &attr, const VectorDims &dims) attr.set_post_ops(ops); } -VectorDims Interpolate::getPaddedInputShape(const VectorDims &srcDims, - const std::vector &padBegin, - const std::vector &padEnd) { +VectorDims Interpolate::getPaddedInputShape(const VectorDims& srcDims, + const std::vector& padBegin, + const std::vector& padEnd) { VectorDims paddedShape; int dataRank = srcDims.size(); for (int i = 0; i < dataRank; i++) { @@ -2450,18 +2551,21 @@ VectorDims Interpolate::getPaddedInputShape(const VectorDims &srcDims, // get scales of data rank size // if "scale" version: set scales with input scales, 1.f for other dims not in axis // if "size" version: scales = shape[target] / shape[input].pad, 1.f for other dims not in axis -// scales is a required input, but should not use input scales when "size" case, which may added eps or is a dummy value, recalculate scales instead. -std::vector Interpolate::getScales(const VectorDims &srcDimPad, const VectorDims &dstDim) { +// scales is a required input, but should not use input scales when "size" case, which may added eps or is a dummy +// value, recalculate scales instead. 
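For illustration, a minimal standalone sketch of the scale derivation described in the comment above, assuming a padded 1x3x50x80 NCHW input resized to 1x3x100x160 over axes {2, 3}; the shapes, axes and variable names here are illustrative only, not part of the patch:

#include <cstdio>
#include <vector>

int main() {
    // Padded source shape, target shape and resized axes (illustrative values).
    const std::vector<size_t> srcDimPad = {1, 3, 50, 80};
    const std::vector<size_t> dstDim = {1, 3, 100, 160};
    const std::vector<int> axes = {2, 3};

    // "sizes" branch: recompute the per-axis scale as dst / padded-src instead of
    // trusting the scales input; axes not listed keep the neutral scale 1.0f.
    std::vector<float> fullScales(srcDimPad.size(), 1.0f);
    for (int axis : axes) {
        fullScales[axis] = static_cast<float>(dstDim[axis]) / static_cast<float>(srcDimPad[axis]);
    }

    for (float s : fullScales) {
        std::printf("%.2f ", s);  // prints: 1.00 1.00 2.00 2.00
    }
    std::printf("\n");
    return 0;
}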
+std::vector Interpolate::getScales(const VectorDims& srcDimPad, const VectorDims& dstDim) { std::vector fullScales(dataRank, 1.f); const size_t axesRank = axes.size(); for (size_t i = 0; i < axesRank; i++) { int axis = axes[i]; // pillow always re-generate scales with input and output shape - if (interpAttrs.mode == InterpolateMode::bilinear_pillow || interpAttrs.mode == InterpolateMode::bicubic_pillow) { + if (interpAttrs.mode == InterpolateMode::bilinear_pillow || + interpAttrs.mode == InterpolateMode::bicubic_pillow) { fullScales[axis] = static_cast(dstDim[axis]) / static_cast(srcDimPad[axis]); } else { - fullScales[axis] = (interpAttrs.shapeCalcMode == InterpolateShapeCalcMode::scales) ? scales[i] : - static_cast(dstDim[axis]) / static_cast(srcDimPad[axis]); + fullScales[axis] = (interpAttrs.shapeCalcMode == InterpolateShapeCalcMode::scales) + ? scales[i] + : static_cast(dstDim[axis]) / static_cast(srcDimPad[axis]); } } return fullScales; @@ -2472,12 +2576,12 @@ void Interpolate::execute(dnnl::stream strm) { auto srcMemPtr = getSrcMemoryAtPort(DATA_ID); if (execPtr) { - uint8_t *dst_data = dstMemPtr->getDataAs(); - const uint8_t *src_data_origin = srcMemPtr->getDataAs(); - const uint8_t *src_data = nullptr; + uint8_t* dst_data = dstMemPtr->getDataAs(); + const uint8_t* src_data_origin = srcMemPtr->getDataAs(); + const uint8_t* src_data = nullptr; std::vector srcPadded; if (hasPad) { - const auto &srcDim = srcMemPtr->getStaticDims(); + const auto& srcDim = srcMemPtr->getStaticDims(); auto srcDimPad = execPtr->getSrcDimPad5d(); size_t dimSize = srcDim.size(); @@ -2496,23 +2600,34 @@ void Interpolate::execute(dnnl::stream strm) { if (interpAttrs.layout == InterpolateLayoutType::planar) { srcPadded.resize(inShapePadBlock[0] * srcDataSize, 0); - uint8_t *src_data_pad = static_cast(&srcPadded[0]); + uint8_t* src_data_pad = static_cast(&srcPadded[0]); parallel_for4d(srcDim5d[0], srcDim5d[1], srcDim5d[2], srcDim5d[3], [&](int n, int c, int d, int h) { - const uint8_t *src = src_data_origin + - (inShapeBlock[1] * n + inShapeBlock[2] * c + inShapeBlock[3] * d + inShapeBlock[4] * h) * srcDataSize; - uint8_t *srcPad = src_data_pad + (inShapePadBlock[1] * (n + padB0) + inShapePadBlock[2] * (c + padB1) + - inShapePadBlock[3] * (d + padB2) + inShapePadBlock[4] * (h + padB3) + padB4) * srcDataSize; + const uint8_t* src = src_data_origin + (inShapeBlock[1] * n + inShapeBlock[2] * c + + inShapeBlock[3] * d + inShapeBlock[4] * h) * + srcDataSize; + uint8_t* srcPad = + src_data_pad + (inShapePadBlock[1] * (n + padB0) + inShapePadBlock[2] * (c + padB1) + + inShapePadBlock[3] * (d + padB2) + inShapePadBlock[4] * (h + padB3) + padB4) * + srcDataSize; cpu_memcpy(srcPad, src, srcDim5d[4] * srcDataSize); }); src_data = src_data_pad; } else if (interpAttrs.layout == InterpolateLayoutType::by_channel) { srcPadded.resize(inShapePadBlock[0] * srcDataSize, 0); - uint8_t *src_data_pad = static_cast(&srcPadded[0]); + uint8_t* src_data_pad = static_cast(&srcPadded[0]); parallel_for4d(srcDim5d[0], srcDim5d[2], srcDim5d[3], srcDim5d[4], [&](int n, int d, int h, int w) { - const uint8_t *src = src_data_origin + (inShapeBlock[1] * n + - (inShapeBlock[3] * d + inShapeBlock[4] * h + inShapeBlock[5] * w) * srcDim5d[1]) * srcDataSize; - uint8_t *srcPad = src_data_pad + (inShapePadBlock[1] * (n + padB0) + (inShapePadBlock[3] * (d + padB2) + - inShapePadBlock[4] * (h + padB3) + inShapePadBlock[5] * (w + padB4)) * srcDimPad5d[1] + padB1) * srcDataSize; + const uint8_t* src = + src_data_origin + + (inShapeBlock[1] * n + + 
(inShapeBlock[3] * d + inShapeBlock[4] * h + inShapeBlock[5] * w) * srcDim5d[1]) * + srcDataSize; + uint8_t* srcPad = + src_data_pad + (inShapePadBlock[1] * (n + padB0) + + (inShapePadBlock[3] * (d + padB2) + inShapePadBlock[4] * (h + padB3) + + inShapePadBlock[5] * (w + padB4)) * + srcDimPad5d[1] + + padB1) * + srcDataSize; cpu_memcpy(srcPad, src, srcDim5d[1] * srcDataSize); }); src_data = src_data_pad; @@ -2521,25 +2636,34 @@ void Interpolate::execute(dnnl::stream strm) { size_t CB = div_up(srcDimPad5d[1], blkSize); size_t eltsTotal = srcDimPad5d[0] * CB * srcDimPad5d[2] * srcDimPad5d[3] * srcDimPad5d[4] * blkSize; srcPadded.resize(eltsTotal * srcDataSize, 0x0); - uint8_t *src_data_pad = static_cast(&srcPadded[0]); + uint8_t* src_data_pad = static_cast(&srcPadded[0]); if ((srcDim5d[0] != srcDimPad5d[0]) || (srcDim5d[1] != srcDimPad5d[1])) { OPENVINO_THROW("Interpolate layer with name '", getName(), "' does not support padding on batch and channel dimensions"); } - parallel_for5d(srcDim5d[0], CB, srcDim5d[2], srcDim5d[3], srcDim5d[4], [&](int n, int cb, int d, int h, int w) { - const uint8_t *src = src_data_origin + (n * CB * srcDim5d[2] * srcDim5d[3] * srcDim5d[4] * blkSize) * srcDataSize - + (cb * srcDim5d[2] * srcDim5d[3] * srcDim5d[4] * blkSize) * srcDataSize - + (d * srcDim5d[3] * srcDim5d[4] * blkSize) * srcDataSize - + (h * srcDim5d[4] * blkSize) * srcDataSize - + (w * blkSize) * srcDataSize; - uint8_t *srcPad = src_data_pad + (n * CB * srcDimPad5d[2] * srcDimPad5d[3] * srcDimPad5d[4] * blkSize) * srcDataSize - + (cb * srcDimPad5d[2] * srcDimPad5d[3] * srcDimPad5d[4] * blkSize) * srcDataSize - + ((d + padB2) * srcDimPad5d[3] * srcDimPad5d[4] * blkSize) * srcDataSize - + ((h + padB3) * srcDimPad5d[4] * blkSize) * srcDataSize - + ((w + padB4) * blkSize) * srcDataSize; - cpu_memcpy(srcPad, src, blkSize * srcDataSize); - }); + parallel_for5d(srcDim5d[0], + CB, + srcDim5d[2], + srcDim5d[3], + srcDim5d[4], + [&](int n, int cb, int d, int h, int w) { + const uint8_t* src = + src_data_origin + + (n * CB * srcDim5d[2] * srcDim5d[3] * srcDim5d[4] * blkSize) * srcDataSize + + (cb * srcDim5d[2] * srcDim5d[3] * srcDim5d[4] * blkSize) * srcDataSize + + (d * srcDim5d[3] * srcDim5d[4] * blkSize) * srcDataSize + + (h * srcDim5d[4] * blkSize) * srcDataSize + (w * blkSize) * srcDataSize; + uint8_t* srcPad = + src_data_pad + + (n * CB * srcDimPad5d[2] * srcDimPad5d[3] * srcDimPad5d[4] * blkSize) * + srcDataSize + + (cb * srcDimPad5d[2] * srcDimPad5d[3] * srcDimPad5d[4] * blkSize) * srcDataSize + + ((d + padB2) * srcDimPad5d[3] * srcDimPad5d[4] * blkSize) * srcDataSize + + ((h + padB3) * srcDimPad5d[4] * blkSize) * srcDataSize + + ((w + padB4) * blkSize) * srcDataSize; + cpu_memcpy(srcPad, src, blkSize * srcDataSize); + }); src_data = src_data_pad; } } else { @@ -2556,26 +2680,35 @@ void Interpolate::execute(dnnl::stream strm) { // for ndhwc and nCdhw8c[16c] // input may be f32/bf16/int8, fused->output varies -void Interpolate::InterpolateJitExecutor::NNCGathered(const uint8_t *in_ptr_, uint8_t *out_ptr_, const void *post_ops_data_, - int B, int C, int ID, int IH, int IW, int OD, int OH, int OW) { - int *index_d = static_cast(&auxTable[0]); - int *index_h = static_cast(&auxTable[OD]); - int *index_w = static_cast(&auxTable[OD + OH]); +void Interpolate::InterpolateJitExecutor::NNCGathered(const uint8_t* in_ptr_, + uint8_t* out_ptr_, + const void* post_ops_data_, + int B, + int C, + int ID, + int IH, + int IW, + int OD, + int OH, + int OW) { + int* index_d = static_cast(&auxTable[0]); + int* index_h = 
static_cast(&auxTable[OD]); + int* index_w = static_cast(&auxTable[OD + OH]); bool is_nhwc = (configured_for_layout == by_channel); for (int b = 0; b < B; b++) { if (is_nhwc) { - const uint8_t *in_ptr = in_ptr_ + (IW * IH * ID * C * b) * srcDataSize; - uint8_t *out_ptr = out_ptr_ + (OW * OH * OD * C * b) * dstDataSize; + const uint8_t* in_ptr = in_ptr_ + (IW * IH * ID * C * b) * srcDataSize; + uint8_t* out_ptr = out_ptr_ + (OW * OH * OD * C * b) * dstDataSize; std::vector index_w_kernel(OW); for (int ox = 0; ox < OW; ox++) { index_w_kernel[ox] = index_w[ox] * C * srcDataSize; } parallel_for2d(OD, OH, [&](size_t d, size_t h) { // kernel for C * OW - uint8_t *out_ptr_dh = out_ptr + (C * OW * OH * d + C * OW * h) * dstDataSize; - const uint8_t *in_ptr_dh = in_ptr + (C * IW * IH * index_d[d] + C * IW * index_h[h]) * srcDataSize; + uint8_t* out_ptr_dh = out_ptr + (C * OW * OH * d + C * OW * h) * dstDataSize; + const uint8_t* in_ptr_dh = in_ptr + (C * IW * IH * index_d[d] + C * IW * index_h[h]) * srcDataSize; auto arg = jit_interpolate_call_args(); arg.dst = out_ptr_dh; arg.src_ptr[0] = in_ptr_dh; @@ -2588,15 +2721,16 @@ void Interpolate::InterpolateJitExecutor::NNCGathered(const uint8_t *in_ptr_, ui } else { // for blk int blk_size = mayiuse(cpu::x64::avx512_core) ? 16 : 8; int CB = div_up(C, blk_size); - const uint8_t *in_ptr = in_ptr_ + (IW * IH * ID * CB * blk_size * b) * srcDataSize; - uint8_t *out_ptr = out_ptr_ + (OW * OH * OD * CB * blk_size * b) * dstDataSize; + const uint8_t* in_ptr = in_ptr_ + (IW * IH * ID * CB * blk_size * b) * srcDataSize; + uint8_t* out_ptr = out_ptr_ + (OW * OH * OD * CB * blk_size * b) * dstDataSize; std::vector index_w_kernel(OW); for (int ox = 0; ox < OW; ox++) { index_w_kernel[ox] = index_w[ox] * blk_size * srcDataSize; } parallel_for2d(CB, OD, [&](size_t cb, size_t d) { - uint8_t *out_ptr_cbd = out_ptr + (blk_size * OW * OH * OD * cb + blk_size * OW * OH * d) * dstDataSize; - const uint8_t *in_ptr_cbd = in_ptr + (blk_size * IW * IH * ID * cb + blk_size * IW * IH * index_d[d]) * srcDataSize; + uint8_t* out_ptr_cbd = out_ptr + (blk_size * OW * OH * OD * cb + blk_size * OW * OH * d) * dstDataSize; + const uint8_t* in_ptr_cbd = + in_ptr + (blk_size * IW * IH * ID * cb + blk_size * IW * IH * index_d[d]) * srcDataSize; auto arg = jit_interpolate_call_args(); for (int h = 0; h < OH; h++) { // kernel for blk_size * OW arg.dst = out_ptr_cbd + blk_size * OW * h * dstDataSize; @@ -2612,11 +2746,20 @@ void Interpolate::InterpolateJitExecutor::NNCGathered(const uint8_t *in_ptr_, ui } // batch end } -void Interpolate::InterpolateJitExecutor::NNPlanar(const uint8_t *in_ptr_, uint8_t *out_ptr_, const void *post_ops_data_, - int B, int C, int ID, int IH, int IW, int OD, int OH, int OW) { - int *index_d = static_cast(&auxTable[0]); - int *index_h = static_cast(&auxTable[OD]); - int *index_w = static_cast(&auxTable[OD + OH]); +void Interpolate::InterpolateJitExecutor::NNPlanar(const uint8_t* in_ptr_, + uint8_t* out_ptr_, + const void* post_ops_data_, + int B, + int C, + int ID, + int IH, + int IW, + int OD, + int OH, + int OW) { + int* index_d = static_cast(&auxTable[0]); + int* index_h = static_cast(&auxTable[OD]); + int* index_w = static_cast(&auxTable[OD + OH]); std::vector index_kernel(OH + OW); // index_h * IW * srcDataSize to reduce and simplify redundant compute @@ -2629,13 +2772,15 @@ void Interpolate::InterpolateJitExecutor::NNPlanar(const uint8_t *in_ptr_, uint8 } parallel_for3d(B, C, OD, [&](size_t b, size_t c, size_t od) { - const uint8_t *in_ptr = in_ptr_ + (IW * 
IH * ID * C * b + IW * IH * ID * c + IW * IH * index_d[od]) * srcDataSize; - uint8_t *out_ptr = out_ptr_ + (OW * OH * OD * C * b + OW * OH * OD * c + OW * OH * od) * dstDataSize; + const uint8_t* in_ptr = + in_ptr_ + (IW * IH * ID * C * b + IW * IH * ID * c + IW * IH * index_d[od]) * srcDataSize; + uint8_t* out_ptr = out_ptr_ + (OW * OH * OD * C * b + OW * OH * OD * c + OW * OH * od) * dstDataSize; auto arg = jit_interpolate_call_args(); arg.src_ptr[0] = in_ptr; arg.dst = out_ptr; - arg.index = static_cast(&index_kernel[0]); // need index_h and index_w in kernel, it's in continous memory so one param + arg.index = static_cast( + &index_kernel[0]); // need index_h and index_w in kernel, it's in continous memory so one param arg.oc_off = static_cast(c * sizeof(float)); // work_amount is OH(out loop) and OW(inner loop), can get in kernel from jcp. arg.post_op_data = post_ops_data_; @@ -2643,18 +2788,27 @@ void Interpolate::InterpolateJitExecutor::NNPlanar(const uint8_t *in_ptr_, uint8 }); } -void Interpolate::InterpolateJitExecutor::linearOnnxPlanar(const uint8_t *in_ptr_, uint8_t *out_ptr_, const void *post_ops_data_, int B, int C, - int ID, int IH, int IW, int OD, int OH, int OW) { - // FrontTopLeft:0, FrontTopRight:1, FrontBottomLeft:2, FrontBottomRight:3, EndTopLeft:4, EndTopRight:5, EndBottomLeft:6, EndBottomRight:7 - // weight: Left:0, ritht:1, top:2, bottom:3, front:4, end:5 - int *index = static_cast(&auxTable[0]); +void Interpolate::InterpolateJitExecutor::linearOnnxPlanar(const uint8_t* in_ptr_, + uint8_t* out_ptr_, + const void* post_ops_data_, + int B, + int C, + int ID, + int IH, + int IW, + int OD, + int OH, + int OW) { + // FrontTopLeft:0, FrontTopRight:1, FrontBottomLeft:2, FrontBottomRight:3, EndTopLeft:4, EndTopRight:5, + // EndBottomLeft:6, EndBottomRight:7 weight: Left:0, ritht:1, top:2, bottom:3, front:4, end:5 + int* index = static_cast(&auxTable[0]); int eltInGrid = (spatialDimSize > 2) ? MAX_INPUT_INTERPOLATE : ((spatialDimSize > 1) ? 
4 : 2); int scratchLen = rnd_up(eltInGrid * OW * OH * OD, 16); - float *weight = reinterpret_cast(&auxTable[scratchLen]); + float* weight = reinterpret_cast(&auxTable[scratchLen]); parallel_for2d(B, C, [&](size_t b, size_t c) { - uint8_t *out_ptr_nc = out_ptr_ + (OH * OW * OD * C * b + OH * OW * OD * c) * dstDataSize; - const uint8_t *in_ptr_nc = in_ptr_ + (IH * IW * ID * C * b + IH * IW * ID * c) * srcDataSize; + uint8_t* out_ptr_nc = out_ptr_ + (OH * OW * OD * C * b + OH * OW * OD * c) * dstDataSize; + const uint8_t* in_ptr_nc = in_ptr_ + (IH * IW * ID * C * b + IH * IW * ID * c) * srcDataSize; auto arg = jit_interpolate_call_args(); arg.src_ptr[0] = in_ptr_nc; arg.index = static_cast(&index[0]); @@ -2667,8 +2821,17 @@ void Interpolate::InterpolateJitExecutor::linearOnnxPlanar(const uint8_t *in_ptr }); } -void Interpolate::InterpolateJitExecutor::linearOnnxCGathered(const uint8_t *in_ptr_, uint8_t *out_ptr_, const void *post_ops_data_, - int B, int C, int ID, int IH, int IW, int OD, int OH, int OW) { +void Interpolate::InterpolateJitExecutor::linearOnnxCGathered(const uint8_t* in_ptr_, + uint8_t* out_ptr_, + const void* post_ops_data_, + int B, + int C, + int ID, + int IH, + int IW, + int OD, + int OH, + int OW) { // left:OW right:OW Top:OH Bottom:OH Front:OD End:OD std::vector indexPtr(MAX_INPUT_INTERPOLATE, 0); std::vector weightPtr(MAX_INPUT_INTERPOLATE, 0); @@ -2703,18 +2866,18 @@ void Interpolate::InterpolateJitExecutor::linearOnnxCGathered(const uint8_t *in_ int I2 = ID * I1; int I3 = CB * I2; parallel_for3d(B, OD, OH, [&](size_t b, size_t d, size_t h) { - uint8_t *out_ptr_ndh = out_ptr_ + (C3 * b + C1 * d + C0 * h) * dstDataSize; - - const uint8_t *in_ptr_n = in_ptr_ + (I3 * b) * srcDataSize; - const uint8_t *in_ptr_nf = in_ptr_n + (indexPtr[4][d] * I1) * srcDataSize; - const uint8_t *in_ptr_nft = in_ptr_nf + (indexPtr[2][h] * I0) * srcDataSize; - const uint8_t *in_ptr_nfb = in_ptr_nf + (indexPtr[3][h] * I0) * srcDataSize; - const uint8_t *in_ptr_ne = in_ptr_n + (indexPtr[5][d] * I1) * srcDataSize; - const uint8_t *in_ptr_net = in_ptr_ne + (indexPtr[2][h] * I0) * srcDataSize; - const uint8_t *in_ptr_neb = in_ptr_ne + (indexPtr[3][h] * I0) * srcDataSize; + uint8_t* out_ptr_ndh = out_ptr_ + (C3 * b + C1 * d + C0 * h) * dstDataSize; + + const uint8_t* in_ptr_n = in_ptr_ + (I3 * b) * srcDataSize; + const uint8_t* in_ptr_nf = in_ptr_n + (indexPtr[4][d] * I1) * srcDataSize; + const uint8_t* in_ptr_nft = in_ptr_nf + (indexPtr[2][h] * I0) * srcDataSize; + const uint8_t* in_ptr_nfb = in_ptr_nf + (indexPtr[3][h] * I0) * srcDataSize; + const uint8_t* in_ptr_ne = in_ptr_n + (indexPtr[5][d] * I1) * srcDataSize; + const uint8_t* in_ptr_net = in_ptr_ne + (indexPtr[2][h] * I0) * srcDataSize; + const uint8_t* in_ptr_neb = in_ptr_ne + (indexPtr[3][h] * I0) * srcDataSize; auto arg = jit_interpolate_call_args(); for (int w = 0; w < OW; ++w) { - uint8_t *out_ptr_ndhw = out_ptr_ndh + CGatherLen * w * dstDataSize; + uint8_t* out_ptr_ndhw = out_ptr_ndh + CGatherLen * w * dstDataSize; arg.src_ptr[0] = in_ptr_nft + (indexPtr[0][w] * CGatherLen) * srcDataSize; arg.src_ptr[1] = in_ptr_nft + (indexPtr[1][w] * CGatherLen) * srcDataSize; @@ -2739,13 +2902,20 @@ void Interpolate::InterpolateJitExecutor::linearOnnxCGathered(const uint8_t *in_ }); } -void Interpolate::InterpolateJitExecutor::cubicCGathered(const uint8_t *in_ptr_, uint8_t *out_ptr_, const void *post_ops_data_, - int B, int C, int IH, int IW, int OH, int OW) { +void Interpolate::InterpolateJitExecutor::cubicCGathered(const uint8_t* in_ptr_, + 
uint8_t* out_ptr_, + const void* post_ops_data_, + int B, + int C, + int IH, + int IW, + int OH, + int OW) { const int idxNum = 1; - int *xOrigin = static_cast(&auxTable[0]); - float *xFactor = reinterpret_cast(&auxTable[OW]); - int *yOrigin = static_cast(&auxTable[(CUBIC_GRID_LEN + idxNum) * OW]); - float *yFactor = reinterpret_cast(&auxTable[(CUBIC_GRID_LEN + idxNum) * OW + OH]); + int* xOrigin = static_cast(&auxTable[0]); + float* xFactor = reinterpret_cast(&auxTable[OW]); + int* yOrigin = static_cast(&auxTable[(CUBIC_GRID_LEN + idxNum) * OW]); + float* yFactor = reinterpret_cast(&auxTable[(CUBIC_GRID_LEN + idxNum) * OW + OH]); int blkSize = mayiuse(cpu::x64::avx512_core) ? 16 : 8; int CB = div_up(C, blkSize); @@ -2754,8 +2924,8 @@ void Interpolate::InterpolateJitExecutor::cubicCGathered(const uint8_t *in_ptr_, int workAmount = configured_for_layout == InterpolateLayoutType::by_channel ? C : CB; parallel_for3d(B, OH, OW, [&](size_t b, size_t h, size_t w) { - uint8_t *out_ptr_nhw = out_ptr_ + (OH * OW * CSize * b + OW * CGatherLen * h + CGatherLen * w) * dstDataSize; - const uint8_t *in_ptr_n = in_ptr_ + (IH * IW * CSize * b) * srcDataSize; + uint8_t* out_ptr_nhw = out_ptr_ + (OH * OW * CSize * b + OW * CGatherLen * h + CGatherLen * w) * dstDataSize; + const uint8_t* in_ptr_n = in_ptr_ + (IH * IW * CSize * b) * srcDataSize; std::vector kernelIndex(CUBIC_GRID_LEN * CUBIC_GRID_LEN); // 16 address offset to src(batch) or src(CB) int iy = yOrigin[h]; @@ -2770,41 +2940,48 @@ void Interpolate::InterpolateJitExecutor::cubicCGathered(const uint8_t *in_ptr_, } } auto arg = jit_interpolate_call_args(); - arg.dst = out_ptr_nhw; - arg.src_ptr[0] = in_ptr_n; - arg.index = static_cast(&kernelIndex[0]); - // 0 for weight_W, 1 for weight_H - arg.weight_ptr[0] = static_cast(&xFactor[w * CUBIC_GRID_LEN]); - arg.weight_ptr[1] = static_cast(&yFactor[h * CUBIC_GRID_LEN]); - - // for by channel, src + step, dst + step, process next step on continuous memory - // for blk, src + IW*IH*blkSize, dst + OW*OH*blkSize, process the blkSize on next CB - arg.work_amount = workAmount; - arg.oc_off = 0; - arg.post_op_data = post_ops_data_; - (*interpolateKernel)(&arg); + arg.dst = out_ptr_nhw; + arg.src_ptr[0] = in_ptr_n; + arg.index = static_cast(&kernelIndex[0]); + // 0 for weight_W, 1 for weight_H + arg.weight_ptr[0] = static_cast(&xFactor[w * CUBIC_GRID_LEN]); + arg.weight_ptr[1] = static_cast(&yFactor[h * CUBIC_GRID_LEN]); + + // for by channel, src + step, dst + step, process next step on continuous memory + // for blk, src + IW*IH*blkSize, dst + OW*OH*blkSize, process the blkSize on next CB + arg.work_amount = workAmount; + arg.oc_off = 0; + arg.post_op_data = post_ops_data_; + (*interpolateKernel)(&arg); }); } -void Interpolate::InterpolateJitExecutor::cubicPlanar(const uint8_t *in_ptr_, uint8_t *out_ptr_, const void *post_ops_data_, - int B, int C, int IH, int IW, int OH, int OW) { +void Interpolate::InterpolateJitExecutor::cubicPlanar(const uint8_t* in_ptr_, + uint8_t* out_ptr_, + const void* post_ops_data_, + int B, + int C, + int IH, + int IW, + int OH, + int OW) { int tblAdvance = 0; - int *xOrigin = static_cast(&auxTable[tblAdvance]); + int* xOrigin = static_cast(&auxTable[tblAdvance]); tblAdvance += OW; - float *xFactor = reinterpret_cast(&auxTable[tblAdvance]); + float* xFactor = reinterpret_cast(&auxTable[tblAdvance]); tblAdvance += CUBIC_GRID_LEN * OW; - int *yOrigin = static_cast(&auxTable[tblAdvance]); + int* yOrigin = static_cast(&auxTable[tblAdvance]); tblAdvance += OH; - float *yFactor = 
reinterpret_cast(&auxTable[tblAdvance]); + float* yFactor = reinterpret_cast(&auxTable[tblAdvance]); tblAdvance += CUBIC_GRID_LEN * OH; - int *sequenceOH = static_cast(&auxTable[tblAdvance]); + int* sequenceOH = static_cast(&auxTable[tblAdvance]); tblAdvance += OW * OH; - int *sequenceOW = static_cast(&auxTable[tblAdvance]); + int* sequenceOW = static_cast(&auxTable[tblAdvance]); parallel_for2d(B, C, [&](size_t n, size_t c) { - const uint8_t *in_ptr_nc = in_ptr_ + (IW * IH * C * n + IW * IH * c) * srcDataSize; - uint8_t *out_ptr_nc = out_ptr_ + (OW * OH * C * n + OW * OH * c) * dstDataSize; + const uint8_t* in_ptr_nc = in_ptr_ + (IW * IH * C * n + IW * IH * c) * srcDataSize; + uint8_t* out_ptr_nc = out_ptr_ + (OW * OH * C * n + OW * OH * c) * dstDataSize; auto arg = jit_interpolate_call_args(); arg.dst = out_ptr_nc; @@ -2822,8 +2999,15 @@ void Interpolate::InterpolateJitExecutor::cubicPlanar(const uint8_t *in_ptr_, ui }); } -void Interpolate::InterpolateJitExecutor::pillowCGathered(const uint8_t *in_ptr_, uint8_t *out_ptr_, const void *post_ops_data_, - int B, int C, int IH, int IW, int OH, int OW) { +void Interpolate::InterpolateJitExecutor::pillowCGathered(const uint8_t* in_ptr_, + uint8_t* out_ptr_, + const void* post_ops_data_, + int B, + int C, + int IH, + int IW, + int OH, + int OW) { // workBuffer needed when both pass are true bool xPass = IW != OW; bool yPass = IH != OH; @@ -2855,8 +3039,11 @@ void Interpolate::InterpolateJitExecutor::pillowCGathered(const uint8_t *in_ptr_ // ===================================================================================================================== // index layout: // d_0............d_OD-1, h_0..............h_OH-1, w_0................w_OW-1 -void Interpolate::InterpolateExecutorBase::buildTblNN(const VectorDims& srcDimPad5d, const VectorDims& dstDim5d, - const std::vector& dataScales, InterpolateLayoutType layout, InterpolateNearestMode nearestMode) { +void Interpolate::InterpolateExecutorBase::buildTblNN(const VectorDims& srcDimPad5d, + const VectorDims& dstDim5d, + const std::vector& dataScales, + InterpolateLayoutType layout, + InterpolateNearestMode nearestMode) { const int dimSize = dataRank; float fz = (dimSize == 5) ? 
dataScales[dimSize - 3] : 1.f; float fy = dataScales[dimSize - 2]; @@ -2888,80 +3075,91 @@ void Interpolate::InterpolateExecutorBase::buildTblNN(const VectorDims& srcDimPa // scale is float(outShape) / float(inShape) // strictly consistent with onnx calc manner(div scale, not multiply inverse), given this is done offline // the slight precison diff can produce obvious wrong value due to "nearest round" behavior for NN mode -float Interpolate::InterpolateExecutorBase::coordTransToInput(int outCoord, float scale, int inShape, int outShape) const { +float Interpolate::InterpolateExecutorBase::coordTransToInput(int outCoord, + float scale, + int inShape, + int outShape) const { if (scale == 1.0f || (inShape == outShape)) { return outCoord; } switch (coordTransMode) { - case InterpolateCoordTransMode::half_pixel: { + case InterpolateCoordTransMode::half_pixel: { + return (outCoord + 0.5f) / scale - 0.5f; + break; + } + case InterpolateCoordTransMode::pytorch_half_pixel: { + if (outShape > 1) return (outCoord + 0.5f) / scale - 0.5f; - break; - } - case InterpolateCoordTransMode::pytorch_half_pixel: { - if (outShape > 1) - return (outCoord + 0.5f) / scale - 0.5f; - else - return 0; - break; - } - case InterpolateCoordTransMode::asymmetric: { - return static_cast(outCoord) / scale; - break; - } - case InterpolateCoordTransMode::tf_half_pixel_for_nn: { - return (outCoord + 0.5f) / scale; - break; - } - case InterpolateCoordTransMode::align_corners: { - if (outShape > 1) - return outCoord * (static_cast(inShape - 1) / static_cast(outShape - 1)); - else - return 0; - break; - } - default: { - OPENVINO_THROW("errorPrefix", " does not support specified coordinate transformation mode"); - break; - } + else + return 0; + break; + } + case InterpolateCoordTransMode::asymmetric: { + return static_cast(outCoord) / scale; + break; + } + case InterpolateCoordTransMode::tf_half_pixel_for_nn: { + return (outCoord + 0.5f) / scale; + break; + } + case InterpolateCoordTransMode::align_corners: { + if (outShape > 1) + return outCoord * (static_cast(inShape - 1) / static_cast(outShape - 1)); + else + return 0; + break; + } + default: { + OPENVINO_THROW("errorPrefix", " does not support specified coordinate transformation mode"); + break; + } } } -int Interpolate::InterpolateExecutorBase::nearestRound(float originCoord, bool isDownsample, InterpolateNearestMode nearestMode) const { +int Interpolate::InterpolateExecutorBase::nearestRound(float originCoord, + bool isDownsample, + InterpolateNearestMode nearestMode) const { switch (nearestMode) { - case InterpolateNearestMode::round_prefer_floor: { - if (originCoord == (static_cast(originCoord) + 0.5f)) - return static_cast(std::floor(originCoord)); - else - return static_cast(std::round(originCoord)); - break; - } - case InterpolateNearestMode::round_prefer_ceil: { - return static_cast(std::round(originCoord)); - break; - } - case InterpolateNearestMode::floor: { + case InterpolateNearestMode::round_prefer_floor: { + if (originCoord == (static_cast(originCoord) + 0.5f)) return static_cast(std::floor(originCoord)); - break; - } - case InterpolateNearestMode::ceil: { + else + return static_cast(std::round(originCoord)); + break; + } + case InterpolateNearestMode::round_prefer_ceil: { + return static_cast(std::round(originCoord)); + break; + } + case InterpolateNearestMode::floor: { + return static_cast(std::floor(originCoord)); + break; + } + case InterpolateNearestMode::ceil: { + return static_cast(std::ceil(originCoord)); + break; + } + case 
InterpolateNearestMode::simple: { + if (isDownsample) return static_cast(std::ceil(originCoord)); - break; - } - case InterpolateNearestMode::simple: { - if (isDownsample) - return static_cast(std::ceil(originCoord)); - else - return static_cast(originCoord); - } - default: { - OPENVINO_THROW("errorPrefix", " does not support specified nearest round mode"); - break; - } + else + return static_cast(originCoord); + } + default: { + OPENVINO_THROW("errorPrefix", " does not support specified nearest round mode"); + break; + } } } -void Interpolate::InterpolateExecutorBase::linearOnnxCF(int outCoord, float scale, int inShape, int outShape, - int& index0, int& index1, float& weight0, float& weight1) { +void Interpolate::InterpolateExecutorBase::linearOnnxCF(int outCoord, + float scale, + int inShape, + int outShape, + int& index0, + int& index1, + float& weight0, + float& weight1) { float inCoord = coordTransToInput(outCoord, scale, inShape, outShape); inCoord = std::max(0.0f, std::min(inCoord, static_cast(inShape - 1))); index0 = std::min(static_cast(inCoord), inShape - 1); @@ -2975,8 +3173,10 @@ void Interpolate::InterpolateExecutorBase::linearOnnxCF(int outCoord, float scal } } -void Interpolate::InterpolateExecutorBase::buildTblLinearOnnx(const VectorDims& srcDimPad5d, const VectorDims& dstDim5d, - const std::vector& dataScales, InterpolateLayoutType layout) { +void Interpolate::InterpolateExecutorBase::buildTblLinearOnnx(const VectorDims& srcDimPad5d, + const VectorDims& dstDim5d, + const std::vector& dataScales, + InterpolateLayoutType layout) { int dimSize = dataRank; float fz = (spatialDimSize > 2) ? dataScales[dimSize - 3] : 1.f; float fy = (spatialDimSize > 1) ? dataScales[dimSize - 2] : 1.f; @@ -3035,7 +3235,7 @@ void Interpolate::InterpolateExecutorBase::buildTblLinearOnnx(const VectorDims& indexPtr[1][idxOzOyOx] = (izF * IH * IW + iyT * IW + ixR) * scale; weightPtr[0][idxOzOyOx] = weightL; weightPtr[1][idxOzOyOx] = weightR; - if (spatialDimSize > 1) { + if (spatialDimSize > 1) { indexPtr[2][idxOzOyOx] = (izF * IH * IW + iyB * IW + ixL) * scale; indexPtr[3][idxOzOyOx] = (izF * IH * IW + iyB * IW + ixR) * scale; weightPtr[2][idxOzOyOx] = weightT; @@ -3088,8 +3288,11 @@ void Interpolate::InterpolateExecutorBase::buildTblLinearOnnx(const VectorDims& // wd .........wd, wh............wh, ww.............ww, id...........id, ih............ih, iw..............iw // | | // wh0.....wh_diameter ih0.....ih_diameter -void Interpolate::InterpolateExecutorBase::buildTblLinear(const VectorDims& srcDimPad5d, const VectorDims& dstDim5d, - const std::vector& dataScales, int kernel_width, bool antialias) { +void Interpolate::InterpolateExecutorBase::buildTblLinear(const VectorDims& srcDimPad5d, + const VectorDims& dstDim5d, + const std::vector& dataScales, + int kernel_width, + bool antialias) { int dimSize = dataRank; float fz = (dimSize == 5) ? 
dataScales[dimSize - 3] : 1.f; float fy = dataScales[dimSize - 2]; @@ -3113,15 +3316,15 @@ void Interpolate::InterpolateExecutorBase::buildTblLinear(const VectorDims& srcD int sizeOH = OH * diaOH; int sizeOW = OW * diaOW; auxTable.resize((sizeOD + sizeOH + sizeOW) * 2); - float *weightTable = reinterpret_cast(&auxTable[0]); - float *weightOD = static_cast(&weightTable[0]); - float *weightOH = static_cast(&weightTable[sizeOD]); - float *weightOW = static_cast(&weightTable[sizeOD + sizeOH]); + float* weightTable = reinterpret_cast(&auxTable[0]); + float* weightOD = static_cast(&weightTable[0]); + float* weightOH = static_cast(&weightTable[sizeOD]); + float* weightOW = static_cast(&weightTable[sizeOD + sizeOH]); - int *idxTable = static_cast(&auxTable[sizeOD + sizeOH + sizeOW]); - int *idxOD = static_cast(&idxTable[0]); - int *idxOH = static_cast(&idxTable[sizeOD]); - int *idxOW = static_cast(&idxTable[sizeOD + sizeOH]); + int* idxTable = static_cast(&auxTable[sizeOD + sizeOH + sizeOW]); + int* idxOD = static_cast(&idxTable[0]); + int* idxOH = static_cast(&idxTable[sizeOD]); + int* idxOW = static_cast(&idxTable[sizeOD + sizeOH]); for (size_t oz = 0; oz < OD; oz++) { float iz = coordTransToInput(oz, fz, ID, OD); @@ -3179,8 +3382,11 @@ std::vector Interpolate::InterpolateExecutorBase::getCubicCoeffs(float ma // table layout: // OW OW OW OW OW OH OH OH OH OH // x_idx x_weight0 x_weight1 x_weight2 x_weight3 y_idx y_weight0 y_weight1 y_weight2 y_weight3 -void Interpolate::InterpolateExecutorBase::buildTblCubic(const VectorDims& srcDimPad5d, const VectorDims& dstDim5d, const std::vector& dataScales, - float cubicCoeff, InterpolateLayoutType layout) { +void Interpolate::InterpolateExecutorBase::buildTblCubic(const VectorDims& srcDimPad5d, + const VectorDims& dstDim5d, + const std::vector& dataScales, + float cubicCoeff, + InterpolateLayoutType layout) { int dimSize = dataRank; float fy = dataScales[dimSize - 2]; float fx = dataScales[dimSize - 1]; @@ -3198,9 +3404,9 @@ void Interpolate::InterpolateExecutorBase::buildTblCubic(const VectorDims& srcDi } int tblAdvance = 0; - int *xOrigin = static_cast(&auxTable[tblAdvance]); + int* xOrigin = static_cast(&auxTable[tblAdvance]); tblAdvance += OW; - float *xFactor = reinterpret_cast(&auxTable[tblAdvance]); + float* xFactor = reinterpret_cast(&auxTable[tblAdvance]); for (int ox = 0; ox < OW; ox++) { float ix = coordTransToInput(ox, fx, IW, OW); int ix_r = static_cast(std::floor(ix)); @@ -3214,9 +3420,9 @@ void Interpolate::InterpolateExecutorBase::buildTblCubic(const VectorDims& srcDi } tblAdvance += CUBIC_GRID_LEN * OW; - int *yOrigin = static_cast(&auxTable[tblAdvance]); + int* yOrigin = static_cast(&auxTable[tblAdvance]); tblAdvance += OH; - float *yFactor = reinterpret_cast(&auxTable[tblAdvance]); + float* yFactor = reinterpret_cast(&auxTable[tblAdvance]); for (int oy = 0; oy < OH; oy++) { float iy = coordTransToInput(oy, fy, IH, OH); int iy_r = static_cast(std::floor(iy)); @@ -3231,9 +3437,9 @@ void Interpolate::InterpolateExecutorBase::buildTblCubic(const VectorDims& srcDi if (layout == InterpolateLayoutType::planar) { tblAdvance += CUBIC_GRID_LEN * OH; - int *sequenceOH = static_cast(&auxTable[tblAdvance]); + int* sequenceOH = static_cast(&auxTable[tblAdvance]); tblAdvance += OH * OW; - int *sequenceOW = static_cast(&auxTable[tblAdvance]); + int* sequenceOW = static_cast(&auxTable[tblAdvance]); for (int h = 0; h < OH; ++h) { int offset = h * OW; for (int w = 0; w < OW; ++w) { @@ -3263,8 +3469,11 @@ float 
Interpolate::InterpolateExecutorBase::getPillowBicubicCoeffs(float m) { return 0.0f; } -void Interpolate::InterpolateExecutorBase::buildTblPillow(const VectorDims& srcDimPad5d, const VectorDims& dstDim5d, const std::vector& dataScales, - float cubicCoeff, InterpolateLayoutType layout) { +void Interpolate::InterpolateExecutorBase::buildTblPillow(const VectorDims& srcDimPad5d, + const VectorDims& dstDim5d, + const std::vector& dataScales, + float cubicCoeff, + InterpolateLayoutType layout) { int dimSize = dataRank; float fy = dataScales[dimSize - 2]; float fx = dataScales[dimSize - 1]; @@ -3279,15 +3488,15 @@ void Interpolate::InterpolateExecutorBase::buildTblPillow(const VectorDims& srcD }; // pillowScale: e.g. 2.0 means down sample 2 times - auto generateArgs = [&] (float pillowScale) -> filterArgs { + auto generateArgs = [&](float pillowScale) -> filterArgs { filterArgs args; float scaleClip = pillowScale < 1.0f ? 1.0f : pillowScale; args.ScaleClipReciprocal = 1.0f / scaleClip; - args.filterRadius = (mode == InterpolateMode::bilinear_pillow) ? PILLOW_BILINEAR_WINDOW_SCALE * scaleClip : - PILLOW_BICUBIC_WINDOW_SCALE * scaleClip; + args.filterRadius = (mode == InterpolateMode::bilinear_pillow) ? PILLOW_BILINEAR_WINDOW_SCALE * scaleClip + : PILLOW_BICUBIC_WINDOW_SCALE * scaleClip; args.filterLen = static_cast(std::ceil(args.filterRadius) * 2 + 1); - args.weightGen = (mode == InterpolateMode::bilinear_pillow) ? this->getPillowBilinearCoeffs: - this->getPillowBicubicCoeffs; + args.weightGen = + (mode == InterpolateMode::bilinear_pillow) ? this->getPillowBilinearCoeffs : this->getPillowBicubicCoeffs; return args; }; @@ -3302,15 +3511,15 @@ void Interpolate::InterpolateExecutorBase::buildTblPillow(const VectorDims& srcD auxTable[offset] = filterArgsX.filterLen; auxTable[offset + 1] = filterArgsY.filterLen; offset += 2; - float *weightX = reinterpret_cast(&auxTable[offset]); + float* weightX = reinterpret_cast(&auxTable[offset]); offset += filterArgsX.filterLen * OW; - float *weightY = reinterpret_cast(&auxTable[offset]); + float* weightY = reinterpret_cast(&auxTable[offset]); offset += filterArgsY.filterLen * OH; - int *indexX = static_cast(&auxTable[offset]); + int* indexX = static_cast(&auxTable[offset]); offset += 2 * OW; - int *indexY = static_cast(&auxTable[offset]); + int* indexY = static_cast(&auxTable[offset]); - auto generateTbl = [&] (int inLen, int outLen, float fScale, filterArgs args, float* weightTbl, int* idxTbl) { + auto generateTbl = [&](int inLen, int outLen, float fScale, filterArgs args, float* weightTbl, int* idxTbl) { int min = 0; int max = 0; for (int ox = 0; ox < outLen; ox++) { @@ -3354,21 +3563,29 @@ void Interpolate::InterpolateExecutorBase::buildTblPillow(const VectorDims& srcD generateTbl(IH, OH, fy, filterArgsY, weightY, indexY); } -void Interpolate::InterpolateRefExecutor::NNRef(const uint8_t *in_ptr_, uint8_t *out_ptr_, int B, int C, int ID, int IH, int IW, - int OD, int OH, int OW) { - int *index_d = static_cast(&auxTable[0]); - int *index_h = static_cast(&auxTable[OD]); - int *index_w = static_cast(&auxTable[OD + OH]); - - const float *in_ptr_f32 = reinterpret_cast(in_ptr_); - float *out_ptr_f32 = reinterpret_cast(out_ptr_); +void Interpolate::InterpolateRefExecutor::NNRef(const uint8_t* in_ptr_, + uint8_t* out_ptr_, + int B, + int C, + int ID, + int IH, + int IW, + int OD, + int OH, + int OW) { + int* index_d = static_cast(&auxTable[0]); + int* index_h = static_cast(&auxTable[OD]); + int* index_w = static_cast(&auxTable[OD + OH]); + + const float* in_ptr_f32 = 
reinterpret_cast(in_ptr_); + float* out_ptr_f32 = reinterpret_cast(out_ptr_); parallel_for3d(B, C, OD, [&](size_t b, size_t c, size_t od) { - const float *in_ptr = in_ptr_f32 + (IW * IH * ID * C * b + IW * IH * ID * c + IW * IH * index_d[od]); - float *out_ptr = out_ptr_f32 + (OW * OH * OD * C * b + OW * OH * OD * c + OW * OH * od); + const float* in_ptr = in_ptr_f32 + (IW * IH * ID * C * b + IW * IH * ID * c + IW * IH * index_d[od]); + float* out_ptr = out_ptr_f32 + (OW * OH * OD * C * b + OW * OH * OD * c + OW * OH * od); for (int oh = 0; oh < OH; oh++) { - const float *in_ptr_h = in_ptr + (IW * index_h[oh]); - float *out_ptr_h = out_ptr + (OW * oh); + const float* in_ptr_h = in_ptr + (IW * index_h[oh]); + float* out_ptr_h = out_ptr + (OW * oh); for (int ow = 0; ow < OW; ow++) { out_ptr_h[ow] = in_ptr_h[index_w[ow]]; } @@ -3376,8 +3593,16 @@ void Interpolate::InterpolateRefExecutor::NNRef(const uint8_t *in_ptr_, uint8_t }); } -void Interpolate::InterpolateRefExecutor::linearOnnxRef(const uint8_t *in_ptr_, uint8_t *out_ptr_, int B, int C, int ID, int IH, int IW, - int OD, int OH, int OW) { +void Interpolate::InterpolateRefExecutor::linearOnnxRef(const uint8_t* in_ptr_, + uint8_t* out_ptr_, + int B, + int C, + int ID, + int IH, + int IW, + int OD, + int OH, + int OW) { std::vector indexPtr(MAX_INPUT_INTERPOLATE, 0); std::vector weightPtr(MAX_INPUT_INTERPOLATE, 0); // FrontTopLeft:0, FrontTopRight:1, FrontBottomLeft:2, FrontBottomRight:3, @@ -3406,87 +3631,87 @@ void Interpolate::InterpolateRefExecutor::linearOnnxRef(const uint8_t *in_ptr_, weightPtr[5] = reinterpret_cast(&auxTable[scratchLen + 5 * OW * OH * OD]); } - const float *in_ptr_f32 = reinterpret_cast(in_ptr_); - float *out_ptr_f32 = reinterpret_cast(out_ptr_); + const float* in_ptr_f32 = reinterpret_cast(in_ptr_); + float* out_ptr_f32 = reinterpret_cast(out_ptr_); parallel_for2d(B, C, [&](size_t b, size_t c) { - float *out_ptr_nc = out_ptr_f32 + (OD * OH * OW * C * b + OD * OH * OW * c); - const float *in_ptr_nc = in_ptr_f32 + (ID * IH * IW * C * b + ID * IH * IW * c); + float* out_ptr_nc = out_ptr_f32 + (OD * OH * OW * C * b + OD * OH * OW * c); + const float* in_ptr_nc = in_ptr_f32 + (ID * IH * IW * C * b + ID * IH * IW * c); // do not combined 1d/2d to 3d unified process to get rid of invalid computing. 
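As a minimal sketch of the 2-D branch in the switch below: each output element is a weighted mix of its four source neighbours, with separate left/right and top/bottom weights that each sum to 1. The numeric values here are illustrative only:

#include <cstdio>

int main() {
    // Four neighbouring source values around one output point (illustrative).
    const float src00 = 1.0f, src01 = 2.0f, src10 = 3.0f, src11 = 4.0f;
    // Horizontal (left/right) and vertical (top/bottom) interpolation weights.
    const float wL = 0.25f, wR = 0.75f;
    const float wT = 0.60f, wB = 0.40f;

    // Same combination as the spatialDimSize == 2 case: weight the top row,
    // weight the bottom row, then sum the two contributions.
    const float dst = src00 * wT * wL + src01 * wT * wR +
                      src10 * wB * wL + src11 * wB * wR;
    std::printf("%.2f\n", dst);  // 0.6 * 1.75 + 0.4 * 3.75 = 2.55
    return 0;
}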
switch (spatialDimSize) { - case 1: - for (int i = 0; i < OW; i++) { - float src0 = in_ptr_nc[indexPtr[0][i]]; - float src1 = in_ptr_nc[indexPtr[1][i]]; + case 1: + for (int i = 0; i < OW; i++) { + float src0 = in_ptr_nc[indexPtr[0][i]]; + float src1 = in_ptr_nc[indexPtr[1][i]]; - out_ptr_nc[i] = src0 * weightPtr[0][i] + - src1 * weightPtr[1][i]; - } - break; - case 2: - for (int i = 0; i < OH * OW; i++) { - float src00 = in_ptr_nc[indexPtr[0][i]]; - float src01 = in_ptr_nc[indexPtr[1][i]]; - float src10 = in_ptr_nc[indexPtr[2][i]]; - float src11 = in_ptr_nc[indexPtr[3][i]]; - - out_ptr_nc[i] = src00 * weightPtr[2][i] * weightPtr[0][i] + - src01 * weightPtr[2][i] * weightPtr[1][i] + - src10 * weightPtr[3][i] * weightPtr[0][i] + - src11 * weightPtr[3][i] * weightPtr[1][i]; - } - break; - case 3: - for (int i = 0; i < OD * OH * OW; i++) { - float src000 = in_ptr_nc[indexPtr[0][i]]; - float src001 = in_ptr_nc[indexPtr[1][i]]; - float src010 = in_ptr_nc[indexPtr[2][i]]; - float src011 = in_ptr_nc[indexPtr[3][i]]; - float src100 = in_ptr_nc[indexPtr[4][i]]; - float src101 = in_ptr_nc[indexPtr[5][i]]; - float src110 = in_ptr_nc[indexPtr[6][i]]; - float src111 = in_ptr_nc[indexPtr[7][i]]; - - // float dstValue = - // weightPtr[4][i] * weightPtr[2][i] * weightPtr[0][i] * src000 + - // weightPtr[4][i] * weightPtr[2][i] * weightPtr[1][i] * src001 + - // weightPtr[4][i] * weightPtr[3][i] * weightPtr[0][i] * src010 + - // weightPtr[4][i] * weightPtr[3][i] * weightPtr[1][i] * src011 + - // weightPtr[5][i] * weightPtr[2][i] * weightPtr[0][i] * src100 + - // weightPtr[5][i] * weightPtr[2][i] * weightPtr[1][i] * src101 + - // weightPtr[5][i] * weightPtr[3][i] * weightPtr[0][i] * src110 + - // weightPtr[5][i] * weightPtr[3][i] * weightPtr[1][i] * src111; - - out_ptr_nc[i] = - weightPtr[4][i] * (weightPtr[2][i] * (weightPtr[0][i] * src000 + - weightPtr[1][i] * src001) + - weightPtr[3][i] * (weightPtr[0][i] * src010 + - weightPtr[1][i] * src011)) + - weightPtr[5][i] * (weightPtr[2][i] * (weightPtr[0][i] * src100 + - weightPtr[1][i] * src101) + - weightPtr[3][i] * (weightPtr[0][i] * src110 + - weightPtr[1][i] * src111)); - } - break; - default: - break; + out_ptr_nc[i] = src0 * weightPtr[0][i] + src1 * weightPtr[1][i]; + } + break; + case 2: + for (int i = 0; i < OH * OW; i++) { + float src00 = in_ptr_nc[indexPtr[0][i]]; + float src01 = in_ptr_nc[indexPtr[1][i]]; + float src10 = in_ptr_nc[indexPtr[2][i]]; + float src11 = in_ptr_nc[indexPtr[3][i]]; + + out_ptr_nc[i] = src00 * weightPtr[2][i] * weightPtr[0][i] + src01 * weightPtr[2][i] * weightPtr[1][i] + + src10 * weightPtr[3][i] * weightPtr[0][i] + src11 * weightPtr[3][i] * weightPtr[1][i]; + } + break; + case 3: + for (int i = 0; i < OD * OH * OW; i++) { + float src000 = in_ptr_nc[indexPtr[0][i]]; + float src001 = in_ptr_nc[indexPtr[1][i]]; + float src010 = in_ptr_nc[indexPtr[2][i]]; + float src011 = in_ptr_nc[indexPtr[3][i]]; + float src100 = in_ptr_nc[indexPtr[4][i]]; + float src101 = in_ptr_nc[indexPtr[5][i]]; + float src110 = in_ptr_nc[indexPtr[6][i]]; + float src111 = in_ptr_nc[indexPtr[7][i]]; + + // float dstValue = + // weightPtr[4][i] * weightPtr[2][i] * weightPtr[0][i] * src000 + + // weightPtr[4][i] * weightPtr[2][i] * weightPtr[1][i] * src001 + + // weightPtr[4][i] * weightPtr[3][i] * weightPtr[0][i] * src010 + + // weightPtr[4][i] * weightPtr[3][i] * weightPtr[1][i] * src011 + + // weightPtr[5][i] * weightPtr[2][i] * weightPtr[0][i] * src100 + + // weightPtr[5][i] * weightPtr[2][i] * weightPtr[1][i] * src101 + + // weightPtr[5][i] * 
weightPtr[3][i] * weightPtr[0][i] * src110 + + // weightPtr[5][i] * weightPtr[3][i] * weightPtr[1][i] * src111; + + out_ptr_nc[i] = + weightPtr[4][i] * (weightPtr[2][i] * (weightPtr[0][i] * src000 + weightPtr[1][i] * src001) + + weightPtr[3][i] * (weightPtr[0][i] * src010 + weightPtr[1][i] * src011)) + + weightPtr[5][i] * (weightPtr[2][i] * (weightPtr[0][i] * src100 + weightPtr[1][i] * src101) + + weightPtr[3][i] * (weightPtr[0][i] * src110 + weightPtr[1][i] * src111)); + } + break; + default: + break; } }); } -void Interpolate::InterpolateRefExecutor::cubicRef(const uint8_t *in_ptr_, uint8_t *out_ptr_, int B, int C, int IH, int IW, int OH, int OW) { +void Interpolate::InterpolateRefExecutor::cubicRef(const uint8_t* in_ptr_, + uint8_t* out_ptr_, + int B, + int C, + int IH, + int IW, + int OH, + int OW) { const int idxNum = 1; - int *xOrigin = static_cast(&auxTable[0]); - float *xFactor = reinterpret_cast(&auxTable[OW]); - int *yOrigin = static_cast(&auxTable[(CUBIC_GRID_LEN + idxNum) * OW]); - float *yFactor = reinterpret_cast(&auxTable[(CUBIC_GRID_LEN + idxNum) * OW + OH]); + int* xOrigin = static_cast(&auxTable[0]); + float* xFactor = reinterpret_cast(&auxTable[OW]); + int* yOrigin = static_cast(&auxTable[(CUBIC_GRID_LEN + idxNum) * OW]); + float* yFactor = reinterpret_cast(&auxTable[(CUBIC_GRID_LEN + idxNum) * OW + OH]); - const float *in_ptr_f32 = reinterpret_cast(in_ptr_); - float *out_ptr_f32 = reinterpret_cast(out_ptr_); + const float* in_ptr_f32 = reinterpret_cast(in_ptr_); + float* out_ptr_f32 = reinterpret_cast(out_ptr_); parallel_for4d(B, C, OH, OW, [&](size_t n, size_t c, size_t oy, size_t ox) { - const float *in_ptr_nc = in_ptr_f32 + (IW * IH * C * n + IW * IH * c); - float *out_ptr_nc = out_ptr_f32 + (OW * OH * C * n + OW * OH * c); + const float* in_ptr_nc = in_ptr_f32 + (IW * IH * C * n + IW * IH * c); + float* out_ptr_nc = out_ptr_f32 + (OW * OH * C * n + OW * OH * c); int iy = yOrigin[oy]; int ix = xOrigin[ox]; @@ -3494,7 +3719,7 @@ void Interpolate::InterpolateRefExecutor::cubicRef(const uint8_t *in_ptr_, uint8 float retY = 0.f; for (int y = iy - 1, i = 0; y <= iy + 2; y++, i++) { int yInRange = std::max(0, std::min(y, IH - 1)); - const float *in_ptr_nch = in_ptr_nc + IW * yInRange; + const float* in_ptr_nch = in_ptr_nc + IW * yInRange; float retX = 0.f; for (int x = ix - 1, j = 0; x <= ix + 2; x++, j++) { int xInRange = std::max(0, std::min(x, IW - 1)); @@ -3506,66 +3731,79 @@ void Interpolate::InterpolateRefExecutor::cubicRef(const uint8_t *in_ptr_, uint8 }); } -float Interpolate::InterpolateRefExecutor::getValue(const uint8_t *base, size_t offset, ov::element::Type prec) { - const uint8_t *baseOffset = base + offset; +float Interpolate::InterpolateRefExecutor::getValue(const uint8_t* base, size_t offset, ov::element::Type prec) { + const uint8_t* baseOffset = base + offset; switch (prec) { - case ov::element::u8: { - return static_cast(*baseOffset); - break; - } - case ov::element::i8: { - const int8_t *valuePtr = reinterpret_cast(baseOffset); - return static_cast(*valuePtr); - break; - } - case ov::element::bf16: { - const uint16_t *valuePtr = reinterpret_cast(baseOffset); - return bfloat16_t::from_bits(*valuePtr); - break; - } - case ov::element::f32: { - const float *valuePtr = reinterpret_cast(baseOffset); - return *valuePtr; - break; - } - default: { - OPENVINO_THROW("Interpolate layer does not support precision: ", prec); - break; - } + case ov::element::u8: { + return static_cast(*baseOffset); + break; + } + case ov::element::i8: { + const int8_t* valuePtr = 
reinterpret_cast(baseOffset); + return static_cast(*valuePtr); + break; + } + case ov::element::bf16: { + const uint16_t* valuePtr = reinterpret_cast(baseOffset); + return bfloat16_t::from_bits(*valuePtr); + break; + } + case ov::element::f32: { + const float* valuePtr = reinterpret_cast(baseOffset); + return *valuePtr; + break; + } + default: { + OPENVINO_THROW("Interpolate layer does not support precision: ", prec); + break; + } } } -void Interpolate::InterpolateRefExecutor::setValue(uint8_t *base, size_t offset, float value, ov::element::Type prec) { - uint8_t *baseOffset = base + offset; +void Interpolate::InterpolateRefExecutor::setValue(uint8_t* base, size_t offset, float value, ov::element::Type prec) { + uint8_t* baseOffset = base + offset; switch (prec) { - case ov::element::u8: { - uint8_t data = static_cast(value < 0 ? 0 : value); - cpu_memcpy(baseOffset, &data, 1); - break; - } - case ov::element::i8: { - int8_t data = static_cast(value); - cpu_memcpy(baseOffset, &data, 1); - break; - } - case ov::element::bf16: { - uint16_t data = bfloat16_t(value).to_bits(); - cpu_memcpy(baseOffset, &data, 2); - break; - } - case ov::element::f32: { - cpu_memcpy(baseOffset, &value, sizeof(float)); - break; - } - default: { - OPENVINO_THROW("Interpolate layer does not support precision: ", prec); - break; - } + case ov::element::u8: { + uint8_t data = static_cast(value < 0 ? 0 : value); + cpu_memcpy(baseOffset, &data, 1); + break; + } + case ov::element::i8: { + int8_t data = static_cast(value); + cpu_memcpy(baseOffset, &data, 1); + break; + } + case ov::element::bf16: { + uint16_t data = bfloat16_t(value).to_bits(); + cpu_memcpy(baseOffset, &data, 2); + break; + } + case ov::element::f32: { + cpu_memcpy(baseOffset, &value, sizeof(float)); + break; + } + default: { + OPENVINO_THROW("Interpolate layer does not support precision: ", prec); + break; + } } } -void Interpolate::InterpolateRefExecutor::linearInterpolation(const uint8_t *in_ptr_, uint8_t *out_ptr_, int B, int C, int ID, int IH, int IW, - float fx, float fy, float fz, int OD, int OH, int OW, int kernel_width, bool antialias) { +void Interpolate::InterpolateRefExecutor::linearInterpolation(const uint8_t* in_ptr_, + uint8_t* out_ptr_, + int B, + int C, + int ID, + int IH, + int IW, + float fx, + float fy, + float fz, + int OD, + int OH, + int OW, + int kernel_width, + bool antialias) { if (IW == OW && IH == OH && ID == OD) { size_t spatialDimSize = IW * IH * ID; // TODO: enable when fusing into interp with linear mode will support @@ -3574,8 +3812,8 @@ void Interpolate::InterpolateRefExecutor::linearInterpolation(const uint8_t *in_ cpu_memcpy(out_ptr_, in_ptr_, size); } else { parallel_for2d(B, C, [&](size_t b, size_t c) { - const uint8_t *in_ptr_nc = in_ptr_ + (spatialDimSize * C * b + spatialDimSize * c) * srcDataSize; - uint8_t *out_ptr_nc = out_ptr_ + (spatialDimSize * C * b + spatialDimSize * c) * dstDataSize; + const uint8_t* in_ptr_nc = in_ptr_ + (spatialDimSize * C * b + spatialDimSize * c) * srcDataSize; + uint8_t* out_ptr_nc = out_ptr_ + (spatialDimSize * C * b + spatialDimSize * c) * dstDataSize; for (size_t i = 0; i < spatialDimSize; i++) { float dstValue = getValue(in_ptr_nc, i * srcDataSize, inputPrec); setValue(out_ptr_nc, i * dstDataSize, dstValue, outputPrec); @@ -3600,23 +3838,23 @@ void Interpolate::InterpolateRefExecutor::linearInterpolation(const uint8_t *in_ int sizeOH = OH * diaOH; int sizeOW = OW * diaOW; - float *weightTable = reinterpret_cast(&auxTable[0]); - float *weightOD = static_cast(&weightTable[0]); - 
float *weightOH = static_cast(&weightTable[sizeOD]); - float *weightOW = static_cast(&weightTable[sizeOD + sizeOH]); + float* weightTable = reinterpret_cast(&auxTable[0]); + float* weightOD = static_cast(&weightTable[0]); + float* weightOH = static_cast(&weightTable[sizeOD]); + float* weightOW = static_cast(&weightTable[sizeOD + sizeOH]); - int *idxTable = static_cast(&auxTable[sizeOD + sizeOH + sizeOW]); - int *idxOD = static_cast(&idxTable[0]); - int *idxOH = static_cast(&idxTable[sizeOD]); - int *idxOW = static_cast(&idxTable[sizeOD + sizeOH]); + int* idxTable = static_cast(&auxTable[sizeOD + sizeOH + sizeOW]); + int* idxOD = static_cast(&idxTable[0]); + int* idxOH = static_cast(&idxTable[sizeOD]); + int* idxOW = static_cast(&idxTable[sizeOD + sizeOH]); parallel_for2d(B, C, [&](size_t b, size_t c) { - const uint8_t *in_ptr_nc = in_ptr_ + (IW * IH * ID * C * b + IW * IH * ID * c) * srcDataSize; - uint8_t *out_ptr_nc = out_ptr_ + (OW * OH * OD * C * b + OW * OH * OD * c) * dstDataSize; + const uint8_t* in_ptr_nc = in_ptr_ + (IW * IH * ID * C * b + IW * IH * ID * c) * srcDataSize; + uint8_t* out_ptr_nc = out_ptr_ + (OW * OH * OD * C * b + OW * OH * OD * c) * dstDataSize; for (int oz = 0; oz < OD; oz++) { - uint8_t *out_ptr_ncd = out_ptr_nc + (OW * OH * oz) * dstDataSize; + uint8_t* out_ptr_ncd = out_ptr_nc + (OW * OH * oz) * dstDataSize; for (int oy = 0; oy < OH; oy++) { - uint8_t *out_ptr_ncdh = out_ptr_ncd + (OW * oy) * dstDataSize; + uint8_t* out_ptr_ncdh = out_ptr_ncd + (OW * oy) * dstDataSize; for (int ox = 0; ox < OW; ox++) { float sum = 0.f; float wsum = 0.f; @@ -3659,9 +3897,13 @@ void Interpolate::InterpolateRefExecutor::linearInterpolation(const uint8_t *in_ if (weightOW[ox * diaOW + ix] == 0.f) { continue; } - float w = weightOD[oz * diaOD + iz] * weightOH[oy * diaOH + iy] * weightOW[ox * diaOW + ix]; + float w = + weightOD[oz * diaOD + iz] * weightOH[oy * diaOH + iy] * weightOW[ox * diaOW + ix]; float value = getValue(in_ptr_nc, - (idxOD[oz * diaOD + iz] * IH * IW + idxOH[oy * diaOH + iy] * IW + idxOW[ox * diaOW + ix]) * srcDataSize, inputPrec); + (idxOD[oz * diaOD + iz] * IH * IW + idxOH[oy * diaOH + iy] * IW + + idxOW[ox * diaOW + ix]) * + srcDataSize, + inputPrec); sum += w * value; wsum += w; @@ -3681,18 +3923,25 @@ void Interpolate::InterpolateRefExecutor::linearInterpolation(const uint8_t *in_ }); } -void Interpolate::InterpolateRefExecutor::pillowRef(const uint8_t *in_ptr_, uint8_t *out_ptr_, int B, int C, int IH, int IW, int OH, int OW) { +void Interpolate::InterpolateRefExecutor::pillowRef(const uint8_t* in_ptr_, + uint8_t* out_ptr_, + int B, + int C, + int IH, + int IW, + int OH, + int OW) { size_t offset = 0; int filterLenX = auxTable[offset]; int filterLenY = auxTable[offset + 1]; offset += 2; - float *weightX = reinterpret_cast(&auxTable[offset]); + float* weightX = reinterpret_cast(&auxTable[offset]); offset += filterLenX * OW; - float *weightY = reinterpret_cast(&auxTable[offset]); + float* weightY = reinterpret_cast(&auxTable[offset]); offset += filterLenY * OH; - int *indexX = static_cast(&auxTable[offset]); + int* indexX = static_cast(&auxTable[offset]); offset += 2 * OW; - int *indexY = static_cast(&auxTable[offset]); + int* indexY = static_cast(&auxTable[offset]); // workBuffer needed when both pass is true bool xPass = IW != OW; @@ -3710,21 +3959,24 @@ void Interpolate::InterpolateRefExecutor::pillowRef(const uint8_t *in_ptr_, uint // | | // ---- auto bc_loop = [&](size_t b, size_t c) { - const uint8_t *in_ptr_nc = in_ptr_ + (IW * IH * C * b + IW * IH * c) 
* srcDataSize; - uint8_t *out_ptr_nc = out_ptr_ + (OW * OH * C * b + OW * OH * c) * dstDataSize; - uint8_t *xpass_out_ptr_nc = nullptr; - const uint8_t *ypass_in_ptr_nc = nullptr; + const uint8_t* in_ptr_nc = in_ptr_ + (IW * IH * C * b + IW * IH * c) * srcDataSize; + uint8_t* out_ptr_nc = out_ptr_ + (OW * OH * C * b + OW * OH * c) * dstDataSize; + uint8_t* xpass_out_ptr_nc = nullptr; + const uint8_t* ypass_in_ptr_nc = nullptr; if (xPass && yPass) { size_t parallel_num = B * C; // IH * OW buf needed if (parallel_num < m_threads_num) { - xpass_out_ptr_nc = static_cast(&pillow_working_buf[(OW * IH * C * b + OW * IH * c) * srcDataSize]); - ypass_in_ptr_nc = static_cast(&pillow_working_buf[(OW * IH * C * b + OW * IH * c) * srcDataSize]); + xpass_out_ptr_nc = + static_cast(&pillow_working_buf[(OW * IH * C * b + OW * IH * c) * srcDataSize]); + ypass_in_ptr_nc = + static_cast(&pillow_working_buf[(OW * IH * C * b + OW * IH * c) * srcDataSize]); } else { size_t threadsIdx = parallel_get_thread_num(); size_t buffer_size = static_cast(OW * IH); xpass_out_ptr_nc = static_cast(&pillow_working_buf[threadsIdx * buffer_size * srcDataSize]); - ypass_in_ptr_nc = static_cast(&pillow_working_buf[threadsIdx * buffer_size * srcDataSize]); + ypass_in_ptr_nc = + static_cast(&pillow_working_buf[threadsIdx * buffer_size * srcDataSize]); } } else if (xPass && !yPass) { xpass_out_ptr_nc = out_ptr_nc; @@ -3782,14 +4034,14 @@ void Interpolate::InterpolateRefExecutor::pillowRef(const uint8_t *in_ptr_, uint void Interpolate::InterpolateExecutorBase::create_pillow_working_buf(InterpolateLayoutType layout) { if (srcDimPad5d[3] == dstDim5d[3] || srcDimPad5d[4] == dstDim5d[4]) return; - size_t bufSize = srcDimPad5d[3] * dstDim5d[4] * srcDataSize; // IH * OW + size_t bufSize = srcDimPad5d[3] * dstDim5d[4] * srcDataSize; // IH * OW m_threads_num = parallel_get_max_threads(); if (layout == InterpolateLayoutType::planar) { // B and C execute in parallel, need separate buf size_t parallel_num = srcDimPad5d[0] * srcDimPad5d[1]; bufSize *= std::min(m_threads_num, parallel_num); } else { - bufSize *= srcDimPad5d[1]; // *C + bufSize *= srcDimPad5d[1]; // *C // B execute in parallel, need separate buf size_t parallel_num = srcDimPad5d[0]; bufSize *= std::min(m_threads_num, parallel_num); @@ -3798,11 +4050,14 @@ void Interpolate::InterpolateExecutorBase::create_pillow_working_buf(Interpolate } Interpolate::InterpolateExecutorBase::InterpolateExecutorBase(const InterpolateAttrs& interpAttrs, - const VectorDims &srcDims, - const VectorDims &dstDims, - const std::vector &dataScales) : - mode(interpAttrs.mode), coordTransMode(interpAttrs.coordTransMode), configured_for_layout(interpAttrs.layout), - inputPrec(interpAttrs.inPrc), outputPrec(interpAttrs.outPrc) { + const VectorDims& srcDims, + const VectorDims& dstDims, + const std::vector& dataScales) + : mode(interpAttrs.mode), + coordTransMode(interpAttrs.coordTransMode), + configured_for_layout(interpAttrs.layout), + inputPrec(interpAttrs.inPrc), + outputPrec(interpAttrs.outPrc) { srcDimPad5d = to5Dim(getPaddedInputShape(srcDims, interpAttrs.padBegin, interpAttrs.padEnd)); dstDim5d = to5Dim(dstDims); srcDataSize = interpAttrs.inPrc.size(); @@ -3811,44 +4066,44 @@ Interpolate::InterpolateExecutorBase::InterpolateExecutorBase(const InterpolateA spatialDimSize = getSpatialDimsNum(dataRank); switch (mode) { - case InterpolateMode::nearest: { - buildTblNN(srcDimPad5d, dstDim5d, dataScales, interpAttrs.layout, interpAttrs.nearestMode); - break; - } - case InterpolateMode::linear_onnx: { - 
buildTblLinearOnnx(srcDimPad5d, dstDim5d, dataScales, interpAttrs.layout); - break; - } - case InterpolateMode::linear: { - static constexpr int LINEAR_KERNEL = 2; - buildTblLinear(srcDimPad5d, dstDim5d, dataScales, LINEAR_KERNEL, interpAttrs.antialias); - break; - } - case InterpolateMode::cubic: { - buildTblCubic(srcDimPad5d, dstDim5d, dataScales, interpAttrs.cubeCoeff, interpAttrs.layout); - break; - } - case InterpolateMode::bilinear_pillow: - case InterpolateMode::bicubic_pillow: { - buildTblPillow(srcDimPad5d, dstDim5d, dataScales, interpAttrs.cubeCoeff, interpAttrs.layout); - if ((srcDimPad5d[4] != dstDim5d[4]) && (srcDimPad5d[3] != dstDim5d[3])) { - create_pillow_working_buf(interpAttrs.layout); - } - break; - } - default: { - OPENVINO_THROW("Interpolate executor does not support interpolate mode: ", mode); - break; + case InterpolateMode::nearest: { + buildTblNN(srcDimPad5d, dstDim5d, dataScales, interpAttrs.layout, interpAttrs.nearestMode); + break; + } + case InterpolateMode::linear_onnx: { + buildTblLinearOnnx(srcDimPad5d, dstDim5d, dataScales, interpAttrs.layout); + break; + } + case InterpolateMode::linear: { + static constexpr int LINEAR_KERNEL = 2; + buildTblLinear(srcDimPad5d, dstDim5d, dataScales, LINEAR_KERNEL, interpAttrs.antialias); + break; + } + case InterpolateMode::cubic: { + buildTblCubic(srcDimPad5d, dstDim5d, dataScales, interpAttrs.cubeCoeff, interpAttrs.layout); + break; + } + case InterpolateMode::bilinear_pillow: + case InterpolateMode::bicubic_pillow: { + buildTblPillow(srcDimPad5d, dstDim5d, dataScales, interpAttrs.cubeCoeff, interpAttrs.layout); + if ((srcDimPad5d[4] != dstDim5d[4]) && (srcDimPad5d[3] != dstDim5d[3])) { + create_pillow_working_buf(interpAttrs.layout); } + break; + } + default: { + OPENVINO_THROW("Interpolate executor does not support interpolate mode: ", mode); + break; + } } } Interpolate::InterpolateJitExecutor::InterpolateJitExecutor(const InterpolateAttrs& interpAttrs, - const VectorDims &srcDims, - const VectorDims &dstDims, - const std::vector &dataScales, - const dnnl::primitive_attr &attr) : - InterpolateExecutorBase(interpAttrs, srcDims, dstDims, dataScales) { + const VectorDims& srcDims, + const VectorDims& dstDims, + const std::vector& dataScales, + const dnnl::primitive_attr& attr) + : InterpolateExecutorBase(interpAttrs, srcDims, dstDims, dataScales) { auto jcp = jit_interpolate_config_params(); jcp.mode = mode; jcp.src_prc = interpAttrs.inPrc; @@ -3885,7 +4140,7 @@ Interpolate::InterpolateJitExecutor::InterpolateJitExecutor(const InterpolateAtt } else { OPENVINO_THROW("Can't create InterpolateJitExecutor"); } -#endif // OPENVINO_ARCH_X86_64 +#endif // OPENVINO_ARCH_X86_64 if (interpolateKernel) { interpolateKernel->create_ker(); } else { @@ -3893,7 +4148,7 @@ Interpolate::InterpolateJitExecutor::InterpolateJitExecutor(const InterpolateAtt } } -void Interpolate::InterpolateJitExecutor::exec(const uint8_t *in_ptr_, uint8_t *out_ptr_, const void *post_ops_data_) { +void Interpolate::InterpolateJitExecutor::exec(const uint8_t* in_ptr_, uint8_t* out_ptr_, const void* post_ops_data_) { size_t N = srcDimPad5d[0], C = srcDimPad5d[1], ID = srcDimPad5d[2], IH = srcDimPad5d[3], IW = srcDimPad5d[4]; size_t OD = dstDim5d[2], OH = dstDim5d[3], OW = dstDim5d[4]; @@ -3901,103 +4156,115 @@ void Interpolate::InterpolateJitExecutor::exec(const uint8_t *in_ptr_, uint8_t * OPENVINO_THROW("Can't execute, kernel for Interpolate node is not compiled"); } switch (mode) { - case InterpolateMode::nearest: { - if (configured_for_layout == 
InterpolateLayoutType::planar) { - NNPlanar(in_ptr_, out_ptr_, post_ops_data_, N, C, ID, IH, IW, OD, OH, OW); - } else { - NNCGathered(in_ptr_, out_ptr_, post_ops_data_, N, C, ID, IH, IW, OD, OH, OW); - } - break; - } - case InterpolateMode::linear_onnx: { - if (configured_for_layout == InterpolateLayoutType::planar) { - linearOnnxPlanar(in_ptr_, out_ptr_, post_ops_data_, N, C, ID, IH, IW, OD, OH, OW); - } else { - linearOnnxCGathered(in_ptr_, out_ptr_, post_ops_data_, N, C, ID, IH, IW, OD, OH, OW); - } - break; + case InterpolateMode::nearest: { + if (configured_for_layout == InterpolateLayoutType::planar) { + NNPlanar(in_ptr_, out_ptr_, post_ops_data_, N, C, ID, IH, IW, OD, OH, OW); + } else { + NNCGathered(in_ptr_, out_ptr_, post_ops_data_, N, C, ID, IH, IW, OD, OH, OW); } - case InterpolateMode::cubic: { - if (configured_for_layout == InterpolateLayoutType::planar) { - cubicPlanar(in_ptr_, out_ptr_, post_ops_data_, N, C, IH, IW, OH, OW); - } else { - cubicCGathered(in_ptr_, out_ptr_, post_ops_data_, N, C, IH, IW, OH, OW); - } - break; + break; + } + case InterpolateMode::linear_onnx: { + if (configured_for_layout == InterpolateLayoutType::planar) { + linearOnnxPlanar(in_ptr_, out_ptr_, post_ops_data_, N, C, ID, IH, IW, OD, OH, OW); + } else { + linearOnnxCGathered(in_ptr_, out_ptr_, post_ops_data_, N, C, ID, IH, IW, OD, OH, OW); } - case InterpolateMode::bilinear_pillow: - case InterpolateMode::bicubic_pillow: { - if (configured_for_layout == InterpolateLayoutType::by_channel) { - pillowCGathered(in_ptr_, out_ptr_, post_ops_data_, N, C, IH, IW, OH, OW); - } else { - OPENVINO_THROW("Only channel_first jit kernel is supported for pillow mode", mode); - } - break; + break; + } + case InterpolateMode::cubic: { + if (configured_for_layout == InterpolateLayoutType::planar) { + cubicPlanar(in_ptr_, out_ptr_, post_ops_data_, N, C, IH, IW, OH, OW); + } else { + cubicCGathered(in_ptr_, out_ptr_, post_ops_data_, N, C, IH, IW, OH, OW); } - default: { - OPENVINO_THROW("InterpolateJitExecutor has unsupported interpolate mode: ", mode); + break; + } + case InterpolateMode::bilinear_pillow: + case InterpolateMode::bicubic_pillow: { + if (configured_for_layout == InterpolateLayoutType::by_channel) { + pillowCGathered(in_ptr_, out_ptr_, post_ops_data_, N, C, IH, IW, OH, OW); + } else { + OPENVINO_THROW("Only channel_first jit kernel is supported for pillow mode", mode); } + break; + } + default: { + OPENVINO_THROW("InterpolateJitExecutor has unsupported interpolate mode: ", mode); + } } } -void Interpolate::InterpolateRefExecutor::exec(const uint8_t *in_ptr_, uint8_t *out_ptr_, const void *post_ops_data_) { +void Interpolate::InterpolateRefExecutor::exec(const uint8_t* in_ptr_, uint8_t* out_ptr_, const void* post_ops_data_) { size_t N = srcDimPad5d[0], C = srcDimPad5d[1], ID = srcDimPad5d[2], IH = srcDimPad5d[3], IW = srcDimPad5d[4]; size_t OD = dstDim5d[2], OH = dstDim5d[3], OW = dstDim5d[4]; switch (mode) { - case InterpolateMode::nearest: { - NNRef(in_ptr_, out_ptr_, N, C, ID, IH, IW, OD, OH, OW); - break; - } - case InterpolateMode::linear_onnx: { - linearOnnxRef(in_ptr_, out_ptr_, N, C, ID, IH, IW, OD, OH, OW); - break; - } - case InterpolateMode::cubic: { - cubicRef(in_ptr_, out_ptr_, N, C, IH, IW, OH, OW); - break; - } - case InterpolateMode::linear: { - float fz = (dataRank == 5) ? 
dataScales[dataRank - 3] : 1.f; - float fy = dataScales[dataRank - 2]; - float fx = dataScales[dataRank - 1]; - - bool isDownsample = (fx < 1.f) || (fy < 1.f) || (fz < 1.f); - int kernel_width = 2; - linearInterpolation(in_ptr_, out_ptr_, N, C, ID, IH, IW, fx, fy, fz, OD, OH, OW, kernel_width, isDownsample && antialias); - break; - } - case InterpolateMode::bilinear_pillow: - case InterpolateMode::bicubic_pillow: { - pillowRef(in_ptr_, out_ptr_, N, C, IH, IW, OH, OW); - break; - } - default: { - OPENVINO_THROW("Interpolate layer has unsupported interpolate mode: ", mode); - } + case InterpolateMode::nearest: { + NNRef(in_ptr_, out_ptr_, N, C, ID, IH, IW, OD, OH, OW); + break; + } + case InterpolateMode::linear_onnx: { + linearOnnxRef(in_ptr_, out_ptr_, N, C, ID, IH, IW, OD, OH, OW); + break; + } + case InterpolateMode::cubic: { + cubicRef(in_ptr_, out_ptr_, N, C, IH, IW, OH, OW); + break; + } + case InterpolateMode::linear: { + float fz = (dataRank == 5) ? dataScales[dataRank - 3] : 1.f; + float fy = dataScales[dataRank - 2]; + float fx = dataScales[dataRank - 1]; + + bool isDownsample = (fx < 1.f) || (fy < 1.f) || (fz < 1.f); + int kernel_width = 2; + linearInterpolation(in_ptr_, + out_ptr_, + N, + C, + ID, + IH, + IW, + fx, + fy, + fz, + OD, + OH, + OW, + kernel_width, + isDownsample && antialias); + break; + } + case InterpolateMode::bilinear_pillow: + case InterpolateMode::bicubic_pillow: { + pillowRef(in_ptr_, out_ptr_, N, C, IH, IW, OH, OW); + break; + } + default: { + OPENVINO_THROW("Interpolate layer has unsupported interpolate mode: ", mode); + } } } size_t Interpolate::getSpatialDimsNum(const Dim rank) { switch (rank) { - case 1: - case 3: - return 1; - case 2: - case 4: - return 2; - case 5: - return 3; - default: - OPENVINO_THROW("Can't define number spatial"); + case 1: + case 3: + return 1; + case 2: + case 4: + return 2; + case 5: + return 3; + default: + OPENVINO_THROW("Can't define number spatial"); } } bool Interpolate::canFuse(const NodePtr& node) const { - if (!mayiuse(cpu::x64::sse41) || - interpAttrs.mode == InterpolateMode::linear || - interpAttrs.mode == InterpolateMode::bilinear_pillow || - interpAttrs.mode == InterpolateMode::bicubic_pillow || + if (!mayiuse(cpu::x64::sse41) || interpAttrs.mode == InterpolateMode::linear || + interpAttrs.mode == InterpolateMode::bilinear_pillow || interpAttrs.mode == InterpolateMode::bicubic_pillow || (!one_of(dataRank, 4u, 5u) && !mayiuse(cpu::x64::avx2))) { return false; } @@ -4009,6 +4276,6 @@ bool Interpolate::created() const { return getType() == Type::Interpolate; } -} // namespace node -} // namespace intel_cpu -} // namespace ov +} // namespace node +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/interpolate.h b/src/plugins/intel_cpu/src/nodes/interpolate.h index a43b354aa0306a..c6fedf384f449d 100644 --- a/src/plugins/intel_cpu/src/nodes/interpolate.h +++ b/src/plugins/intel_cpu/src/nodes/interpolate.h @@ -31,34 +31,36 @@ struct jit_interpolate_config_params { }; struct jit_interpolate_call_args { - const void *src_ptr[MAX_INPUT_INTERPOLATE]; - const void *weight_ptr[MAX_INPUT_INTERPOLATE]; - const int *index; - void *dst; + const void* src_ptr[MAX_INPUT_INTERPOLATE]; + const void* weight_ptr[MAX_INPUT_INTERPOLATE]; + const int* index; + void* dst; size_t work_amount; size_t oc_off; - //ptr to array of post op inputs pointers (flat list) + // ptr to array of post op inputs pointers (flat list) const void* post_op_data; }; struct jit_uni_interpolate_kernel { - void (*ker_)(const 
jit_interpolate_call_args *); + void (*ker_)(const jit_interpolate_call_args*); - void operator()(const jit_interpolate_call_args *args) { + void operator()(const jit_interpolate_call_args* args) { assert(ker_); ker_(args); } - explicit jit_uni_interpolate_kernel(jit_interpolate_config_params jcp, const dnnl_primitive_attr &attr) : ker_(nullptr), jcp_(jcp), attr_(attr) {} + explicit jit_uni_interpolate_kernel(jit_interpolate_config_params jcp, const dnnl_primitive_attr& attr) + : ker_(nullptr), + jcp_(jcp), + attr_(attr) {} virtual ~jit_uni_interpolate_kernel() {} virtual void create_ker() = 0; jit_interpolate_config_params jcp_; - const dnnl_primitive_attr &attr_; + const dnnl_primitive_attr& attr_; }; - class Interpolate : public Node { public: static constexpr size_t DATA_ID = 0; @@ -98,8 +100,9 @@ class Interpolate : public Node { bool is_version11 = true; InterpolateAttrs interpAttrs; // Some FEs or preprocessing step resize spatial dimension for tensor with NHWC layout memory, - // but imported as planar layout[abcd] with axis[1,2] for convenience. In this case, for pillow modes without pad for now, - // nhwc layout path and the kernel(nhwc layout executor) can be used for this planar layout and axis settings(NCHWAsNHWC is true) to get higher perf with + // but imported as planar layout[abcd] with axis[1,2] for convenience. In this case, for pillow modes without pad + // for now, nhwc layout path and the kernel(nhwc layout executor) can be used for this planar layout and axis + // settings(NCHWAsNHWC is true) to get higher perf with // 1. logical shape alignment [abcd-nhwc] to [adbc-nchw]. // 2. axis alignment [1,2] to [2,3]. // 3. config planar layout support and treated it as channel_first layout. @@ -107,120 +110,226 @@ class Interpolate : public Node { size_t dataRank = 0; class InterpolateExecutorBase { - public: - InterpolateExecutorBase(const InterpolateAttrs& interpAttrs, - const VectorDims &srcDims, - const VectorDims &dstDims, - const std::vector &dataScales); - - virtual void exec(const uint8_t *in_ptr_, uint8_t *out_ptr_, const void *post_ops_data_) = 0; - virtual ~InterpolateExecutorBase() = default; - VectorDims getSrcDimPad5d() const { return srcDimPad5d; } - - private: - void buildTblNN(const VectorDims& srcDimPad5d, const VectorDims& dstDim5d, const std::vector& dataScales, - InterpolateLayoutType layout, InterpolateNearestMode nearestMode); - void buildTblLinearOnnx(const VectorDims& srcDimPad5d, const VectorDims& dstDim5d, const std::vector& dataScales, - InterpolateLayoutType layout); - void buildTblLinear(const VectorDims& srcDimPad5d, const VectorDims& dstDim5d, const std::vector& dataScales, int kernel_width, - bool antialias); - void buildTblCubic(const VectorDims& srcDimPad5d, const VectorDims& dstDim5d, const std::vector& dataScales, float cubicCoeff, - InterpolateLayoutType layout); - void buildTblPillow(const VectorDims& srcDimPad5d, const VectorDims& dstDim5d, const std::vector& dataScales, - float cubicCoeff, InterpolateLayoutType layout); - - float coordTransToInput(int outCoord, float scale, int inShape, int outShape) const; - int nearestRound(float origin, bool isDownsample, InterpolateNearestMode nearestMode) const; - void linearOnnxCF(int outCoord, float scale, int inShape, int outShape, int& index0, int& index1, float& weight0, float& weight1); - std::vector getCubicCoeffs(float mantissa, float a); - static float getPillowBilinearCoeffs(float m); - static float getPillowBicubicCoeffs(float m); - inline void 
create_pillow_working_buf(InterpolateLayoutType layout); - - protected: - InterpolateMode mode; - InterpolateCoordTransMode coordTransMode; - InterpolateLayoutType configured_for_layout; - VectorDims srcDimPad5d, dstDim5d; - ov::element::Type inputPrec, outputPrec; - size_t srcDataSize, dstDataSize; - int spatialDimSize; - size_t dataRank; - std::vector auxTable; - std::vector pillow_working_buf; - size_t m_threads_num = 0lu; + public: + InterpolateExecutorBase(const InterpolateAttrs& interpAttrs, + const VectorDims& srcDims, + const VectorDims& dstDims, + const std::vector& dataScales); + + virtual void exec(const uint8_t* in_ptr_, uint8_t* out_ptr_, const void* post_ops_data_) = 0; + virtual ~InterpolateExecutorBase() = default; + VectorDims getSrcDimPad5d() const { + return srcDimPad5d; + } + + private: + void buildTblNN(const VectorDims& srcDimPad5d, + const VectorDims& dstDim5d, + const std::vector& dataScales, + InterpolateLayoutType layout, + InterpolateNearestMode nearestMode); + void buildTblLinearOnnx(const VectorDims& srcDimPad5d, + const VectorDims& dstDim5d, + const std::vector& dataScales, + InterpolateLayoutType layout); + void buildTblLinear(const VectorDims& srcDimPad5d, + const VectorDims& dstDim5d, + const std::vector& dataScales, + int kernel_width, + bool antialias); + void buildTblCubic(const VectorDims& srcDimPad5d, + const VectorDims& dstDim5d, + const std::vector& dataScales, + float cubicCoeff, + InterpolateLayoutType layout); + void buildTblPillow(const VectorDims& srcDimPad5d, + const VectorDims& dstDim5d, + const std::vector& dataScales, + float cubicCoeff, + InterpolateLayoutType layout); + + float coordTransToInput(int outCoord, float scale, int inShape, int outShape) const; + int nearestRound(float origin, bool isDownsample, InterpolateNearestMode nearestMode) const; + void linearOnnxCF(int outCoord, + float scale, + int inShape, + int outShape, + int& index0, + int& index1, + float& weight0, + float& weight1); + std::vector getCubicCoeffs(float mantissa, float a); + static float getPillowBilinearCoeffs(float m); + static float getPillowBicubicCoeffs(float m); + inline void create_pillow_working_buf(InterpolateLayoutType layout); + + protected: + InterpolateMode mode; + InterpolateCoordTransMode coordTransMode; + InterpolateLayoutType configured_for_layout; + VectorDims srcDimPad5d, dstDim5d; + ov::element::Type inputPrec, outputPrec; + size_t srcDataSize, dstDataSize; + int spatialDimSize; + size_t dataRank; + std::vector auxTable; + std::vector pillow_working_buf; + size_t m_threads_num = 0lu; }; std::shared_ptr execPtr = nullptr; class InterpolateJitExecutor : public InterpolateExecutorBase { - public: - InterpolateJitExecutor(const InterpolateAttrs& interpAttrs, - const VectorDims &srcDims, - const VectorDims &dstDims, - const std::vector &dataScales, - const dnnl::primitive_attr &attr); - - void exec(const uint8_t *in_ptr_, uint8_t *out_ptr_, const void *post_ops_data_) override; - - private: - // nearest neighbor - void NNPlanar(const uint8_t *in_ptr_, uint8_t *out_ptr_, const void *post_ops_data_, - int B, int C, int ID, int IH, int IW, int OD, int OH, int OW); - void NNCGathered(const uint8_t *in_ptr_, uint8_t *out_ptr_, const void *post_ops_data_, - int B, int C, int ID, int IH, int IW, int OD, int OH, int OW); - - // onnx linear - void linearOnnxPlanar(const uint8_t *in_ptr_, uint8_t *out_ptr_, const void *post_ops_data_, - int B, int C, int ID, int IH, int IW, int OD, int OH, int OW); - void linearOnnxCGathered(const uint8_t *in_ptr_, uint8_t 
*out_ptr_, const void *post_ops_data_, - int B, int C, int ID, int IH, int IW, int OD, int OH, int OW); - - // cubic - void cubicPlanar(const uint8_t *in_ptr_, uint8_t *out_ptr_, const void *post_ops_data_, - int B, int C, int IH, int IW, int OH, int OW); - void cubicCGathered(const uint8_t *in_ptr_, uint8_t *out_ptr_, const void *post_ops_data_, - int B, int C, int IH, int IW, int OH, int OW); - - // pillow bilinear and pillow bicubic - void pillowCGathered(const uint8_t *in_ptr_, uint8_t *out_ptr_, const void *post_ops_data_, - int B, int C, int IH, int IW, int OH, int OW); - - private: - std::shared_ptr interpolateKernel = nullptr; + public: + InterpolateJitExecutor(const InterpolateAttrs& interpAttrs, + const VectorDims& srcDims, + const VectorDims& dstDims, + const std::vector& dataScales, + const dnnl::primitive_attr& attr); + + void exec(const uint8_t* in_ptr_, uint8_t* out_ptr_, const void* post_ops_data_) override; + + private: + // nearest neighbor + void NNPlanar(const uint8_t* in_ptr_, + uint8_t* out_ptr_, + const void* post_ops_data_, + int B, + int C, + int ID, + int IH, + int IW, + int OD, + int OH, + int OW); + void NNCGathered(const uint8_t* in_ptr_, + uint8_t* out_ptr_, + const void* post_ops_data_, + int B, + int C, + int ID, + int IH, + int IW, + int OD, + int OH, + int OW); + + // onnx linear + void linearOnnxPlanar(const uint8_t* in_ptr_, + uint8_t* out_ptr_, + const void* post_ops_data_, + int B, + int C, + int ID, + int IH, + int IW, + int OD, + int OH, + int OW); + void linearOnnxCGathered(const uint8_t* in_ptr_, + uint8_t* out_ptr_, + const void* post_ops_data_, + int B, + int C, + int ID, + int IH, + int IW, + int OD, + int OH, + int OW); + + // cubic + void cubicPlanar(const uint8_t* in_ptr_, + uint8_t* out_ptr_, + const void* post_ops_data_, + int B, + int C, + int IH, + int IW, + int OH, + int OW); + void cubicCGathered(const uint8_t* in_ptr_, + uint8_t* out_ptr_, + const void* post_ops_data_, + int B, + int C, + int IH, + int IW, + int OH, + int OW); + + // pillow bilinear and pillow bicubic + void pillowCGathered(const uint8_t* in_ptr_, + uint8_t* out_ptr_, + const void* post_ops_data_, + int B, + int C, + int IH, + int IW, + int OH, + int OW); + + private: + std::shared_ptr interpolateKernel = nullptr; }; class InterpolateRefExecutor : public InterpolateExecutorBase { - public: - InterpolateRefExecutor(const InterpolateAttrs& interpAttrs, - const VectorDims &srcDims, - const VectorDims &dstDims, - const std::vector &_dataScales) : - InterpolateExecutorBase(interpAttrs, srcDims, dstDims, _dataScales), - antialias(interpAttrs.antialias), dataScales(_dataScales) {} - - void exec(const uint8_t *in_ptr_, uint8_t *out_ptr_, const void *post_ops_data_) override; - - private: - void NNRef(const uint8_t *in_ptr_, uint8_t *out_ptr_, int B, int C, int ID, int IH, int IW, int OD, int OH, int OW); - void linearOnnxRef(const uint8_t *in_ptr_, uint8_t *out_ptr_, int B, int C, int ID, int IH, int IW, int OD, int OH, int OW); - - void cubicRef(const uint8_t *in_ptr_, uint8_t *out_ptr_, int B, int C, int IH, int IW, int OH, int OW); - void linearInterpolation(const uint8_t *in_ptr_, uint8_t *out_ptr_, int B, int C, int ID, int IH, int IW, - float fx, float fy, float fz, int OD, int OH, int OW, int kernel_width, bool antialias); - void pillowRef(const uint8_t *in_ptr_, uint8_t *out_ptr_, int B, int C, int IH, int IW, int OH, int OW); - - static float getValue(const uint8_t *base, size_t offset, ov::element::Type prec); - static void setValue(uint8_t *base, size_t offset, 
float value, ov::element::Type prec); - - private: - bool antialias; - std::vector dataScales; + public: + InterpolateRefExecutor(const InterpolateAttrs& interpAttrs, + const VectorDims& srcDims, + const VectorDims& dstDims, + const std::vector& _dataScales) + : InterpolateExecutorBase(interpAttrs, srcDims, dstDims, _dataScales), + antialias(interpAttrs.antialias), + dataScales(_dataScales) {} + + void exec(const uint8_t* in_ptr_, uint8_t* out_ptr_, const void* post_ops_data_) override; + + private: + void + NNRef(const uint8_t* in_ptr_, uint8_t* out_ptr_, int B, int C, int ID, int IH, int IW, int OD, int OH, int OW); + void linearOnnxRef(const uint8_t* in_ptr_, + uint8_t* out_ptr_, + int B, + int C, + int ID, + int IH, + int IW, + int OD, + int OH, + int OW); + + void cubicRef(const uint8_t* in_ptr_, uint8_t* out_ptr_, int B, int C, int IH, int IW, int OH, int OW); + void linearInterpolation(const uint8_t* in_ptr_, + uint8_t* out_ptr_, + int B, + int C, + int ID, + int IH, + int IW, + float fx, + float fy, + float fz, + int OD, + int OH, + int OW, + int kernel_width, + bool antialias); + void pillowRef(const uint8_t* in_ptr_, uint8_t* out_ptr_, int B, int C, int IH, int IW, int OH, int OW); + + static float getValue(const uint8_t* base, size_t offset, ov::element::Type prec); + static void setValue(uint8_t* base, size_t offset, float value, ov::element::Type prec); + + private: + bool antialias; + std::vector dataScales; }; - void setPostOps(dnnl::primitive_attr &attr, const VectorDims &dims); + void setPostOps(dnnl::primitive_attr& attr, const VectorDims& dims); - static VectorDims getPaddedInputShape(const VectorDims &srcDims, const std::vector &padBegin, const std::vector &padEnd); - std::vector getScales(const VectorDims &srcDimPad, const VectorDims &dstDim); + static VectorDims getPaddedInputShape(const VectorDims& srcDims, + const std::vector& padBegin, + const std::vector& padEnd); + std::vector getScales(const VectorDims& srcDimPad, const VectorDims& dstDim); static size_t getSpatialDimsNum(const Dim rank); bool hasPad = false; @@ -244,6 +353,6 @@ class Interpolate : public Node { std::shared_ptr aclExecPtr = nullptr; }; -} // namespace node -} // namespace intel_cpu -} // namespace ov +} // namespace node +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/kernels/aarch64/jit_uni_eltwise_generic.cpp b/src/plugins/intel_cpu/src/nodes/kernels/aarch64/jit_uni_eltwise_generic.cpp index 7ac3b603353541..4b4b07df572b4a 100644 --- a/src/plugins/intel_cpu/src/nodes/kernels/aarch64/jit_uni_eltwise_generic.cpp +++ b/src/plugins/intel_cpu/src/nodes/kernels/aarch64/jit_uni_eltwise_generic.cpp @@ -12,9 +12,8 @@ using namespace Xbyak_aarch64; using namespace dnnl::impl::cpu; using namespace dnnl::impl::cpu::aarch64; -void jit_uni_eltwise_kernel::operator()( - const node::jit_eltwise_call_args_ptrs* const_args, - const jit_eltwise_call_args_indexes* indexes) { +void jit_uni_eltwise_kernel::operator()(const node::jit_eltwise_call_args_ptrs* const_args, + const jit_eltwise_call_args_indexes* indexes) { assert(ker_); ker_(const_args, indexes); } @@ -23,12 +22,12 @@ template jit_uni_eltwise_generic::jit_uni_eltwise_generic(const jit_eltwise_params& jep, const std::vector& eltwise_data, const std::vector& ops_list, - const dnnl::post_ops& post_ops) : - jit_uni_eltwise_kernel(jep), - jit_generator(), - eltwise_data_(eltwise_data), - ops_list_(ops_list), - post_ops_(post_ops) {} + const dnnl::post_ops& post_ops) + : jit_uni_eltwise_kernel(jep), + jit_generator(), + 
eltwise_data_(eltwise_data), + ops_list_(ops_list), + post_ops_(post_ops) {} template void jit_uni_eltwise_generic::generate() { @@ -41,7 +40,7 @@ void jit_uni_eltwise_generic::generate() { post_op_emitters.push_back(create_eltwise_emitter(eltwise_data_[i], exec_prc)); } - const auto &jep = jep_; + const auto& jep = jep_; XReg param2 = abi_param2; const int offset_count = jep.input_size - 1; @@ -49,10 +48,15 @@ void jit_uni_eltwise_generic::generate() { // ptrs initializing if (jep.use_runtime_ptrs) { for (size_t i = 0; i < jep.inputs_number; i++) { - ldr(start_to_offsets, ptr(reg_const_params, static_cast(offsetof(node::jit_eltwise_call_args_ptrs, src_offsets) + i * sizeof(size_t)))); - ldr(get_src_reg(i), ptr(reg_const_params, static_cast(offsetof(node::jit_eltwise_call_args_ptrs, src_ptr[0]) + i * sizeof(size_t)))); - XReg offset_reg = get_aux_gpr(0); // X_TMP_0; - XReg index_reg = get_aux_gpr(1); // X_TMP_1; + ldr(start_to_offsets, + ptr(reg_const_params, + static_cast(offsetof(node::jit_eltwise_call_args_ptrs, src_offsets) + + i * sizeof(size_t)))); + ldr(get_src_reg(i), + ptr(reg_const_params, + static_cast(offsetof(node::jit_eltwise_call_args_ptrs, src_ptr[0]) + i * sizeof(size_t)))); + XReg offset_reg = get_aux_gpr(0); // X_TMP_0; + XReg index_reg = get_aux_gpr(1); // X_TMP_1; for (int j = 0; j < offset_count; j++) { ldr(offset_reg, ptr(start_to_offsets, static_cast(j * sizeof(size_t)))); ldr(index_reg, ptr(reg_indexes, static_cast(j * sizeof(size_t)))); @@ -60,10 +64,11 @@ void jit_uni_eltwise_generic::generate() { } } - ldr(start_to_offsets, ptr(reg_const_params, static_cast(offsetof(node::jit_eltwise_call_args_ptrs, dst_offsets)))); + ldr(start_to_offsets, + ptr(reg_const_params, static_cast(offsetof(node::jit_eltwise_call_args_ptrs, dst_offsets)))); ldr(reg_dst, ptr(reg_const_params, static_cast(offsetof(node::jit_eltwise_call_args_ptrs, dst_ptr)))); - XReg offset_reg = get_aux_gpr(0); // X_TMP_0; - XReg index_reg = get_aux_gpr(1); // X_TMP_1; + XReg offset_reg = get_aux_gpr(0); // X_TMP_0; + XReg index_reg = get_aux_gpr(1); // X_TMP_1; for (int j = 0; j < offset_count; j++) { ldr(offset_reg, ptr(start_to_offsets, static_cast(j * sizeof(size_t)))); ldr(index_reg, ptr(reg_indexes, static_cast(j * sizeof(size_t)))); @@ -72,7 +77,8 @@ void jit_uni_eltwise_generic::generate() { mov(reg_oc_off, 0); - ldr(reg_work_amount, ptr(reg_const_params, static_cast(offsetof(node::jit_eltwise_call_args_ptrs, work_amount)))); + ldr(reg_work_amount, + ptr(reg_const_params, static_cast(offsetof(node::jit_eltwise_call_args_ptrs, work_amount)))); } else { auto init_ptrs_with_offsets = [this, offset_count, param2](XReg pointer, const std::vector& offsets) { for (int j = 0; j < offset_count; j++) { @@ -88,7 +94,9 @@ void jit_uni_eltwise_generic::generate() { }; for (size_t i = 0; i < jep.inputs_number; i++) { - ldr(get_src_reg(i), ptr(param1, static_cast(offsetof(node::jit_eltwise_call_args_ptrs, src_ptr) + i * sizeof(size_t)))); + ldr(get_src_reg(i), + ptr(param1, + static_cast(offsetof(node::jit_eltwise_call_args_ptrs, src_ptr) + i * sizeof(size_t)))); init_ptrs_with_offsets(get_src_reg(i), jep.src_offsets[i]); } @@ -149,7 +157,12 @@ void jit_uni_eltwise_generic::generate() { for (size_t j = 0; j < min_src_size / vec_step; j++) { for (size_t i = 0; i < jep.inputs_number; i++) { if (jep.src_size[i] != 1) { - load_vector(get_vmm_reg(i), get_src_reg(i), jep.src_prc[i], exec_prc, false, j * vec_step * jep.src_prc[i].size()); + load_vector(get_vmm_reg(i), + get_src_reg(i), + jep.src_prc[i], + exec_prc, + 
false, + j * vec_step * jep.src_prc[i].size()); } } @@ -164,7 +177,11 @@ void jit_uni_eltwise_generic::generate() { for (size_t j = tail_start; j < min_src_size; j++) { for (size_t i = 0; i < jep.inputs_number; i++) { if (jep.src_size[i] != 1) { - load_scalar(get_scl_reg(i), get_src_reg(i), jep.src_prc[i], exec_prc, j * jep.src_prc[i].size()); + load_scalar(get_scl_reg(i), + get_src_reg(i), + jep.src_prc[i], + exec_prc, + j * jep.src_prc[i].size()); } } @@ -276,7 +293,7 @@ namespace utils { template void load_vector(const T1& data_lane, const T2& data_lanes, - const Xbyak_aarch64::XReg &ptr_reg, + const Xbyak_aarch64::XReg& ptr_reg, const int64_t offset, const bool broadcast, jit_generator* h) { @@ -296,7 +313,7 @@ void load_vector(const T1& data_lane, } } } -} // namespace utils +} // namespace utils template void jit_uni_eltwise_generic::load_vector(const TReg& data, @@ -306,62 +323,63 @@ void jit_uni_eltwise_generic::load_vector(const TReg& data, const bool broadcast, const int32_t ptr_offset) { switch (src_prc) { - case ov::element::f16: { - utils::load_vector(data.h, data.h4, ptr_reg, ptr_offset, broadcast, this); - break; - } - case ov::element::f32: - case ov::element::i32: { - if (broadcast) { - jit_generator::uni_ld1rw(data.s, ptr_reg, ptr_offset); - } else { - jit_generator::uni_ldr(data, ptr_reg, ptr_offset); - } - break; - } - case ov::element::i8: { - utils::load_vector(data.b, data.s, ptr_reg, ptr_offset, broadcast, this); - sshll(data.h8, data.b8, 0); - sshll(data.s4, data.h4, 0); - break; - } - case ov::element::u8: { - utils::load_vector(data.b, data.s, ptr_reg, ptr_offset, broadcast, this); - ushll(data.h8, data.b8, 0); - ushll(data.s4, data.h4, 0); - break; - } - default: { - OPENVINO_THROW("src_prc " + src_prc.to_string() + " is not supported, dst_prc is " + dst_prc.to_string()); + case ov::element::f16: { + utils::load_vector(data.h, data.h4, ptr_reg, ptr_offset, broadcast, this); + break; + } + case ov::element::f32: + case ov::element::i32: { + if (broadcast) { + jit_generator::uni_ld1rw(data.s, ptr_reg, ptr_offset); + } else { + jit_generator::uni_ldr(data, ptr_reg, ptr_offset); } + break; + } + case ov::element::i8: { + utils::load_vector(data.b, data.s, ptr_reg, ptr_offset, broadcast, this); + sshll(data.h8, data.b8, 0); + sshll(data.s4, data.h4, 0); + break; + } + case ov::element::u8: { + utils::load_vector(data.b, data.s, ptr_reg, ptr_offset, broadcast, this); + ushll(data.h8, data.b8, 0); + ushll(data.s4, data.h4, 0); + break; + } + default: { + OPENVINO_THROW("src_prc " + src_prc.to_string() + " is not supported, dst_prc is " + dst_prc.to_string()); + } } if (dst_prc != src_prc) { switch (dst_prc) { - case ov::element::f32: - switch (src_prc) { - case ov::element::f16: { - fcvtl(data.s4, data.h4); - break; - } - case ov::element::i32: { - scvtf(data.s, data.s); - break; - } - case ov::element::i8: { - scvtf(data.s, data.s); - break; - } - case ov::element::u8: { - ucvtf(data.s, data.s); - break; - } - default: - OPENVINO_THROW("src_prc " + src_prc.to_string() + " is not supported, dst_prc is " + dst_prc.to_string()); - } + case ov::element::f32: + switch (src_prc) { + case ov::element::f16: { + fcvtl(data.s4, data.h4); + break; + } + case ov::element::i32: { + scvtf(data.s, data.s); break; + } + case ov::element::i8: { + scvtf(data.s, data.s); + break; + } + case ov::element::u8: { + ucvtf(data.s, data.s); + break; + } default: - OPENVINO_THROW("dst_prc " + dst_prc.to_string() + " is not supported, src_prc is " + src_prc.to_string()); + 
OPENVINO_THROW("src_prc " + src_prc.to_string() + " is not supported, dst_prc is " + + dst_prc.to_string()); + } + break; + default: + OPENVINO_THROW("dst_prc " + dst_prc.to_string() + " is not supported, src_prc is " + src_prc.to_string()); } } } @@ -373,61 +391,62 @@ void jit_uni_eltwise_generic::load_scalar(const SReg& data, const ov::element::Type& dst_prc, const int32_t ptr_offset) { switch (src_prc) { - case ov::element::f16: { - ldr(Xbyak_aarch64::HReg(data.getIdx()), Xbyak_aarch64::ptr(ptr, ptr_offset)); - break; - } - case ov::element::f32: - case ov::element::i32: { - ldr(data, Xbyak_aarch64::ptr(ptr, ptr_offset)); - break; - } - case ov::element::i8: { - ldr(Xbyak_aarch64::BReg(data.getIdx()), Xbyak_aarch64::ptr(ptr, ptr_offset)); + case ov::element::f16: { + ldr(Xbyak_aarch64::HReg(data.getIdx()), Xbyak_aarch64::ptr(ptr, ptr_offset)); + break; + } + case ov::element::f32: + case ov::element::i32: { + ldr(data, Xbyak_aarch64::ptr(ptr, ptr_offset)); + break; + } + case ov::element::i8: { + ldr(Xbyak_aarch64::BReg(data.getIdx()), Xbyak_aarch64::ptr(ptr, ptr_offset)); - // scalar is loaded, operates with vector - TReg vec(data.getIdx()); - sshll(vec.h8, vec.b8, 0); - sshll(vec.s4, vec.h4, 0); - break; - } - case ov::element::u8: { - ldr(Xbyak_aarch64::BReg(data.getIdx()), Xbyak_aarch64::ptr(ptr, ptr_offset)); + // scalar is loaded, operates with vector + TReg vec(data.getIdx()); + sshll(vec.h8, vec.b8, 0); + sshll(vec.s4, vec.h4, 0); + break; + } + case ov::element::u8: { + ldr(Xbyak_aarch64::BReg(data.getIdx()), Xbyak_aarch64::ptr(ptr, ptr_offset)); - // scalar is loaded, operates with vector - TReg vec(data.getIdx()); - ushll(vec.h8, vec.b8, 0); - ushll(vec.s4, vec.h4, 0); - break; - } - default: { - OPENVINO_THROW("src_prc " + src_prc.to_string() + " is not supported, dst_prc is " + dst_prc.to_string()); - } + // scalar is loaded, operates with vector + TReg vec(data.getIdx()); + ushll(vec.h8, vec.b8, 0); + ushll(vec.s4, vec.h4, 0); + break; + } + default: { + OPENVINO_THROW("src_prc " + src_prc.to_string() + " is not supported, dst_prc is " + dst_prc.to_string()); + } } if (dst_prc != src_prc) { switch (dst_prc) { - case ov::element::f32: - switch (src_prc) { - case ov::element::f16: { - fcvt(Xbyak_aarch64::SReg(data.getIdx()), Xbyak_aarch64::HReg(data.getIdx())); - break; - } - case ov::element::i32: - case ov::element::i8: { - scvtf(Xbyak_aarch64::SReg(data.getIdx()), Xbyak_aarch64::SReg(data.getIdx())); - break; - } - case ov::element::u8: { - ucvtf(Xbyak_aarch64::SReg(data.getIdx()), Xbyak_aarch64::SReg(data.getIdx())); - break; - } - default: - OPENVINO_THROW("src_prc " + src_prc.to_string() + " is not supported, dst_prc is " + dst_prc.to_string()); - } + case ov::element::f32: + switch (src_prc) { + case ov::element::f16: { + fcvt(Xbyak_aarch64::SReg(data.getIdx()), Xbyak_aarch64::HReg(data.getIdx())); + break; + } + case ov::element::i32: + case ov::element::i8: { + scvtf(Xbyak_aarch64::SReg(data.getIdx()), Xbyak_aarch64::SReg(data.getIdx())); + break; + } + case ov::element::u8: { + ucvtf(Xbyak_aarch64::SReg(data.getIdx()), Xbyak_aarch64::SReg(data.getIdx())); break; + } default: - OPENVINO_THROW("dst_prc " + dst_prc.to_string() + " is not supported, src_prc is " + src_prc.to_string()); + OPENVINO_THROW("src_prc " + src_prc.to_string() + " is not supported, dst_prc is " + + dst_prc.to_string()); + } + break; + default: + OPENVINO_THROW("dst_prc " + dst_prc.to_string() + " is not supported, src_prc is " + src_prc.to_string()); } } } @@ -440,58 +459,59 @@ void 
jit_uni_eltwise_generic::store_vector(const XReg& ptr, const int32_t ptr_offset) { if (src_prc != dst_prc) { switch (src_prc) { - case ov::element::f32: { - switch (dst_prc) { - case ov::element::f16: { - fcvtn(data.h4, data.s4); - break; - } - case ov::element::i32: { - fcvtns(data.s, data.s); - break; - } - case ov::element::i8: { - fcvtms(data.s, data.s); - xtn(data.h4, data.s4); - xtn(data.b8, data.h8); - break; - } - case ov::element::u8: { - fcvtmu(data.s, data.s); - xtn(data.h4, data.s4); - xtn(data.b8, data.h8); - break; - } - default: { - OPENVINO_THROW("dst_prc " + dst_prc.to_string() + " is not supported, src_prc is " + src_prc.to_string()); - } - } + case ov::element::f32: { + switch (dst_prc) { + case ov::element::f16: { + fcvtn(data.h4, data.s4); + break; + } + case ov::element::i32: { + fcvtns(data.s, data.s); + break; + } + case ov::element::i8: { + fcvtms(data.s, data.s); + xtn(data.h4, data.s4); + xtn(data.b8, data.h8); + break; + } + case ov::element::u8: { + fcvtmu(data.s, data.s); + xtn(data.h4, data.s4); + xtn(data.b8, data.h8); break; } default: { - OPENVINO_THROW("src_prc " + src_prc.to_string() + " is not supported, dst_prc is " + dst_prc.to_string()); + OPENVINO_THROW("dst_prc " + dst_prc.to_string() + " is not supported, src_prc is " + + src_prc.to_string()); + } } - } - } - - switch (dst_prc) { - case ov::element::f16: { - str(Xbyak_aarch64::DReg(data.getIdx()), Xbyak_aarch64::ptr(ptr, ptr_offset)); - break; - } - case ov::element::f32: - case ov::element::i32: { - str(Xbyak_aarch64::QReg(data.getIdx()), Xbyak_aarch64::ptr(ptr, ptr_offset)); - break; - } - case ov::element::i8: - case ov::element::u8: { - str(Xbyak_aarch64::SReg(data.getIdx()), Xbyak_aarch64::ptr(ptr, ptr_offset)); break; } default: { - OPENVINO_THROW("dst_prc " + dst_prc.to_string() + " is not supported, src_ptr is " + src_prc.to_string()); + OPENVINO_THROW("src_prc " + src_prc.to_string() + " is not supported, dst_prc is " + dst_prc.to_string()); } + } + } + + switch (dst_prc) { + case ov::element::f16: { + str(Xbyak_aarch64::DReg(data.getIdx()), Xbyak_aarch64::ptr(ptr, ptr_offset)); + break; + } + case ov::element::f32: + case ov::element::i32: { + str(Xbyak_aarch64::QReg(data.getIdx()), Xbyak_aarch64::ptr(ptr, ptr_offset)); + break; + } + case ov::element::i8: + case ov::element::u8: { + str(Xbyak_aarch64::SReg(data.getIdx()), Xbyak_aarch64::ptr(ptr, ptr_offset)); + break; + } + default: { + OPENVINO_THROW("dst_prc " + dst_prc.to_string() + " is not supported, src_ptr is " + src_prc.to_string()); + } } } @@ -503,99 +523,94 @@ void jit_uni_eltwise_generic::store_scalar(const XReg& ptr, const int32_t ptr_offset) { if (src_prc != dst_prc) { switch (src_prc) { - case ov::element::f32: { - switch (dst_prc) { - case ov::element::f16: { - fcvt(Xbyak_aarch64::HReg(data.getIdx()), data); - break; - } - case ov::element::i32: { - fcvtns(data, data); - break; - } - case ov::element::i8: { - TReg vec_data(data.getIdx()); - fcvtms(vec_data.s, vec_data.s); - xtn(vec_data.h4, vec_data.s4); - xtn(vec_data.b8, vec_data.h8); - break; - } - case ov::element::u8: { - TReg vec_data(data.getIdx()); - fcvtmu(vec_data.s, vec_data.s); - xtn(vec_data.h4, vec_data.s4); - xtn(vec_data.b8, vec_data.h8); - break; - } - default: { - OPENVINO_THROW("dst_prc " + dst_prc.to_string() + " is not supported, src_prc is " + src_prc.to_string()); - } - } + case ov::element::f32: { + switch (dst_prc) { + case ov::element::f16: { + fcvt(Xbyak_aarch64::HReg(data.getIdx()), data); + break; + } + case ov::element::i32: { + fcvtns(data, 
data); + break; + } + case ov::element::i8: { + TReg vec_data(data.getIdx()); + fcvtms(vec_data.s, vec_data.s); + xtn(vec_data.h4, vec_data.s4); + xtn(vec_data.b8, vec_data.h8); + break; + } + case ov::element::u8: { + TReg vec_data(data.getIdx()); + fcvtmu(vec_data.s, vec_data.s); + xtn(vec_data.h4, vec_data.s4); + xtn(vec_data.b8, vec_data.h8); break; } default: { - OPENVINO_THROW("src_prc " + src_prc.to_string() + " is not supported, dst_prc is " + dst_prc.to_string()); + OPENVINO_THROW("dst_prc " + dst_prc.to_string() + " is not supported, src_prc is " + + src_prc.to_string()); + } } - } - } - - switch (dst_prc) { - case ov::element::f16: { - str(Xbyak_aarch64::HReg(data.getIdx()), Xbyak_aarch64::ptr(ptr, ptr_offset)); - break; - } - case ov::element::i32: - case ov::element::f32: { - str(data, Xbyak_aarch64::ptr(ptr, ptr_offset)); - break; - } - case ov::element::i8: - case ov::element::u8: { - str(Xbyak_aarch64::BReg(data.getIdx()), Xbyak_aarch64::ptr(ptr, ptr_offset)); break; } default: { - OPENVINO_THROW("dst_prc " + src_prc.to_string() + " is not supported, src_prc is " + src_prc.to_string()); + OPENVINO_THROW("src_prc " + src_prc.to_string() + " is not supported, dst_prc is " + dst_prc.to_string()); } + } + } + + switch (dst_prc) { + case ov::element::f16: { + str(Xbyak_aarch64::HReg(data.getIdx()), Xbyak_aarch64::ptr(ptr, ptr_offset)); + break; + } + case ov::element::i32: + case ov::element::f32: { + str(data, Xbyak_aarch64::ptr(ptr, ptr_offset)); + break; + } + case ov::element::i8: + case ov::element::u8: { + str(Xbyak_aarch64::BReg(data.getIdx()), Xbyak_aarch64::ptr(ptr, ptr_offset)); + break; + } + default: { + OPENVINO_THROW("dst_prc " + src_prc.to_string() + " is not supported, src_prc is " + src_prc.to_string()); + } } } struct EltwiseEmitterContext { std::shared_ptr emitter; - dnnl::impl::cpu::aarch64::jit_generator *host; + dnnl::impl::cpu::aarch64::jit_generator* host; dnnl::impl::cpu::aarch64::cpu_isa_t host_isa; const EltwiseData& opData; ov::element::Type exec_prc; }; -template +template struct EltwiseEmitter { void operator()(EltwiseEmitterContext& ctx) { ctx.emitter = std::make_shared(ctx.host, ctx.host_isa, ctx.exec_prc); } }; -template<> +template <> struct EltwiseEmitter { void operator()(EltwiseEmitterContext& ctx) { - ctx.emitter = std::make_shared(ctx.host, - ctx.host_isa, - ctx.opData.alpha, - ctx.exec_prc); + ctx.emitter = std::make_shared(ctx.host, ctx.host_isa, ctx.opData.alpha, ctx.exec_prc); } }; -template<> +template <> struct EltwiseEmitter { void operator()(EltwiseEmitterContext& ctx) { - ctx.emitter = std::make_shared(ctx.host, - ctx.host_isa, - ctx.opData.alpha, - ctx.exec_prc); + ctx.emitter = std::make_shared(ctx.host, ctx.host_isa, ctx.opData.alpha, ctx.exec_prc); } }; -template<> +template <> struct EltwiseEmitter { void operator()(EltwiseEmitterContext& ctx) { ctx.emitter = std::make_shared(ctx.host, @@ -606,7 +621,7 @@ struct EltwiseEmitter { } }; -template<> +template <> struct EltwiseEmitter { void operator()(EltwiseEmitterContext& ctx) { ctx.emitter = std::make_shared(ctx.host, @@ -618,7 +633,7 @@ struct EltwiseEmitter { } }; -template<> +template <> struct EltwiseEmitter { void operator()(EltwiseEmitterContext& ctx) { ctx.emitter = std::make_shared(ctx.host, @@ -630,53 +645,56 @@ struct EltwiseEmitter { }; template -std::shared_ptr jit_uni_eltwise_generic::create_eltwise_emitter(const EltwiseData& data, const ov::element::Type& exec_prec) { - EltwiseEmitterContext ctx = { - nullptr, - this, - isa, - data, - exec_prec - }; - - 
OV_SWITCH(intel_cpu, EltwiseEmitter, ctx, data.algo, - OV_CASE(Algorithm::EltwiseAbs, ov::intel_cpu::aarch64::jit_abs_emitter), - OV_CASE(Algorithm::EltwiseAdd, ov::intel_cpu::aarch64::jit_add_emitter), - OV_CASE(Algorithm::EltwiseClamp, ov::intel_cpu::aarch64::jit_clamp_emitter), - OV_CASE(Algorithm::EltwiseDivide, ov::intel_cpu::aarch64::jit_divide_emitter), - OV_CASE(Algorithm::EltwiseElu, ov::intel_cpu::aarch64::jit_elu_emitter), - OV_CASE(Algorithm::EltwiseEqual, ov::intel_cpu::aarch64::jit_equal_emitter), - OV_CASE(Algorithm::EltwiseExp, ov::intel_cpu::aarch64::jit_exp_emitter), - OV_CASE(Algorithm::EltwiseFloor, ov::intel_cpu::aarch64::jit_floor_emitter), - OV_CASE(Algorithm::EltwiseCeiling, ov::intel_cpu::aarch64::jit_ceiling_emitter), - OV_CASE(Algorithm::EltwiseHswish, ov::intel_cpu::aarch64::jit_hswish_emitter), - OV_CASE(Algorithm::EltwiseIsFinite, ov::intel_cpu::aarch64::jit_is_finite_emitter), - OV_CASE(Algorithm::EltwiseIsInf, ov::intel_cpu::aarch64::jit_is_inf_emitter), - OV_CASE(Algorithm::EltwiseLessEqual, ov::intel_cpu::aarch64::jit_less_equal_emitter), - OV_CASE(Algorithm::EltwiseLogicalAnd, ov::intel_cpu::aarch64::jit_logical_and_emitter), - OV_CASE(Algorithm::EltwiseLogicalNot, ov::intel_cpu::aarch64::jit_logical_not_emitter), - OV_CASE(Algorithm::EltwiseLogicalXor, ov::intel_cpu::aarch64::jit_logical_xor_emitter), - OV_CASE(Algorithm::EltwiseIsNaN, ov::intel_cpu::aarch64::jit_is_nan_emitter), - OV_CASE(Algorithm::EltwiseMaximum, ov::intel_cpu::aarch64::jit_maximum_emitter), - OV_CASE(Algorithm::EltwiseMinimum, ov::intel_cpu::aarch64::jit_minimum_emitter), - OV_CASE(Algorithm::EltwiseMish, ov::intel_cpu::aarch64::jit_mish_emitter), - OV_CASE(Algorithm::EltwiseGeluErf, ov::intel_cpu::aarch64::jit_gelu_erf_emitter), - OV_CASE(Algorithm::EltwiseGeluTanh, ov::intel_cpu::aarch64::jit_gelu_tanh_emitter), - OV_CASE(Algorithm::EltwiseGreater, ov::intel_cpu::aarch64::jit_greater_emitter), - OV_CASE(Algorithm::EltwiseGreaterEqual, ov::intel_cpu::aarch64::jit_greater_equal_emitter), - OV_CASE(Algorithm::EltwiseMulAdd, ov::intel_cpu::aarch64::jit_mul_add_emitter), - OV_CASE(Algorithm::EltwiseMod, ov::intel_cpu::aarch64::jit_mod_emitter), - OV_CASE(Algorithm::EltwiseMultiply, ov::intel_cpu::aarch64::jit_multiply_emitter), - OV_CASE(Algorithm::EltwisePowerStatic, ov::intel_cpu::aarch64::jit_power_static_emitter), - OV_CASE(Algorithm::EltwisePrelu, ov::intel_cpu::aarch64::jit_prelu_emitter), - OV_CASE(Algorithm::EltwiseRelu, ov::intel_cpu::aarch64::jit_relu_emitter), - OV_CASE(Algorithm::EltwiseSelect, ov::intel_cpu::aarch64::jit_select_emitter), - OV_CASE(Algorithm::EltwiseSigmoid, ov::intel_cpu::aarch64::jit_sigmoid_emitter), - OV_CASE(Algorithm::EltwiseSoftSign, ov::intel_cpu::aarch64::jit_soft_sign_emitter), - OV_CASE(Algorithm::EltwiseSqrt, ov::intel_cpu::aarch64::jit_sqrt_emitter), - OV_CASE(Algorithm::EltwiseSubtract, ov::intel_cpu::aarch64::jit_subtract_emitter), - OV_CASE(Algorithm::EltwiseSwish, ov::intel_cpu::aarch64::jit_swish_emitter), - OV_CASE(Algorithm::EltwiseTanh, ov::intel_cpu::aarch64::jit_tanh_emitter)); +std::shared_ptr jit_uni_eltwise_generic::create_eltwise_emitter(const EltwiseData& data, + const ov::element::Type& exec_prec) { + EltwiseEmitterContext ctx = {nullptr, this, isa, data, exec_prec}; + + OV_SWITCH( + intel_cpu, + EltwiseEmitter, + ctx, + data.algo, + OV_CASE(Algorithm::EltwiseAbs, ov::intel_cpu::aarch64::jit_abs_emitter), + OV_CASE(Algorithm::EltwiseAdd, ov::intel_cpu::aarch64::jit_add_emitter), + OV_CASE(Algorithm::EltwiseClamp, 
ov::intel_cpu::aarch64::jit_clamp_emitter), + OV_CASE(Algorithm::EltwiseDivide, ov::intel_cpu::aarch64::jit_divide_emitter), + OV_CASE(Algorithm::EltwiseElu, ov::intel_cpu::aarch64::jit_elu_emitter), + OV_CASE(Algorithm::EltwiseEqual, ov::intel_cpu::aarch64::jit_equal_emitter), + OV_CASE(Algorithm::EltwiseExp, ov::intel_cpu::aarch64::jit_exp_emitter), + OV_CASE(Algorithm::EltwiseFloor, ov::intel_cpu::aarch64::jit_floor_emitter), + OV_CASE(Algorithm::EltwiseFloorMod, ov::intel_cpu::aarch64::jit_floor_mod_emitter), + OV_CASE(Algorithm::EltwiseCeiling, ov::intel_cpu::aarch64::jit_ceiling_emitter), + OV_CASE(Algorithm::EltwiseHswish, ov::intel_cpu::aarch64::jit_hswish_emitter), + OV_CASE(Algorithm::EltwiseIsFinite, ov::intel_cpu::aarch64::jit_is_finite_emitter), + OV_CASE(Algorithm::EltwiseIsInf, ov::intel_cpu::aarch64::jit_is_inf_emitter), + OV_CASE(Algorithm::EltwiseLessEqual, ov::intel_cpu::aarch64::jit_less_equal_emitter), + OV_CASE(Algorithm::EltwiseLogicalAnd, ov::intel_cpu::aarch64::jit_logical_and_emitter), + OV_CASE(Algorithm::EltwiseLogicalOr, ov::intel_cpu::aarch64::jit_logical_or_emitter), + OV_CASE(Algorithm::EltwiseLogicalNot, ov::intel_cpu::aarch64::jit_logical_not_emitter), + OV_CASE(Algorithm::EltwiseLogicalXor, ov::intel_cpu::aarch64::jit_logical_xor_emitter), + OV_CASE(Algorithm::EltwiseIsNaN, ov::intel_cpu::aarch64::jit_is_nan_emitter), + OV_CASE(Algorithm::EltwiseMaximum, ov::intel_cpu::aarch64::jit_maximum_emitter), + OV_CASE(Algorithm::EltwiseMinimum, ov::intel_cpu::aarch64::jit_minimum_emitter), + OV_CASE(Algorithm::EltwiseMish, ov::intel_cpu::aarch64::jit_mish_emitter), + OV_CASE(Algorithm::EltwiseGeluErf, ov::intel_cpu::aarch64::jit_gelu_erf_emitter), + OV_CASE(Algorithm::EltwiseGeluTanh, ov::intel_cpu::aarch64::jit_gelu_tanh_emitter), + OV_CASE(Algorithm::EltwiseGreater, ov::intel_cpu::aarch64::jit_greater_emitter), + OV_CASE(Algorithm::EltwiseGreaterEqual, ov::intel_cpu::aarch64::jit_greater_equal_emitter), + OV_CASE(Algorithm::EltwiseMulAdd, ov::intel_cpu::aarch64::jit_mul_add_emitter), + OV_CASE(Algorithm::EltwiseMod, ov::intel_cpu::aarch64::jit_mod_emitter), + OV_CASE(Algorithm::EltwiseMultiply, ov::intel_cpu::aarch64::jit_multiply_emitter), + OV_CASE(Algorithm::EltwisePowerStatic, ov::intel_cpu::aarch64::jit_power_static_emitter), + OV_CASE(Algorithm::EltwisePrelu, ov::intel_cpu::aarch64::jit_prelu_emitter), + OV_CASE(Algorithm::EltwiseRelu, ov::intel_cpu::aarch64::jit_relu_emitter), + OV_CASE(Algorithm::EltwiseRoundHalfAwayFromZero, ov::intel_cpu::aarch64::jit_round_half_away_from_zero_emitter), + OV_CASE(Algorithm::EltwiseRoundHalfToEven, ov::intel_cpu::aarch64::jit_round_half_to_even_emitter), + OV_CASE(Algorithm::EltwiseSelect, ov::intel_cpu::aarch64::jit_select_emitter), + OV_CASE(Algorithm::EltwiseSigmoid, ov::intel_cpu::aarch64::jit_sigmoid_emitter), + OV_CASE(Algorithm::EltwiseSoftSign, ov::intel_cpu::aarch64::jit_soft_sign_emitter), + OV_CASE(Algorithm::EltwiseSqrt, ov::intel_cpu::aarch64::jit_sqrt_emitter), + OV_CASE(Algorithm::EltwiseSubtract, ov::intel_cpu::aarch64::jit_subtract_emitter), + OV_CASE(Algorithm::EltwiseSwish, ov::intel_cpu::aarch64::jit_swish_emitter), + OV_CASE(Algorithm::EltwiseTanh, ov::intel_cpu::aarch64::jit_tanh_emitter)); if (!ctx.emitter) OPENVINO_THROW("Unsupported operation type '" + algToString(data.algo) + "' for Eltwise emitter"); @@ -742,16 +760,16 @@ void jit_uni_eltwise_generic::apply_post_ops() { namespace { -template +template struct SupportedPrecisions { - void operator()(std::set> &precisions) { + void 
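The OV_SWITCH/OV_CASE table above is a runtime switch over Algorithm values that instantiates the templated EltwiseEmitter functor with the matching emitter type, so supporting a new operation means adding one OV_CASE line (as done here for FloorMod, LogicalOr and the two Round modes). A stripped-down sketch of the same dispatch pattern with hypothetical types, not the actual macro expansion:

    #include <memory>

    struct Context { std::shared_ptr<int> emitter; };   // stand-in for EltwiseEmitterContext

    template <typename Emitter>
    struct Factory {                                     // stand-in for the EltwiseEmitter functor
        void operator()(Context& ctx) { ctx.emitter = std::make_shared<int>(Emitter::kind); }
    };

    struct AbsEmitter { static constexpr int kind = 1; };
    struct AddEmitter { static constexpr int kind = 2; };
    enum class Algo { Abs, Add };

    // Hand-rolled equivalent of OV_SWITCH + OV_CASE: one case per (enum value, emitter type).
    inline void dispatch(Context& ctx, Algo algo) {
        switch (algo) {
        case Algo::Abs: Factory<AbsEmitter>{}(ctx); break;
        case Algo::Add: Factory<AddEmitter>{}(ctx); break;
        }
    }
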
operator()(std::set>& precisions) { precisions = T::get_supported_precisions(); } }; static void set_intersection(const std::set>& precisions1, - const std::set>& precisions2, - std::set>& intersection) { + const std::set>& precisions2, + std::set>& intersection) { std::map intersection_types; for (auto it1 = precisions1.begin(); it1 != precisions1.end(); ++it1) { @@ -769,7 +787,7 @@ static void set_intersection(const std::set>& precisi intersection.insert(std::vector(it->second, it->first)); } } -} // namespace +} // namespace ov::element::Type eltwise_precision_helper::get_precision(const size_t inputs_number, const ov::element::Type (&src_prc)[MAX_ELTWISE_INPUTS], @@ -788,16 +806,14 @@ ov::element::Type eltwise_precision_helper::get_precision(const size_t inputs_nu supported_precision_intersection = prcs_intersect; } - static const element::Type exec_precisions_priority[] = { - element::f16, - element::f32 - }; + static const element::Type exec_precisions_priority[] = {element::f16, element::f32}; for (const auto prc : exec_precisions_priority) { - if (std::any_of( - supported_precision_intersection.begin(), - supported_precision_intersection.end(), - [&prc](const std::vector& precisions) { return std::find(precisions.begin(), precisions.end(), prc) != precisions.end(); })) { + if (std::any_of(supported_precision_intersection.begin(), + supported_precision_intersection.end(), + [&prc](const std::vector& precisions) { + return std::find(precisions.begin(), precisions.end(), prc) != precisions.end(); + })) { exec_prc = prc; break; } @@ -820,44 +836,51 @@ ov::element::Type eltwise_precision_helper::get_precision(const size_t inputs_nu std::set> eltwise_precision_helper::get_supported_precisions(const Algorithm& algo) { std::set> precisions; - OV_SWITCH(intel_cpu, SupportedPrecisions, precisions, algo, - OV_CASE(Algorithm::EltwiseRelu, jit_relu_emitter), - OV_CASE(Algorithm::EltwiseAbs, jit_abs_emitter), - OV_CASE(Algorithm::EltwiseAdd, jit_add_emitter), - OV_CASE(Algorithm::EltwiseClamp, jit_clamp_emitter), - OV_CASE(Algorithm::EltwiseDivide, jit_divide_emitter), - OV_CASE(Algorithm::EltwiseElu, jit_elu_emitter), - OV_CASE(Algorithm::EltwiseEqual, jit_equal_emitter), - OV_CASE(Algorithm::EltwiseExp, jit_exp_emitter), - OV_CASE(Algorithm::EltwiseFloor, jit_floor_emitter), - OV_CASE(Algorithm::EltwiseCeiling, jit_ceiling_emitter), - OV_CASE(Algorithm::EltwiseGeluErf, jit_gelu_erf_emitter), - OV_CASE(Algorithm::EltwiseGeluTanh, jit_gelu_tanh_emitter), - OV_CASE(Algorithm::EltwiseGreater, jit_greater_emitter), - OV_CASE(Algorithm::EltwiseGreaterEqual, jit_greater_equal_emitter), - OV_CASE(Algorithm::EltwiseHswish, jit_hswish_emitter), - OV_CASE(Algorithm::EltwiseIsFinite, jit_is_finite_emitter), - OV_CASE(Algorithm::EltwiseIsInf, jit_is_inf_emitter), - OV_CASE(Algorithm::EltwiseIsNaN, jit_is_nan_emitter), - OV_CASE(Algorithm::EltwiseLessEqual, jit_less_equal_emitter), - OV_CASE(Algorithm::EltwiseLogicalAnd, jit_logical_and_emitter), - OV_CASE(Algorithm::EltwiseLogicalNot, jit_logical_not_emitter), - OV_CASE(Algorithm::EltwiseLogicalXor, jit_logical_xor_emitter), - OV_CASE(Algorithm::EltwiseMaximum, jit_maximum_emitter), - OV_CASE(Algorithm::EltwiseMinimum, jit_minimum_emitter), - OV_CASE(Algorithm::EltwiseMish, jit_mish_emitter), - OV_CASE(Algorithm::EltwiseMod, jit_mod_emitter), - OV_CASE(Algorithm::EltwiseMulAdd, jit_mul_add_emitter), - OV_CASE(Algorithm::EltwiseMultiply, jit_multiply_emitter), - OV_CASE(Algorithm::EltwisePrelu, jit_prelu_emitter), - OV_CASE(Algorithm::EltwisePowerStatic, 
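get_precision above intersects the precision combinations supported by every emitter in the fused chain and then scans a fixed priority list (f16 before f32), falling back when nothing matches. A condensed sketch of that selection step, assuming the intersection has already been built; Type is a stand-in for ov::element::Type:

    #include <algorithm>
    #include <set>
    #include <vector>

    template <typename Type>
    Type pick_exec_precision(const std::set<std::vector<Type>>& supported,
                             const std::vector<Type>& priority,
                             Type fallback) {
        for (const auto& prc : priority) {                 // e.g. {f16, f32}
            for (const auto& combo : supported) {
                if (std::find(combo.begin(), combo.end(), prc) != combo.end())
                    return prc;                            // first priority hit wins
            }
        }
        return fallback;
    }
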
jit_power_static_emitter), - OV_CASE(Algorithm::EltwiseSelect, jit_select_emitter), - OV_CASE(Algorithm::EltwiseSigmoid, jit_sigmoid_emitter), - OV_CASE(Algorithm::EltwiseSoftSign, jit_soft_sign_emitter), - OV_CASE(Algorithm::EltwiseSqrt, jit_sqrt_emitter), - OV_CASE(Algorithm::EltwiseSubtract, jit_subtract_emitter), - OV_CASE(Algorithm::EltwiseSwish, jit_swish_emitter), - OV_CASE(Algorithm::EltwiseTanh, jit_tanh_emitter)); + OV_SWITCH(intel_cpu, + SupportedPrecisions, + precisions, + algo, + OV_CASE(Algorithm::EltwiseRelu, jit_relu_emitter), + OV_CASE(Algorithm::EltwiseAbs, jit_abs_emitter), + OV_CASE(Algorithm::EltwiseAdd, jit_add_emitter), + OV_CASE(Algorithm::EltwiseClamp, jit_clamp_emitter), + OV_CASE(Algorithm::EltwiseDivide, jit_divide_emitter), + OV_CASE(Algorithm::EltwiseElu, jit_elu_emitter), + OV_CASE(Algorithm::EltwiseEqual, jit_equal_emitter), + OV_CASE(Algorithm::EltwiseExp, jit_exp_emitter), + OV_CASE(Algorithm::EltwiseFloor, jit_floor_emitter), + OV_CASE(Algorithm::EltwiseFloorMod, jit_floor_mod_emitter), + OV_CASE(Algorithm::EltwiseCeiling, jit_ceiling_emitter), + OV_CASE(Algorithm::EltwiseGeluErf, jit_gelu_erf_emitter), + OV_CASE(Algorithm::EltwiseGeluTanh, jit_gelu_tanh_emitter), + OV_CASE(Algorithm::EltwiseGreater, jit_greater_emitter), + OV_CASE(Algorithm::EltwiseGreaterEqual, jit_greater_equal_emitter), + OV_CASE(Algorithm::EltwiseHswish, jit_hswish_emitter), + OV_CASE(Algorithm::EltwiseIsFinite, jit_is_finite_emitter), + OV_CASE(Algorithm::EltwiseIsInf, jit_is_inf_emitter), + OV_CASE(Algorithm::EltwiseIsNaN, jit_is_nan_emitter), + OV_CASE(Algorithm::EltwiseLessEqual, jit_less_equal_emitter), + OV_CASE(Algorithm::EltwiseLogicalAnd, jit_logical_and_emitter), + OV_CASE(Algorithm::EltwiseLogicalOr, jit_logical_or_emitter), + OV_CASE(Algorithm::EltwiseLogicalNot, jit_logical_not_emitter), + OV_CASE(Algorithm::EltwiseLogicalXor, jit_logical_xor_emitter), + OV_CASE(Algorithm::EltwiseMaximum, jit_maximum_emitter), + OV_CASE(Algorithm::EltwiseMinimum, jit_minimum_emitter), + OV_CASE(Algorithm::EltwiseMish, jit_mish_emitter), + OV_CASE(Algorithm::EltwiseMod, jit_mod_emitter), + OV_CASE(Algorithm::EltwiseMulAdd, jit_mul_add_emitter), + OV_CASE(Algorithm::EltwiseMultiply, jit_multiply_emitter), + OV_CASE(Algorithm::EltwisePrelu, jit_prelu_emitter), + OV_CASE(Algorithm::EltwisePowerStatic, jit_power_static_emitter), + OV_CASE(Algorithm::EltwiseRoundHalfAwayFromZero, jit_round_half_away_from_zero_emitter), + OV_CASE(Algorithm::EltwiseRoundHalfToEven, jit_round_half_to_even_emitter), + OV_CASE(Algorithm::EltwiseSelect, jit_select_emitter), + OV_CASE(Algorithm::EltwiseSigmoid, jit_sigmoid_emitter), + OV_CASE(Algorithm::EltwiseSoftSign, jit_soft_sign_emitter), + OV_CASE(Algorithm::EltwiseSqrt, jit_sqrt_emitter), + OV_CASE(Algorithm::EltwiseSubtract, jit_subtract_emitter), + OV_CASE(Algorithm::EltwiseSwish, jit_swish_emitter), + OV_CASE(Algorithm::EltwiseTanh, jit_tanh_emitter)); if (precisions.empty()) OPENVINO_THROW("Unsupported operation type for Eltwise emitter"); diff --git a/src/plugins/intel_cpu/src/nodes/kernels/aarch64/jit_uni_eltwise_generic.hpp b/src/plugins/intel_cpu/src/nodes/kernels/aarch64/jit_uni_eltwise_generic.hpp index 8f18a9815b4fe4..1bf64d096e4a84 100644 --- a/src/plugins/intel_cpu/src/nodes/kernels/aarch64/jit_uni_eltwise_generic.hpp +++ b/src/plugins/intel_cpu/src/nodes/kernels/aarch64/jit_uni_eltwise_generic.hpp @@ -4,10 +4,11 @@ #pragma once -#include - #include + #include +#include + #include "nodes/executors/eltwise.hpp" // TODO: handle x64 headers more 
accurate and remove undef later @@ -24,12 +25,11 @@ #include -#include "utils/general_utils.h" -#include "utils/cpu_utils.hpp" - -#include "emitters/plugin/aarch64/jit_emitter.hpp" #include "emitters/plugin/aarch64/jit_eltwise_emitters.hpp" +#include "emitters/plugin/aarch64/jit_emitter.hpp" #include "nodes/kernels/jit_eltwise_call_args_ptrs.hpp" +#include "utils/cpu_utils.hpp" +#include "utils/general_utils.h" namespace ov { namespace intel_cpu { @@ -154,7 +154,7 @@ struct jit_uni_eltwise_generic : public jit_uni_eltwise_kernel, jit_generator { OPENVINO_THROW("source vector ptr register " + std::to_string(idx) + " is not supported"); } - static const std::vector src_gprs = { 19, 20, 21, 22, 25, 26, 27 }; + static const std::vector src_gprs = {19, 20, 21, 22, 25, 26, 27}; return XReg(src_gprs[idx]); } @@ -192,8 +192,7 @@ struct jit_uni_eltwise_generic : public jit_uni_eltwise_kernel, jit_generator { // 24 | src // 25-31 | [not used] - - TReg vmm_dst {9}; + TReg vmm_dst{9}; inline TReg get_vmm_reg(const uint32_t idx) { if (idx > MAX_ELTWISE_INPUTS) { @@ -230,10 +229,10 @@ struct jit_uni_eltwise_generic : public jit_uni_eltwise_kernel, jit_generator { const int32_t ptr_offset = 0); void store_vector(const XReg& ptr, - const TReg& data, - const ov::element::Type& src_prc, - const ov::element::Type& dst_prc, - const int32_t ptr_offset = 0); + const TReg& data, + const ov::element::Type& src_prc, + const ov::element::Type& dst_prc, + const int32_t ptr_offset = 0); void store_scalar(const XReg& ptr, const SReg& data, @@ -264,6 +263,6 @@ class eltwise_precision_helper { static std::set> get_supported_precisions(const Algorithm& algo); }; -} // namespace aarch64 -} // namespace intel_cpu -} // namespace ov +} // namespace aarch64 +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/kernels/acl/gemm_kernel.cpp b/src/plugins/intel_cpu/src/nodes/kernels/acl/gemm_kernel.cpp index 0ae6a4ad2c45ff..28e17854f46b08 100644 --- a/src/plugins/intel_cpu/src/nodes/kernels/acl/gemm_kernel.cpp +++ b/src/plugins/intel_cpu/src/nodes/kernels/acl/gemm_kernel.cpp @@ -6,102 +6,95 @@ namespace ov { namespace intel_cpu { - GemmKernel::GemmKernel(size_t M, - size_t N, - size_t K, - bool b_transposed, - ov::element::Type inType) +GemmKernel::GemmKernel(size_t M, size_t N, size_t K, bool b_transposed, ov::element::Type inType) : M(M), N(N), K(K), b_transposed(b_transposed) { - if (!one_of(inType, ov::element::f32, ov::element::f16, ov::element::bf16)) - THROW_ERROR("brgemm kernel only supports bf16, f16 and f32"); - - if (inType == ov::element::f32) { - format = arm_compute::Format::F32; - } else if (inType == ov::element::f16) { - format = arm_compute::Format::F16; - } else if (inType == ov::element::bf16) { - format = arm_compute::Format::BFLOAT16; - } - + if (!one_of(inType, ov::element::f32, ov::element::f16, ov::element::bf16)) + THROW_ERROR("brgemm kernel only supports bf16, f16 and f32"); + + if (inType == ov::element::f32) { + format = arm_compute::Format::F32; + } else if (inType == ov::element::f16) { + format = arm_compute::Format::F16; + } else if (inType == ov::element::bf16) { + format = arm_compute::Format::BFLOAT16; + } - aclGemmKernel = std::make_unique(); + aclGemmKernel = std::make_unique(); +} + +arm_compute::Status GemmKernel::executeGemm(void* a, + void* b, + arm_compute::TensorInfo& dstInfo, + arm_compute::Tensor& dstTensor, + arm_compute::Strides aStrides, + arm_compute::Strides bStrides, + void* c, + float alpha, + float beta, + arm_compute::Strides* outStrides, 
+ void* out) { + aInfo.init(shapeCast({M, N}), + format, + aStrides, + size_t(0), + (size_t)(M * N * arm_compute::element_size_from_data_type(arm_compute::data_type_from_format(format)))); + + arm_compute::TensorShape bShape; + if (b_transposed) + bShape = shapeCast({K, N}); + else + bShape = shapeCast({N, K}); + + bInfo.init(bShape, + format, + bStrides, + size_t(0), + (size_t)(K * N * arm_compute::element_size_from_data_type(arm_compute::data_type_from_format(format)))); + + aTensor.allocator()->init(aInfo); + bTensor.allocator()->init(bInfo); + + if (c != nullptr) { + cInfo.init(shapeCast({M, K}), format); + cTensor.allocator()->init(cInfo); } - arm_compute::Status GemmKernel::executeGemm(void *a, - void *b, - arm_compute::TensorInfo& dstInfo, - arm_compute::Tensor& dstTensor, - arm_compute::Strides aStrides, - arm_compute::Strides bStrides, - void *c, - float alpha, - float beta, - arm_compute::Strides* outStrides, - void* out) { - aInfo.init( - shapeCast({M, N}), + if (outStrides != nullptr) + dstInfo.init( + shapeCast({M, K}), format, - aStrides, + *outStrides, size_t(0), - (size_t)(M * N * arm_compute::element_size_from_data_type(arm_compute::data_type_from_format(format)))); - - arm_compute::TensorShape bShape; - if (b_transposed) - bShape = shapeCast({K, N}); - else - bShape = shapeCast({N, K}); - - bInfo.init( - bShape, - format, - bStrides, - size_t(0), - (size_t)(K * N * arm_compute::element_size_from_data_type(arm_compute::data_type_from_format(format)))); - - aTensor.allocator()->init(aInfo); - bTensor.allocator()->init(bInfo); - - if (c != nullptr) { - cInfo.init(shapeCast({M, K}), format); - cTensor.allocator()->init(cInfo); - } - - if (outStrides != nullptr) - dstInfo.init( - shapeCast({M, K}), - format, - *outStrides, - size_t(0), - (size_t)(M * K * arm_compute::element_size_from_data_type(arm_compute::data_type_from_format(format)))); - else - dstInfo.init(shapeCast({M, K}), format); - - dstTensor.allocator()->init(dstInfo); - - aTensor.allocator()->import_memory(reinterpret_cast(a)); - bTensor.allocator()->import_memory(reinterpret_cast(b)); - cTensor.allocator()->import_memory(reinterpret_cast(c)); - - if (out == nullptr) - dstTensor.allocator()->allocate(); - else - dstTensor.allocator()->import_memory(out); - - if (b_transposed) - aclGemmInfo.set_pretranspose_B(true); - - auto status = aclGemmKernel->validate(&aInfo, &bInfo, &cInfo, &dstInfo, 1.0, 0.0, aclGemmInfo); - - if (c == nullptr) - aclGemmKernel->configure(&aTensor, &bTensor, nullptr, &dstTensor, alpha, beta, aclGemmInfo); - else - aclGemmKernel->configure(&aTensor, &bTensor, &cTensor, &dstTensor, alpha, beta, aclGemmInfo); - aclGemmKernel->run(); - - return status; - } -} // namespace intel_cpu -} // namespace ov \ No newline at end of file + (size_t)(M * K * arm_compute::element_size_from_data_type(arm_compute::data_type_from_format(format)))); + else + dstInfo.init(shapeCast({M, K}), format); + + dstTensor.allocator()->init(dstInfo); + + aTensor.allocator()->import_memory(reinterpret_cast(a)); + bTensor.allocator()->import_memory(reinterpret_cast(b)); + cTensor.allocator()->import_memory(reinterpret_cast(c)); + + if (out == nullptr) + dstTensor.allocator()->allocate(); + else + dstTensor.allocator()->import_memory(out); + + if (b_transposed) + aclGemmInfo.set_pretranspose_B(true); + + auto status = aclGemmKernel->validate(&aInfo, &bInfo, &cInfo, &dstInfo, 1.0, 0.0, aclGemmInfo); + + if (c == nullptr) + aclGemmKernel->configure(&aTensor, &bTensor, nullptr, &dstTensor, alpha, beta, aclGemmInfo); + else + 
aclGemmKernel->configure(&aTensor, &bTensor, &cTensor, &dstTensor, alpha, beta, aclGemmInfo); + aclGemmKernel->run(); + + return status; +} +} // namespace intel_cpu +} // namespace ov \ No newline at end of file diff --git a/src/plugins/intel_cpu/src/nodes/kernels/acl/gemm_kernel.hpp b/src/plugins/intel_cpu/src/nodes/kernels/acl/gemm_kernel.hpp index 620f42f239cbbb..06a26743e0b2a4 100644 --- a/src/plugins/intel_cpu/src/nodes/kernels/acl/gemm_kernel.hpp +++ b/src/plugins/intel_cpu/src/nodes/kernels/acl/gemm_kernel.hpp @@ -4,21 +4,17 @@ #pragma once #include #include -#include "nodes/executors/acl/acl_utils.hpp" -#include "utils/general_utils.h" -#include "arm_compute/runtime/NEON/NEFunctions.h" #include "arm_compute/core/Types.h" +#include "arm_compute/runtime/NEON/NEFunctions.h" +#include "nodes/executors/acl/acl_utils.hpp" +#include "utils/general_utils.h" namespace ov { namespace intel_cpu { class GemmKernel { public: - GemmKernel(size_t M, - size_t N, - size_t K, - bool b_transposed = false, - ov::element::Type inType = ov::element::f32); + GemmKernel(size_t M, size_t N, size_t K, bool b_transposed = false, ov::element::Type inType = ov::element::f32); arm_compute::Status executeGemm(void* a, void* b, @@ -48,5 +44,5 @@ class GemmKernel { arm_compute::GEMMInfo aclGemmInfo; }; -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/kernels/jit_eltwise_call_args_ptrs.hpp b/src/plugins/intel_cpu/src/nodes/kernels/jit_eltwise_call_args_ptrs.hpp index 7370bb824d8c62..66f119ee839b14 100644 --- a/src/plugins/intel_cpu/src/nodes/kernels/jit_eltwise_call_args_ptrs.hpp +++ b/src/plugins/intel_cpu/src/nodes/kernels/jit_eltwise_call_args_ptrs.hpp @@ -9,21 +9,21 @@ namespace ov { namespace intel_cpu { namespace node { -#define MAX_ELTWISE_INPUTS 7 +#define MAX_ELTWISE_INPUTS 7 #define MAX_ELTWISE_DIM_RANK 12 struct jit_eltwise_call_args_ptrs { - const void *src_ptr[MAX_ELTWISE_INPUTS]; - void *dst_ptr; - //ptr to array of post op inputs pointers (flat list) + const void* src_ptr[MAX_ELTWISE_INPUTS]; + void* dst_ptr; + // ptr to array of post op inputs pointers (flat list) const void** post_op_data; // shape agnostic kernel size_t work_amount; - const void *src_offsets[MAX_ELTWISE_INPUTS]; - const void *dst_offsets; + const void* src_offsets[MAX_ELTWISE_INPUTS]; + const void* dst_offsets; }; -} // namespace node -} // namespace intel_cpu -} // namespace ov \ No newline at end of file +} // namespace node +} // namespace intel_cpu +} // namespace ov \ No newline at end of file diff --git a/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/attn_memcpy.cpp b/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/attn_memcpy.cpp index 755330bd850c4d..b4d38086cefe8a 100644 --- a/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/attn_memcpy.cpp +++ b/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/attn_memcpy.cpp @@ -13,10 +13,10 @@ # include #endif -#include "openvino/core/type/bfloat16.hpp" -#include "openvino/core/parallel.hpp" -#include "common.hpp" #include "attn_memcpy.hpp" +#include "common.hpp" +#include "openvino/core/parallel.hpp" +#include "openvino/core/type/bfloat16.hpp" namespace ov { namespace Extensions { @@ -26,7 +26,7 @@ namespace XARCH { using namespace ov; // float16 <- float -template +template void attn_copy(TA* a, TB* b, size_t n) { size_t i = 0; #if defined(HAVE_AVX512F) @@ -51,14 +51,11 @@ void attn_memcpy_kernel(const ov::intel_cpu::PlainTensor& k_input, const ov::intel_cpu::PlainTensor& 
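executeGemm above follows the usual Arm Compute Library sequence: fill TensorInfo objects with shape, format and strides, init the tensors, bind caller-owned buffers with import_memory, validate, then configure and run NEGEMM. A trimmed sketch of that call order for a plain f32 multiply without a C accumulator; names and shapes are illustrative and error handling is omitted:

    #include "arm_compute/core/Types.h"
    #include "arm_compute/runtime/NEON/NEFunctions.h"

    // D(MxN) = A(MxK) * B(KxN); ACL TensorShape lists the innermost dimension first.
    inline arm_compute::Status gemm_sketch(float* a_data, float* b_data, float* d_data,
                                           size_t M, size_t N, size_t K) {
        using namespace arm_compute;
        TensorInfo aInfo(TensorShape(K, M), Format::F32);
        TensorInfo bInfo(TensorShape(N, K), Format::F32);
        TensorInfo dInfo(TensorShape(N, M), Format::F32);

        Tensor a, b, d;
        a.allocator()->init(aInfo);
        b.allocator()->init(bInfo);
        d.allocator()->init(dInfo);
        a.allocator()->import_memory(a_data);   // no copies, the tensors wrap caller buffers
        b.allocator()->import_memory(b_data);
        d.allocator()->import_memory(d_data);

        NEGEMM gemm;
        auto status = NEGEMM::validate(&aInfo, &bInfo, nullptr, &dInfo, 1.0f, 0.0f);
        gemm.configure(&a, &b, nullptr, &d, 1.0f, 0.0f);
        gemm.run();
        return status;
    }
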
past_k_output, const ov::intel_cpu::PlainTensor& past_v_output) { // For compatibility, all input_kvs are permuted to BHLS - size_t B = k_input.m_dims[0], H = k_input.m_dims[1], L1 = k_input.m_dims[2], S = k_input.m_dims[3], SV = v_input.m_dims[3]; + size_t B = k_input.m_dims[0], H = k_input.m_dims[1], L1 = k_input.m_dims[2], S = k_input.m_dims[3], + SV = v_input.m_dims[3]; parallel_for3d(L1, B, H, [&](size_t m, size_t b, size_t h) { - attn_copy(past_k_output.ptr(b, h, m, 0), - k_input.ptr(b, h, m, 0), - S); - attn_copy(past_v_output.ptr(b, h, m, 0), - v_input.ptr(b, h, m, 0), - SV); + attn_copy(past_k_output.ptr(b, h, m, 0), k_input.ptr(b, h, m, 0), S); + attn_copy(past_v_output.ptr(b, h, m, 0), v_input.ptr(b, h, m, 0), SV); }); } @@ -67,14 +64,11 @@ static void attn_memcpy_kernel(const ov::intel_cpu::PlainTensor& k_input, const ov::intel_cpu::PlainTensor& past_k_output, const ov::intel_cpu::PlainTensor& past_v_output) { // For compatibility, all input_kvs are permuted to BHLS - size_t B = k_input.m_dims[0], H = k_input.m_dims[1], L1 = k_input.m_dims[2], S = k_input.m_dims[3], SV = v_input.m_dims[3]; + size_t B = k_input.m_dims[0], H = k_input.m_dims[1], L1 = k_input.m_dims[2], S = k_input.m_dims[3], + SV = v_input.m_dims[3]; parallel_for3d(L1, B, H, [&](size_t m, size_t b, size_t h) { - std::memcpy(past_k_output.ptr_v(b, h, m, 0), - k_input.ptr_v(b, h, m, 0), - S * k_input.m_element_size); - std::memcpy(past_v_output.ptr_v(b, h, m, 0), - v_input.ptr_v(b, h, m, 0), - SV * v_input.m_element_size); + std::memcpy(past_k_output.ptr_v(b, h, m, 0), k_input.ptr_v(b, h, m, 0), S * k_input.m_element_size); + std::memcpy(past_v_output.ptr_v(b, h, m, 0), v_input.ptr_v(b, h, m, 0), SV * v_input.m_element_size); }); } @@ -84,19 +78,17 @@ static void paged_attn_memcpy_kernel(const ov::intel_cpu::PlainTensor& k_input, const ov::intel_cpu::PlainTensor& past_k_output, const ov::intel_cpu::PlainTensor& past_v_output, const ov::intel_cpu::PlainTensor& slot_mapping) { - size_t B = k_input.m_dims[0], H = k_input.m_dims[1], L1 = k_input.m_dims[2], S = k_input.m_dims[3], SV = v_input.m_dims[3]; + size_t B = k_input.m_dims[0], H = k_input.m_dims[1], L1 = k_input.m_dims[2], S = k_input.m_dims[3], + SV = v_input.m_dims[3]; size_t block_size = past_k_output.m_dims[2]; parallel_for3d(B, L1, H, [&](size_t b, size_t m, size_t h) { auto slot = slot_mapping.ptr(b)[m]; - if (slot < 0) return; + if (slot < 0) + return; auto block_number = slot / block_size; auto block_offset = slot % block_size; - attn_copy(past_k_output.ptr(block_number, h, block_offset, 0), - k_input.ptr(b, h, m, 0), - S); - attn_copy(past_v_output.ptr(block_number, h, block_offset, 0), - v_input.ptr(b, h, m, 0), - SV); + attn_copy(past_k_output.ptr(block_number, h, block_offset, 0), k_input.ptr(b, h, m, 0), S); + attn_copy(past_v_output.ptr(block_number, h, block_offset, 0), v_input.ptr(b, h, m, 0), SV); }); } @@ -105,11 +97,13 @@ static void paged_attn_memcpy_kernel(const ov::intel_cpu::PlainTensor& k_input, const ov::intel_cpu::PlainTensor& past_k_output, const ov::intel_cpu::PlainTensor& past_v_output, const ov::intel_cpu::PlainTensor& slot_mapping) { - size_t B = k_input.m_dims[0], H = k_input.m_dims[1], L1 = k_input.m_dims[2], S = k_input.m_dims[3], SV = v_input.m_dims[3]; + size_t B = k_input.m_dims[0], H = k_input.m_dims[1], L1 = k_input.m_dims[2], S = k_input.m_dims[3], + SV = v_input.m_dims[3]; size_t block_size = past_k_output.m_dims[2]; parallel_for3d(B, L1, H, [&](size_t b, size_t m, size_t h) { auto slot = slot_mapping.ptr(b)[m]; - if 
(slot < 0) return; + if (slot < 0) + return; auto block_number = slot / block_size; auto block_offset = slot % block_size; std::memcpy(past_k_output.ptr_v(block_number, h, block_offset, 0), @@ -132,7 +126,11 @@ void attn_memcpy(const ov::intel_cpu::PlainTensor& k_input, } else if (k_input.get_precision() == ov::element::f32 && past_k_output.get_precision() == ov::element::bf16) { attn_memcpy_kernel(k_input, v_input, past_k_output, past_v_output); } else { - OPENVINO_THROW("unsupport src type: ", k_input.get_precision(), ", dst type: ", past_k_output.get_precision(), " in attn_memcpy"); + OPENVINO_THROW("unsupport src type: ", + k_input.get_precision(), + ", dst type: ", + past_k_output.get_precision(), + " in attn_memcpy"); } } @@ -148,7 +146,11 @@ void paged_attn_memcpy(const ov::intel_cpu::PlainTensor& k_input, } else if (k_input.get_precision() == ov::element::f32 && past_k_output.get_precision() == ov::element::bf16) { paged_attn_memcpy_kernel(k_input, v_input, past_k_output, past_v_output, slot_mapping); } else { - OPENVINO_THROW("unsupport src type: ", k_input.get_precision(), ", dst type: ", past_k_output.get_precision(), " in paged_attn_memcpy"); + OPENVINO_THROW("unsupport src type: ", + k_input.get_precision(), + ", dst type: ", + past_k_output.get_precision(), + " in paged_attn_memcpy"); } } diff --git a/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/attn_memcpy.hpp b/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/attn_memcpy.hpp index c0e5892db9926b..ea704232e333bd 100644 --- a/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/attn_memcpy.hpp +++ b/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/attn_memcpy.hpp @@ -7,6 +7,7 @@ #include #include #include + #include "openvino/core/type/element_type.hpp" #include "utils/plain_tensor.hpp" diff --git a/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/attn_quant.cpp b/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/attn_quant.cpp index 66772bda03db51..095180d659142e 100644 --- a/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/attn_quant.cpp +++ b/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/attn_quant.cpp @@ -13,11 +13,11 @@ # include #endif -#include "openvino/core/type/bfloat16.hpp" -#include "openvino/core/parallel.hpp" -#include "common.hpp" #include "attn_quant.hpp" #include "attn_quant_kernel.hpp" +#include "common.hpp" +#include "openvino/core/parallel.hpp" +#include "openvino/core/type/bfloat16.hpp" namespace ov { namespace Extensions { @@ -26,7 +26,7 @@ namespace XARCH { using namespace ov; -template +template static void quant_u8(const T* src, uint8_t* dst, size_t n, float& scale, float& zp) { size_t i = 0; float max = -FLT_MAX; @@ -182,16 +182,8 @@ static void attn_quant_mt(const ov::intel_cpu::PlainTensor& k_src, parallel_for3d(L1, B, H, [&](size_t m, size_t b, size_t h) { auto p_k = k_scale_zp.ptr(m, b, h); auto p_v = v_scale_zp.ptr(m, b, h); - quant_u8(k_src.ptr(b, h, m), - k_dst.ptr(b, h, m), - S, - p_k[0], - p_k[1]); - quant_u8(v_src.ptr(b, h, m), - v_dst.ptr(b, h, m), - SV, - p_v[0], - p_v[1]); + quant_u8(k_src.ptr(b, h, m), k_dst.ptr(b, h, m), S, p_k[0], p_k[1]); + quant_u8(v_src.ptr(b, h, m), v_dst.ptr(b, h, m), SV, p_v[0], p_v[1]); }); } @@ -205,14 +197,16 @@ static void paged_attn_quant_mt(const ov::intel_cpu::PlainTensor& k_src, size_t block_size = k_dst.m_dims[2]; parallel_for3d(B, L1, H, [&](size_t b, size_t m, size_t h) { auto slot = slot_mapping.ptr(b)[m]; - if (slot < 0) return; + if (slot < 0) + return; auto block_number = slot / block_size; auto block_offset = slot % 
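The paged variants above differ from the plain attn_memcpy_kernel only in addressing: slot_mapping gives each token a flat slot that is split into a block number and an in-block offset before the copy, and a negative slot marks a token to skip. A per-head scalar sketch of that addressing, with the cache layout simplified to [blocks][block_size][S]:

    #include <cstddef>
    #include <cstdint>
    #include <cstring>

    inline void paged_copy_token(float* cache, const float* src, int32_t slot,
                                 size_t block_size, size_t S) {
        if (slot < 0)
            return;                                            // padding token, nothing to store
        const size_t block_number = static_cast<size_t>(slot) / block_size;
        const size_t block_offset = static_cast<size_t>(slot) % block_size;
        std::memcpy(cache + (block_number * block_size + block_offset) * S,
                    src, S * sizeof(float));
    }
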
block_size; auto p_k = reinterpret_cast(k_dst.ptr(block_number, h, block_offset)); auto p_v = reinterpret_cast(v_dst.ptr(block_number, h, block_offset)); // The layout for per token per head: - // |scale(f32)|zeropoint(f32)|quantized feature(u8,idx_1)|quantized feature(u8,idx_2)|...|quantized feature(u8,idx_S)| + // |scale(f32)|zeropoint(f32)|quantized feature(u8,idx_1)|quantized feature(u8,idx_2)|...|quantized + // feature(u8,idx_S)| quant_u8(k_src.ptr(b, h, m), k_dst.ptr(block_number, h, block_offset) + sizeof(float) + sizeof(float), S, @@ -239,7 +233,11 @@ void attn_quantkv(const ov::intel_cpu::PlainTensor& k_src, } else if (k_src.get_precision() == ov::element::f16 && k_dst.get_precision() == ov::element::u8) { attn_quant_mt(k_src, v_src, k_dst, v_dst, k_scale_zp, v_scale_zp); } else { - OPENVINO_THROW("unsupport src type: ", k_src.get_precision(), ", dst type: ", k_dst.get_precision(), " in attn_quantkv"); + OPENVINO_THROW("unsupport src type: ", + k_src.get_precision(), + ", dst type: ", + k_dst.get_precision(), + " in attn_quantkv"); } } @@ -255,7 +253,11 @@ void paged_attn_quantkv(const ov::intel_cpu::PlainTensor& k_src, } else if (k_src.get_precision() == ov::element::f16 && k_dst.get_precision() == ov::element::u8) { paged_attn_quant_mt(k_src, v_src, k_dst, v_dst, slot_mapping); } else { - OPENVINO_THROW("unsupport src type: ", k_src.get_precision(), ", dst type: ", k_dst.get_precision(), " in paged_attn_quantkv"); + OPENVINO_THROW("unsupport src type: ", + k_src.get_precision(), + ", dst type: ", + k_dst.get_precision(), + " in paged_attn_quantkv"); } } diff --git a/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/attn_quant.hpp b/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/attn_quant.hpp index ca930a1055db2b..2f39f74f5b3460 100644 --- a/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/attn_quant.hpp +++ b/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/attn_quant.hpp @@ -7,6 +7,7 @@ #include #include #include + #include "openvino/core/type/element_type.hpp" #include "utils/plain_tensor.hpp" diff --git a/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/attn_quant_kernel.hpp b/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/attn_quant_kernel.hpp index 4e013a004d29f9..759d0005103871 100644 --- a/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/attn_quant_kernel.hpp +++ b/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/attn_quant_kernel.hpp @@ -3,19 +3,21 @@ // #pragma once -#include +#include "nodes/kernels/scaled_attn/common.hpp" + +#if defined(HAVE_SSE) || defined(HAVE_AVX2) || defined(HAVE_AVX512F) +# include +#endif + #include #include -#include -#include "openvino/core/type/element_type.hpp" -#include "utils/plain_tensor.hpp" namespace ov { namespace Extensions { namespace Cpu { namespace XARCH { -template +template void attn_dequant_u8_kernel(const uint8_t* src, TDST* dst, size_t n, float scale, float zp) { size_t i = 0; // loadu_si128/epi64 does not support const qualifier @@ -53,4 +55,4 @@ void attn_dequant_u8_kernel(const uint8_t* src, TDST* dst, size_t n, float scale } // namespace XARCH } // namespace Cpu } // namespace Extensions -} // namespace ov \ No newline at end of file +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/common.hpp b/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/common.hpp index 2956c8a6a6b5b8..63cbbb4464ee92 100644 --- a/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/common.hpp +++ b/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/common.hpp @@ -4,16 +4,23 @@ #pragma 
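The layout comment above describes one record per token and head: a float scale, a float zero point, then the quantized bytes. A scalar sketch of producing such a record with the same asymmetric u8 scheme (range mapped onto [0, 255], zero point derived from the minimum); the helper name is illustrative:

    #include <algorithm>
    #include <cfloat>
    #include <cmath>
    #include <cstdint>
    #include <cstring>

    // Writes [scale(f32)][zp(f32)][u8 x n] into dst (dst must hold 2*sizeof(float) + n bytes).
    inline void quant_u8_record(const float* src, uint8_t* dst, size_t n) {
        float lo = FLT_MAX, hi = -FLT_MAX;
        for (size_t i = 0; i < n; i++) {
            lo = std::min(lo, src[i]);
            hi = std::max(hi, src[i]);
        }
        float scale = (hi - lo) / 255.0f;
        if (scale == 0.0f)
            scale = 0.0001f;                     // guard against constant inputs
        float zp = -lo / scale;                  // so that lo quantizes to 0
        std::memcpy(dst, &scale, sizeof(float));
        std::memcpy(dst + sizeof(float), &zp, sizeof(float));
        uint8_t* q = dst + 2 * sizeof(float);
        for (size_t i = 0; i < n; i++) {
            long v = std::lround(src[i] / scale + zp);
            q[i] = static_cast<uint8_t>(std::min(255L, std::max(0L, v)));
        }
    }
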
once #include +#include #include #include #include -#include #include "openvino/core/type/bfloat16.hpp" #include "openvino/core/type/float16.hpp" +#if defined(HAVE_SSE) || defined(HAVE_AVX2) || defined(HAVE_AVX512F) +# include +#endif + #if defined(OPENVINO_ARCH_ARM64) -#include "arm_neon.h" +# if defined(HAVE_SVE) +# include "arm_sve.h" +# endif +# include "arm_neon.h" #endif namespace ov { @@ -31,308 +38,385 @@ static constexpr size_t vec_len_f32_avx2 = vec_len_avx2 / sizeof(float); static constexpr size_t vec_len_f32_neon = vec_len_neon / sizeof(float); static constexpr size_t vec_len_f16_neon = vec_len_neon / sizeof(ov::float16); +#if defined(HAVE_SVE) +static constexpr size_t vec_len_f32_sve = svcntw(); +#endif + #ifdef HAVE_AVX512F - inline __m512 cvt_bf16_to_fp32(const __m256i src) { - __m512i y = _mm512_cvtepu16_epi32(src); - return _mm512_castsi512_ps(_mm512_slli_epi32(y, 16)); - } - - // load addr to __m512 reg - inline __m512 mm512_uni_loadu_ps(const float* a) { - return _mm512_loadu_ps(a); - } - - inline __m512 mm512_uni_loadu_ps(const ov::bfloat16* a) { - auto vec_bf16 = _mm256_loadu_si256(reinterpret_cast(a)); - return cvt_bf16_to_fp32(vec_bf16); - } - - inline __m512 mm512_uni_loadu_ps(const ov::float16* a) { - auto vec_f16 = _mm256_loadu_si256(reinterpret_cast(a)); - return _mm512_cvtph_ps(vec_f16); - } - - // load addr to __m512 reg - inline __m512 mm512_uni_loadu_tail_ps(const float* a, size_t count) { - __mmask16 mask = (1 << count) - 1; - return _mm512_maskz_loadu_ps(mask, a); - } - - inline __m512 mm512_uni_loadu_tail_ps(const ov::bfloat16* a, size_t count) { - auto mask = (1 << count) - 1; - auto bf16_vec = _mm256_maskz_loadu_epi16(mask, a); - return cvt_bf16_to_fp32(bf16_vec); - } - - inline __m512 mm512_uni_loadu_tail_ps(const ov::float16* a, size_t count) { - auto mask = (1 << count) - 1; - auto f16_vec = _mm256_maskz_loadu_epi16(mask, a); - return _mm512_cvtph_ps(f16_vec); - } - - // store __m512 reg to addr - inline void mm512_uni_storeu_ps(float* a, __m512 v) { - _mm512_storeu_ps(a, v); - } - inline void mm512_uni_storeu_ps(ov::bfloat16 *addr, __m512 xps) { - __m512i xpi32 = _mm512_castps_si512(xps); - __m512i nan = _mm512_set1_epi32(0xffff); - auto mask = _mm512_cmp_ps_mask(xps, xps, _CMP_ORD_Q); - __m512i ones = _mm512_set1_epi32(0x1); - __m512i vec_bias = _mm512_set1_epi32(0x7fff); - auto x = _mm512_and_si512(_mm512_srli_epi32(xpi32, 16), ones); // LSB = x[16] - x = _mm512_add_epi32(x, vec_bias); // rounding_bias = 0x7fff + LSB - x = _mm512_srli_epi32(_mm512_add_epi32(x, xpi32), 16); // x = (x + rounding_bias) >> 16; - x = _mm512_mask_blend_epi32(mask, nan, x); // Check NaN before converting back to bf16 - _mm256_storeu_si256(reinterpret_cast<__m256i *>(addr), _mm512_cvtepi32_epi16(x)); - } - - inline void mm512_uni_storeu_ps(ov::float16* addr, __m512 v) { - __m256i vec_f16 = _mm512_cvtps_ph(v, 0); - _mm256_storeu_si256(reinterpret_cast<__m256i *>(addr), vec_f16); - } - - // store __m512 reg to addr - inline void mm512_uni_mask_storeu_ps(ov::bfloat16 *addr, __mmask16 mask_addr, __m512 xps) { - __m512i xpi32 = _mm512_castps_si512(xps); - __m512i nan = _mm512_set1_epi32(0xffff); - auto mask = _mm512_cmp_ps_mask(xps, xps, _CMP_ORD_Q); - __m512i ones = _mm512_set1_epi32(0x1); - __m512i vec_bias = _mm512_set1_epi32(0x7fff); - auto x = _mm512_and_si512(_mm512_srli_epi32(xpi32, 16), ones); // LSB = x[16] - x = _mm512_add_epi32(x, vec_bias); // rounding_bias = 0x7fff + LSB - x = _mm512_srli_epi32(_mm512_add_epi32(x, xpi32), 16); // x = (x + rounding_bias) >> 16; - x 
= _mm512_mask_blend_epi32(mask, nan, x); // Check NaN before converting back to bf16 - _mm512_mask_cvtepi32_storeu_epi16(addr, mask_addr, x); - } - - inline void mm512_uni_storeu_tail_ps(float *addr, __m512 v, size_t count) { - __mmask16 mask_addr = (1 << count) - 1; - _mm512_mask_storeu_ps(addr, mask_addr, v); - } - - inline void mm512_uni_storeu_tail_ps(ov::bfloat16 *addr, __m512 v, size_t count) { - __mmask16 mask_addr = (1 << count) - 1; - __m512i xpi32 = _mm512_castps_si512(v); - __m512i nan = _mm512_set1_epi32(0xffff); - auto mask = _mm512_cmp_ps_mask(v, v, _CMP_ORD_Q); - __m512i ones = _mm512_set1_epi32(0x1); - __m512i vec_bias = _mm512_set1_epi32(0x7fff); - auto x = _mm512_and_si512(_mm512_srli_epi32(xpi32, 16), ones); // LSB = x[16] - x = _mm512_add_epi32(x, vec_bias); // rounding_bias = 0x7fff + LSB - x = _mm512_srli_epi32(_mm512_add_epi32(x, xpi32), 16); // x = (x + rounding_bias) >> 16; - x = _mm512_mask_blend_epi32(mask, nan, x); // Check NaN before converting back to bf16 - _mm512_mask_cvtepi32_storeu_epi16(addr, mask_addr, x); - } - - inline void mm512_uni_storeu_tail_ps(ov::float16 *addr, __m512 v, size_t count) { - __mmask16 mask_addr = (1 << count) - 1; - __m256i vec_f16 = _mm512_cvtps_ph(v, 0); - _mm256_mask_storeu_epi16(reinterpret_cast<__m256i *>(addr), mask_addr, vec_f16); - } +inline __m512 cvt_bf16_to_fp32(const __m256i src) { + __m512i y = _mm512_cvtepu16_epi32(src); + return _mm512_castsi512_ps(_mm512_slli_epi32(y, 16)); +} + +// load addr to __m512 reg +inline __m512 mm512_uni_loadu_ps(const float* a) { + return _mm512_loadu_ps(a); +} + +inline __m512 mm512_uni_loadu_ps(const ov::bfloat16* a) { + auto vec_bf16 = _mm256_loadu_si256(reinterpret_cast(a)); + return cvt_bf16_to_fp32(vec_bf16); +} + +inline __m512 mm512_uni_loadu_ps(const ov::float16* a) { + auto vec_f16 = _mm256_loadu_si256(reinterpret_cast(a)); + return _mm512_cvtph_ps(vec_f16); +} + +// load addr to __m512 reg +inline __m512 mm512_uni_loadu_tail_ps(const float* a, size_t count) { + __mmask16 mask = (1 << count) - 1; + return _mm512_maskz_loadu_ps(mask, a); +} + +inline __m512 mm512_uni_loadu_tail_ps(const ov::bfloat16* a, size_t count) { + auto mask = (1 << count) - 1; + auto bf16_vec = _mm256_maskz_loadu_epi16(mask, a); + return cvt_bf16_to_fp32(bf16_vec); +} + +inline __m512 mm512_uni_loadu_tail_ps(const ov::float16* a, size_t count) { + auto mask = (1 << count) - 1; + auto f16_vec = _mm256_maskz_loadu_epi16(mask, a); + return _mm512_cvtph_ps(f16_vec); +} + +// store __m512 reg to addr +inline void mm512_uni_storeu_ps(float* a, __m512 v) { + _mm512_storeu_ps(a, v); +} +inline void mm512_uni_storeu_ps(ov::bfloat16* addr, __m512 xps) { + __m512i xpi32 = _mm512_castps_si512(xps); + __m512i nan = _mm512_set1_epi32(0xffff); + auto mask = _mm512_cmp_ps_mask(xps, xps, _CMP_ORD_Q); + __m512i ones = _mm512_set1_epi32(0x1); + __m512i vec_bias = _mm512_set1_epi32(0x7fff); + auto x = _mm512_and_si512(_mm512_srli_epi32(xpi32, 16), ones); // LSB = x[16] + x = _mm512_add_epi32(x, vec_bias); // rounding_bias = 0x7fff + LSB + x = _mm512_srli_epi32(_mm512_add_epi32(x, xpi32), 16); // x = (x + rounding_bias) >> 16; + x = _mm512_mask_blend_epi32(mask, nan, x); // Check NaN before converting back to bf16 + _mm256_storeu_si256(reinterpret_cast<__m256i*>(addr), _mm512_cvtepi32_epi16(x)); +} + +inline void mm512_uni_storeu_ps(ov::float16* addr, __m512 v) { + __m256i vec_f16 = _mm512_cvtps_ph(v, 0); + _mm256_storeu_si256(reinterpret_cast<__m256i*>(addr), vec_f16); +} + +// store __m512 reg to addr +inline void 
mm512_uni_mask_storeu_ps(ov::bfloat16* addr, __mmask16 mask_addr, __m512 xps) { + __m512i xpi32 = _mm512_castps_si512(xps); + __m512i nan = _mm512_set1_epi32(0xffff); + auto mask = _mm512_cmp_ps_mask(xps, xps, _CMP_ORD_Q); + __m512i ones = _mm512_set1_epi32(0x1); + __m512i vec_bias = _mm512_set1_epi32(0x7fff); + auto x = _mm512_and_si512(_mm512_srli_epi32(xpi32, 16), ones); // LSB = x[16] + x = _mm512_add_epi32(x, vec_bias); // rounding_bias = 0x7fff + LSB + x = _mm512_srli_epi32(_mm512_add_epi32(x, xpi32), 16); // x = (x + rounding_bias) >> 16; + x = _mm512_mask_blend_epi32(mask, nan, x); // Check NaN before converting back to bf16 + _mm512_mask_cvtepi32_storeu_epi16(addr, mask_addr, x); +} + +inline void mm512_uni_storeu_tail_ps(float* addr, __m512 v, size_t count) { + __mmask16 mask_addr = (1 << count) - 1; + _mm512_mask_storeu_ps(addr, mask_addr, v); +} + +inline void mm512_uni_storeu_tail_ps(ov::bfloat16* addr, __m512 v, size_t count) { + __mmask16 mask_addr = (1 << count) - 1; + __m512i xpi32 = _mm512_castps_si512(v); + __m512i nan = _mm512_set1_epi32(0xffff); + auto mask = _mm512_cmp_ps_mask(v, v, _CMP_ORD_Q); + __m512i ones = _mm512_set1_epi32(0x1); + __m512i vec_bias = _mm512_set1_epi32(0x7fff); + auto x = _mm512_and_si512(_mm512_srli_epi32(xpi32, 16), ones); // LSB = x[16] + x = _mm512_add_epi32(x, vec_bias); // rounding_bias = 0x7fff + LSB + x = _mm512_srli_epi32(_mm512_add_epi32(x, xpi32), 16); // x = (x + rounding_bias) >> 16; + x = _mm512_mask_blend_epi32(mask, nan, x); // Check NaN before converting back to bf16 + _mm512_mask_cvtepi32_storeu_epi16(addr, mask_addr, x); +} + +inline void mm512_uni_storeu_tail_ps(ov::float16* addr, __m512 v, size_t count) { + __mmask16 mask_addr = (1 << count) - 1; + __m256i vec_f16 = _mm512_cvtps_ph(v, 0); + _mm256_mask_storeu_epi16(reinterpret_cast<__m256i*>(addr), mask_addr, vec_f16); +} #endif #ifdef HAVE_AVX2 - inline __m256i get_mask(int N7) { - static __m256i mask[] = { - _mm256_set_epi32(0, 0, 0, 0, 0, 0, 0, 0), - _mm256_set_epi32(0, 0, 0, 0, 0, 0, 0, -1), - _mm256_set_epi32(0, 0, 0, 0, 0, 0, -1, -1), - _mm256_set_epi32(0, 0, 0, 0, 0, -1, -1, -1), - _mm256_set_epi32(0, 0, 0, 0, -1, -1, -1, -1), - _mm256_set_epi32(0, 0, 0, -1, -1, -1, -1, -1), - _mm256_set_epi32(0, 0, -1, -1, -1, -1, -1, -1), - _mm256_set_epi32(0, -1, -1, -1, -1, -1, -1, -1), - _mm256_set_epi32(-1, -1, -1, -1, -1, -1, -1, -1), - }; - return _mm256_loadu_si256(&mask[N7]); - } - - // load addr to __m256 reg - inline __m256 mm256_uni_loadu_ps(const float* a) { - return _mm256_loadu_ps(a); - } - - inline __m256 mm256_uni_loadu_ps(const ov::bfloat16* a) { - auto vec_bf16 = _mm_loadu_si128(reinterpret_cast(a)); - auto o = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_cvtepu16_epi32(vec_bf16), 16)); - return o; - } - - inline __m256 mm256_uni_loadu_ps(const ov::float16* a) { - auto vec_f16 = _mm_loadu_si128(reinterpret_cast(a)); - auto o = _mm256_cvtph_ps(vec_f16); - return o; - } - - // load addr tail to __m256 reg - inline __m256 mm256_uni_loadu_tail_ps(const float* a, const size_t count) { - auto mask = get_mask(count); - return _mm256_maskload_ps(a, mask); - } - - inline __m256 mm256_uni_loadu_tail_ps(const ov::bfloat16* a, const size_t count) { - assert("AVX2 version of bfloat16 tail load is just for compilation pass"); - ov::bfloat16 tmp_values[8] = {0}; - std::memcpy(tmp_values, a, count * sizeof(ov::bfloat16)); - return mm256_uni_loadu_ps(tmp_values); - } - - inline __m256 mm256_uni_loadu_tail_ps(const ov::float16* a, const size_t count) { - ov::float16 tmp_values[8] = 
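The bf16 store helpers above implement round-to-nearest-even on the truncated upper half of the f32 bit pattern: take the LSB of the 16 bits that survive, add 0x7fff plus that LSB, shift right by 16, and force NaN inputs to 0xffff so they stay NaN after truncation. A scalar sketch of the same trick:

    #include <cmath>
    #include <cstdint>
    #include <cstring>

    inline uint16_t f32_to_bf16_rne(float f) {
        uint32_t x;
        std::memcpy(&x, &f, sizeof(x));
        if (std::isnan(f))
            return 0xffff;                      // stays a NaN in bf16
        const uint32_t lsb = (x >> 16) & 1u;    // LSB of the retained half
        x += 0x7fffu + lsb;                     // rounding bias: ties go to even
        return static_cast<uint16_t>(x >> 16);
    }
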
{0}; - std::memcpy(tmp_values, a, count * sizeof(ov::float16)); - return mm256_uni_loadu_ps(tmp_values); - } - - // store __m256 reg to addr - inline void mm256_uni_storeu_ps(float* a, __m256 v) { - _mm256_storeu_ps(a, v); - } - - inline void mm256_uni_storeu_ps(ov::bfloat16 *addr, __m256 xps) { - __m256i xpi32 = _mm256_castps_si256(xps); - __m256i nan = _mm256_set1_epi32(0xffff); - __m256i mask = _mm256_castps_si256(_mm256_cmp_ps(xps, xps, _CMP_ORD_Q)); - __m256i ones = _mm256_set1_epi32(0x1); - __m256i vec_bias = _mm256_set1_epi32(0x7fff); - auto x = _mm256_and_si256(_mm256_srli_epi32(xpi32, 16), ones); // LSB = x[16] - x = _mm256_add_epi32(x, vec_bias); // rounding_bias = 0x7fff + LSB - x = _mm256_srli_epi32(_mm256_add_epi32(x, xpi32), 16); // x = (x + rounding_bias) >> 16; - x = _mm256_blendv_epi8(nan, x, mask); // Check NaN before converting back to bf16 - x = _mm256_packus_epi32(x, x); - x = _mm256_permute4x64_epi64(x, 0xd8); - __m128i bf16_o = _mm256_extractf128_si256(x, 0); - _mm_storeu_si128(reinterpret_cast<__m128i *>(addr), bf16_o); - } - - inline void mm256_uni_storeu_ps(ov::float16* a, __m256 v) { - __m128i vec_f16 = _mm256_cvtps_ph(v, 0); - _mm_storeu_si128(reinterpret_cast<__m128i *>(a), vec_f16); - } - - // store __m256 to addr - inline void mm256_uni_storeu_tail_ps(float *addr, __m256 v, size_t count) { - const auto mask = get_mask(count); - return _mm256_maskstore_ps(addr, mask, v); - } - - inline void hsum(__m256& x) { - __m256 y; // x: 0 1 2 3 4 5 6 7 - y = _mm256_permute_ps(x, 0x39); // y: 1 2 3 0 5 6 7 4 - x = _mm256_add_ps(x, y); // X: 01 12 23 30 45 56 67 74 - y = _mm256_permute_ps(x, 0x4e); // y: 23 30 01 12 67 74 45 56 - x = _mm256_add_ps(x, y); // x: 0123 x x x 4567 x x x - y = _mm256_permute2f128_ps(x, x, 1); // y: 4567 x x x 0123 x x x - x = _mm256_add_ps(x, y); // x: 01234567 x x x x x x x - } - inline void hmax(__m256& x) { - __m256 y; // x: 0 1 2 3 4 5 6 7 - y = _mm256_permute_ps(x, 0x39); // y: 1 2 3 0 5 6 7 4 - x = _mm256_max_ps(x, y); // X: 01 12 23 30 45 56 67 74 - y = _mm256_permute_ps(x, 0x4e); // y: 23 30 01 12 67 74 45 56 - x = _mm256_max_ps(x, y); // x: 0123 x x x 4567 x x x - y = _mm256_permute2f128_ps(x, x, 1); // y: 4567 x x x 0123 x x x - x = _mm256_max_ps(x, y); // x: 01234567 x x x x x x x - } - inline void hmin(__m256& x) { - __m256 y; // x: 0 1 2 3 4 5 6 7 - y = _mm256_permute_ps(x, 0x39); // y: 1 2 3 0 5 6 7 4 - x = _mm256_min_ps(x, y); // X: 01 12 23 30 45 56 67 74 - y = _mm256_permute_ps(x, 0x4e); // y: 23 30 01 12 67 74 45 56 - x = _mm256_min_ps(x, y); // x: 0123 x x x 4567 x x x - y = _mm256_permute2f128_ps(x, x, 1); // y: 4567 x x x 0123 x x x - x = _mm256_min_ps(x, y); // x: 01234567 x x x x x x x - } +inline __m256i get_mask(int N7) { + static __m256i mask[] = { + _mm256_set_epi32(0, 0, 0, 0, 0, 0, 0, 0), + _mm256_set_epi32(0, 0, 0, 0, 0, 0, 0, -1), + _mm256_set_epi32(0, 0, 0, 0, 0, 0, -1, -1), + _mm256_set_epi32(0, 0, 0, 0, 0, -1, -1, -1), + _mm256_set_epi32(0, 0, 0, 0, -1, -1, -1, -1), + _mm256_set_epi32(0, 0, 0, -1, -1, -1, -1, -1), + _mm256_set_epi32(0, 0, -1, -1, -1, -1, -1, -1), + _mm256_set_epi32(0, -1, -1, -1, -1, -1, -1, -1), + _mm256_set_epi32(-1, -1, -1, -1, -1, -1, -1, -1), + }; + return _mm256_loadu_si256(&mask[N7]); +} + +// load addr to __m256 reg +inline __m256 mm256_uni_loadu_ps(const float* a) { + return _mm256_loadu_ps(a); +} + +inline __m256 mm256_uni_loadu_ps(const ov::bfloat16* a) { + auto vec_bf16 = _mm_loadu_si128(reinterpret_cast(a)); + auto o = 
_mm256_castsi256_ps(_mm256_slli_epi32(_mm256_cvtepu16_epi32(vec_bf16), 16)); + return o; +} + +inline __m256 mm256_uni_loadu_ps(const ov::float16* a) { + auto vec_f16 = _mm_loadu_si128(reinterpret_cast(a)); + auto o = _mm256_cvtph_ps(vec_f16); + return o; +} + +// load addr tail to __m256 reg +inline __m256 mm256_uni_loadu_tail_ps(const float* a, const size_t count) { + auto mask = get_mask(count); + return _mm256_maskload_ps(a, mask); +} + +inline __m256 mm256_uni_loadu_tail_ps(const ov::bfloat16* a, const size_t count) { + assert("AVX2 version of bfloat16 tail load is just for compilation pass"); + ov::bfloat16 tmp_values[8] = {0}; + std::memcpy(tmp_values, a, count * sizeof(ov::bfloat16)); + return mm256_uni_loadu_ps(tmp_values); +} + +inline __m256 mm256_uni_loadu_tail_ps(const ov::float16* a, const size_t count) { + ov::float16 tmp_values[8] = {0}; + std::memcpy(tmp_values, a, count * sizeof(ov::float16)); + return mm256_uni_loadu_ps(tmp_values); +} + +// store __m256 reg to addr +inline void mm256_uni_storeu_ps(float* a, __m256 v) { + _mm256_storeu_ps(a, v); +} + +inline void mm256_uni_storeu_ps(ov::bfloat16* addr, __m256 xps) { + __m256i xpi32 = _mm256_castps_si256(xps); + __m256i nan = _mm256_set1_epi32(0xffff); + __m256i mask = _mm256_castps_si256(_mm256_cmp_ps(xps, xps, _CMP_ORD_Q)); + __m256i ones = _mm256_set1_epi32(0x1); + __m256i vec_bias = _mm256_set1_epi32(0x7fff); + auto x = _mm256_and_si256(_mm256_srli_epi32(xpi32, 16), ones); // LSB = x[16] + x = _mm256_add_epi32(x, vec_bias); // rounding_bias = 0x7fff + LSB + x = _mm256_srli_epi32(_mm256_add_epi32(x, xpi32), 16); // x = (x + rounding_bias) >> 16; + x = _mm256_blendv_epi8(nan, x, mask); // Check NaN before converting back to bf16 + x = _mm256_packus_epi32(x, x); + x = _mm256_permute4x64_epi64(x, 0xd8); + __m128i bf16_o = _mm256_extractf128_si256(x, 0); + _mm_storeu_si128(reinterpret_cast<__m128i*>(addr), bf16_o); +} + +inline void mm256_uni_storeu_ps(ov::float16* a, __m256 v) { + __m128i vec_f16 = _mm256_cvtps_ph(v, 0); + _mm_storeu_si128(reinterpret_cast<__m128i*>(a), vec_f16); +} + +// store __m256 to addr +inline void mm256_uni_storeu_tail_ps(float* addr, __m256 v, size_t count) { + const auto mask = get_mask(count); + return _mm256_maskstore_ps(addr, mask, v); +} + +inline void hsum(__m256& x) { + __m256 y; // x: 0 1 2 3 4 5 6 7 + y = _mm256_permute_ps(x, 0x39); // y: 1 2 3 0 5 6 7 4 + x = _mm256_add_ps(x, y); // X: 01 12 23 30 45 56 67 74 + y = _mm256_permute_ps(x, 0x4e); // y: 23 30 01 12 67 74 45 56 + x = _mm256_add_ps(x, y); // x: 0123 x x x 4567 x x x + y = _mm256_permute2f128_ps(x, x, 1); // y: 4567 x x x 0123 x x x + x = _mm256_add_ps(x, y); // x: 01234567 x x x x x x x +} +inline void hmax(__m256& x) { + __m256 y; // x: 0 1 2 3 4 5 6 7 + y = _mm256_permute_ps(x, 0x39); // y: 1 2 3 0 5 6 7 4 + x = _mm256_max_ps(x, y); // X: 01 12 23 30 45 56 67 74 + y = _mm256_permute_ps(x, 0x4e); // y: 23 30 01 12 67 74 45 56 + x = _mm256_max_ps(x, y); // x: 0123 x x x 4567 x x x + y = _mm256_permute2f128_ps(x, x, 1); // y: 4567 x x x 0123 x x x + x = _mm256_max_ps(x, y); // x: 01234567 x x x x x x x +} +inline void hmin(__m256& x) { + __m256 y; // x: 0 1 2 3 4 5 6 7 + y = _mm256_permute_ps(x, 0x39); // y: 1 2 3 0 5 6 7 4 + x = _mm256_min_ps(x, y); // X: 01 12 23 30 45 56 67 74 + y = _mm256_permute_ps(x, 0x4e); // y: 23 30 01 12 67 74 45 56 + x = _mm256_min_ps(x, y); // x: 0123 x x x 4567 x x x + y = _mm256_permute2f128_ps(x, x, 1); // y: 4567 x x x 0123 x x x + x = _mm256_min_ps(x, y); // x: 01234567 x x x x x x x +} #endif 
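get_mask above precomputes the nine possible lane masks so the tail of a buffer (n % 8 floats) can be handled with one _mm256_maskload_ps / _mm256_maskstore_ps pair instead of a scalar loop, without touching memory past the end. A self-contained sketch of the pattern, assuming AVX2 and building the mask in place rather than from a table:

    #include <immintrin.h>
    #include <cstddef>
    #include <cstdint>

    inline void copy_f32_avx2(float* dst, const float* src, size_t n) {
        size_t i = 0;
        for (; i + 8 <= n; i += 8)
            _mm256_storeu_ps(dst + i, _mm256_loadu_ps(src + i));
        const size_t tail = n - i;
        if (tail) {
            alignas(32) int32_t m[8] = {0, 0, 0, 0, 0, 0, 0, 0};
            for (size_t j = 0; j < tail; j++)
                m[j] = -1;                       // -1 enables the lane, 0 masks it off
            const __m256i mask = _mm256_load_si256(reinterpret_cast<const __m256i*>(m));
            _mm256_maskstore_ps(dst + i, mask, _mm256_maskload_ps(src + i, mask));
        }
    }
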
#ifdef OPENVINO_ARCH_ARM64 - inline float32x4_t exp_ps_neon_f32(const float32x4_t& src) { - const auto c1 = vreinterpretq_f32_u32(vdupq_n_u32(0x3f7ffff6)); - const auto c2 = vreinterpretq_f32_u32(vdupq_n_u32(0x3efffedb)); - const auto c3 = vreinterpretq_f32_u32(vdupq_n_u32(0x3e2aaf33)); - const auto c4 = vreinterpretq_f32_u32(vdupq_n_u32(0x3d2b9f17)); - const auto c5 = vreinterpretq_f32_u32(vdupq_n_u32(0x3c072010)); - - const auto shift = vreinterpretq_f32_u32(vdupq_n_u32(0x4b00007f)); // 2^23 + 127 = 0x1.0000fep23f - const auto one = vdupq_n_f32(1.0f); // 1 - const auto two = vdupq_n_f32(2.0f); // 2 - const auto inv_ln2 = vreinterpretq_f32_u32(vdupq_n_u32(0x3fb8aa3b)); - const auto neg_ln2_hi = vreinterpretq_f32_u32(vdupq_n_u32(0xbf317200)); - const auto neg_ln2_lo = vreinterpretq_f32_u32(vdupq_n_u32(0xb5bfbe8e)); - - const auto inf = vdupq_n_f32(std::numeric_limits::infinity()); - const auto max_input = vdupq_n_f32(88.37f); // Approximately ln(2^127.5) - const auto zero = vdupq_n_f32(0.f); - const auto min_input = vdupq_n_f32(-86.64f); // Approximately ln(2^-125) - - const auto z = vmlaq_f32(shift, src, inv_ln2); - auto n = z - shift; - n = vsubq_f32(n, one); - const auto scale = vreinterpretq_f32_u32(vreinterpretq_u32_f32(z) << 23); // 2^n - - const auto r_hi = vfmaq_f32(src, n, neg_ln2_hi); - const auto r = vfmaq_f32(r_hi, n, neg_ln2_lo); - - const auto r2 = r * r; - - const auto p1 = c1 * r; - const auto p23 = vfmaq_f32(c2, c3, r); - const auto p45 = vfmaq_f32(c4, c5, r); - const auto p2345 = vfmaq_f32(p23, p45, r2); - const auto p12345 = vfmaq_f32(p1, p2345, r2); - - auto poly = vfmaq_f32(scale, p12345, scale); - poly = vmulq_f32(poly, two); - - poly = vbslq_f32(vcltq_f32(src, min_input), zero, poly); - poly = vbslq_f32(vcgtq_f32(src, max_input), inf, poly); - - return poly; - } - inline float32x4_t __vld1q_f32(const ov::bfloat16* a) { - uint16x4_t vec_bf16 = vld1_u16(reinterpret_cast(a)); - - float32x4_t vec_f32 = vcvtq_f32_u32(vmovl_u16(vec_bf16)); - return vec_f32; - } - inline float32x4_t __vld1q_f32(const float* a) { - return vld1q_f32(a); - } - inline float32x4_t __vld1q_f32(const ov::float16* a) { - auto _a = reinterpret_cast(a); - return vcvt_f32_f16(vld1_f16(_a)); - } - inline void __vst1q_f32(float* a, float32x4_t b) { - vst1q_f32(a, b); - } - inline void __vst1q_f32(ov::float16* a, float32x4_t b) { - float16x4_t v_f16 = vcvt_f16_f32(b); - vst1_f16(reinterpret_cast(a), v_f16); - } - inline void __vst1q_f32(ov::bfloat16* a, float32x4_t b) { - uint32x4_t v_int32 = vreinterpretq_u32_f32(b); - uint16x4_t v_bf16 = vshrn_n_u32(v_int32, 16); - - vst1_u16(reinterpret_cast(a), v_bf16); - } +# if defined(HAVE_SVE) +inline svfloat32_t exp_ps_sve(svbool_t& pg, svfloat32_t& src) { + // Constants + const auto log2_e = svdup_n_f32(1.4426950409f); + const auto ln2 = svdup_n_f32(0.6931473921f); + const auto half_ln2_sq = svdup_n_f32(0.2413862043f); + const auto not_mask17 = svdup_n_u32(~((1u << 17) - 1)); + const auto one = svdup_n_f32(1.0f); + + // Algorithm starts here + svfloat32_t t0 = svmul_f32_z(pg, src, log2_e); // y = x * log2(e) + svfloat32_t t1 = svrintm_f32_z(pg, t0); // rount to int (float) + svint32_t t2 = svcvt_s32_f32_z(pg, t1); // n + + t1 = svsub_f32_z(pg, t0, t1); // a = y - floor(y) + t1 = svadd_f32_z(pg, t1, one); // b = a + 1 + + svuint32_t t3 = svlsr_n_u32_z(pg, svreinterpret_u32_f32(t1), 17); // v = b >> 17 (u32) + svfloat32_t t4 = svexpa_f32(t3); // c = fexpa(v) + t4 = svscale_f32_z(pg, t4, t2); // fexpa(v) * 2^(n) + + // and_(t2.d, t1.d, not_mask17.d) + svfloat32_t 
t5 = svreinterpret_f32_u32(svand_u32_z(pg, svreinterpret_u32_f32(t1), not_mask17)); + t5 = svsub_f32_z(pg, t1, t5); // z + t0 = svmla_f32_z(pg, ln2, t5, half_ln2_sq); // ln2 + half_ln2_sq * z + t0 = svmla_f32_z(pg, one, t5, t0); // 1 + (ln2 * z) + (half_ln2_sq * z * z) + t0 = svmul_f32_z(pg, t0, t4); // Final result + + return t0; +} +inline svfloat32_t exp_ps_sve_legacy(svbool_t& pg, svfloat32_t& src) { + const auto c1 = svreinterpret_f32_u32(svdup_n_u32(0x3f7ffff6)); + const auto c2 = svreinterpret_f32_u32(svdup_n_u32(0x3efffedb)); + const auto c3 = svreinterpret_f32_u32(svdup_n_u32(0x3e2aaf33)); + const auto c4 = svreinterpret_f32_u32(svdup_n_u32(0x3d2b9f17)); + const auto c5 = svreinterpret_f32_u32(svdup_n_u32(0x3c072010)); + + const auto shift = svreinterpret_f32_u32(svdup_n_u32(0x4b00007f)); // 2^23 + 127 = 0x1.0000fep23f + const auto one = svdup_n_f32(1.0f); // 1 + const auto two = svdup_n_f32(2.0f); // 2 + const auto inv_ln2 = svreinterpret_f32_u32(svdup_n_u32(0x3fb8aa3b)); + const auto neg_ln2_hi = svreinterpret_f32_u32(svdup_n_u32(0xbf317200)); + const auto neg_ln2_lo = svreinterpret_f32_u32(svdup_n_u32(0xb5bfbe8e)); + + const auto inf = svdup_n_f32(std::numeric_limits::infinity()); + const auto max_input = svdup_n_f32(88.37f); // Approximately ln(2^127.5) + const auto zero = svdup_n_f32(0.f); + const auto min_input = svdup_n_f32(-86.64f); // Approximately ln(2^-125) + + const auto z = svmla_f32_z(pg, shift, src, inv_ln2); + auto n = svsub_f32_z(pg, z, shift); + n = svsub_f32_z(pg, n, one); + const auto scale = svreinterpret_f32_u32(svlsl_n_u32_z(pg, svreinterpret_u32_f32(z), 23)); // 2^n + + const auto r_hi = svmla_f32_z(pg, src, n, neg_ln2_hi); + const auto r = svmla_f32_z(pg, r_hi, n, neg_ln2_lo); + const auto r2 = svmul_f32_z(pg, r, r); + + const auto p1 = svmul_f32_z(pg, c1, r); + const auto p23 = svmla_f32_z(pg, c2, c3, r); + const auto p45 = svmla_f32_z(pg, c4, c5, r); + const auto p2345 = svmla_f32_z(pg, p23, p45, r2); + const auto p12345 = svmla_f32_z(pg, p1, p2345, r2); + + auto poly = svmla_f32_z(pg, scale, p12345, scale); + poly = svmul_f32_z(pg, poly, two); + + poly = svsel_f32(svcmplt_f32(pg, src, min_input), zero, poly); + poly = svsel_f32(svcmpgt_f32(pg, src, max_input), inf, poly); + + return poly; +} +# endif +inline float32x4_t exp_ps_neon_f32(const float32x4_t& src) { + const auto c1 = vreinterpretq_f32_u32(vdupq_n_u32(0x3f7ffff6)); + const auto c2 = vreinterpretq_f32_u32(vdupq_n_u32(0x3efffedb)); + const auto c3 = vreinterpretq_f32_u32(vdupq_n_u32(0x3e2aaf33)); + const auto c4 = vreinterpretq_f32_u32(vdupq_n_u32(0x3d2b9f17)); + const auto c5 = vreinterpretq_f32_u32(vdupq_n_u32(0x3c072010)); + + const auto shift = vreinterpretq_f32_u32(vdupq_n_u32(0x4b00007f)); // 2^23 + 127 = 0x1.0000fep23f + const auto one = vdupq_n_f32(1.0f); // 1 + const auto two = vdupq_n_f32(2.0f); // 2 + const auto inv_ln2 = vreinterpretq_f32_u32(vdupq_n_u32(0x3fb8aa3b)); + const auto neg_ln2_hi = vreinterpretq_f32_u32(vdupq_n_u32(0xbf317200)); + const auto neg_ln2_lo = vreinterpretq_f32_u32(vdupq_n_u32(0xb5bfbe8e)); + + const auto inf = vdupq_n_f32(std::numeric_limits::infinity()); + const auto max_input = vdupq_n_f32(88.37f); // Approximately ln(2^127.5) + const auto zero = vdupq_n_f32(0.f); + const auto min_input = vdupq_n_f32(-86.64f); // Approximately ln(2^-125) + + const auto z = vmlaq_f32(shift, src, inv_ln2); + auto n = z - shift; + n = vsubq_f32(n, one); + const auto scale = vreinterpretq_f32_u32(vreinterpretq_u32_f32(z) << 23); // 2^n + + const auto r_hi = vfmaq_f32(src, n, 
neg_ln2_hi); + const auto r = vfmaq_f32(r_hi, n, neg_ln2_lo); + + const auto r2 = r * r; + + const auto p1 = c1 * r; + const auto p23 = vfmaq_f32(c2, c3, r); + const auto p45 = vfmaq_f32(c4, c5, r); + const auto p2345 = vfmaq_f32(p23, p45, r2); + const auto p12345 = vfmaq_f32(p1, p2345, r2); + + auto poly = vfmaq_f32(scale, p12345, scale); + poly = vmulq_f32(poly, two); + + poly = vbslq_f32(vcltq_f32(src, min_input), zero, poly); + poly = vbslq_f32(vcgtq_f32(src, max_input), inf, poly); + + return poly; +} +inline float32x4_t __vld1q_f32(const ov::bfloat16* a) { + uint16x4_t vec_bf16 = vld1_u16(reinterpret_cast(a)); + + float32x4_t vec_f32 = vcvtq_f32_u32(vmovl_u16(vec_bf16)); + return vec_f32; +} +inline float32x4_t __vld1q_f32(const float* a) { + return vld1q_f32(a); +} +inline float32x4_t __vld1q_f32(const ov::float16* a) { + auto _a = reinterpret_cast(a); + return vcvt_f32_f16(vld1_f16(_a)); +} +inline void __vst1q_f32(float* a, float32x4_t b) { + vst1q_f32(a, b); +} +inline void __vst1q_f32(ov::float16* a, float32x4_t b) { + float16x4_t v_f16 = vcvt_f16_f32(b); + vst1_f16(reinterpret_cast(a), v_f16); +} +inline void __vst1q_f32(ov::bfloat16* a, float32x4_t b) { + uint32x4_t v_int32 = vreinterpretq_u32_f32(b); + uint16x4_t v_bf16 = vshrn_n_u32(v_int32, 16); + + vst1_u16(reinterpret_cast(a), v_bf16); +} #endif #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) - inline float16x8_t exp_ps_neon_f16(float16x8_t x) { - const float32x4_t x_high = vcvt_f32_f16(vget_high_f16(x)); - const float32x4_t x_low = vcvt_f32_f16(vget_low_f16(x)); - - // We use f32 to maintain accuracy - const float16x8_t res = vcombine_f16(vcvt_f16_f32(exp_ps_neon_f32(x_low)), vcvt_f16_f32(exp_ps_neon_f32(x_high))); - return res; - } - inline float16_t hsum(float16x8_t vec) { - float16x4_t sum1 = vpadd_f16(vget_low_f16(vec), vget_high_f16(vec)); - float16x4_t sum2 = vpadd_f16(sum1, sum1); - float16x4_t sum3 = vpadd_f16(sum2, sum2); - return vget_lane_f16(sum3, 0); - } +inline float16x8_t exp_ps_neon_f16(float16x8_t x) { + const float32x4_t x_high = vcvt_f32_f16(vget_high_f16(x)); + const float32x4_t x_low = vcvt_f32_f16(vget_low_f16(x)); + + // We use f32 to maintain accuracy + const float16x8_t res = vcombine_f16(vcvt_f16_f32(exp_ps_neon_f32(x_low)), vcvt_f16_f32(exp_ps_neon_f32(x_high))); + return res; +} +inline float16_t hsum(float16x8_t vec) { + float16x4_t sum1 = vpadd_f16(vget_low_f16(vec), vget_high_f16(vec)); + float16x4_t sum2 = vpadd_f16(sum1, sum1); + float16x4_t sum3 = vpadd_f16(sum2, sum2); + return vget_lane_f16(sum3, 0); +} #endif } // namespace XARCH } // namespace Cpu diff --git a/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/executor_pa.cpp b/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/executor_pa.cpp index bef34881ca41bc..a74021d8ac0d05 100644 --- a/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/executor_pa.cpp +++ b/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/executor_pa.cpp @@ -13,19 +13,19 @@ # include #endif -#include "openvino/core/type/bfloat16.hpp" -#include "openvino/core/type/float16.hpp" -#include "openvino/core/parallel.hpp" +#include "attn_memcpy.hpp" +#include "attn_quant.hpp" +#include "attn_quant_kernel.hpp" +#include "common.hpp" #include "executor_pa.hpp" #include "executor_pa_common.hpp" -#include "common.hpp" -#include "attn_quant_kernel.hpp" +#include "nodes/kernels/x64/brgemm_kernel.hpp" +#include "openvino/core/parallel.hpp" +#include "openvino/core/type/bfloat16.hpp" +#include "openvino/core/type/float16.hpp" #include "softmax_kernel.hpp" #include 
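Both exp_ps_sve and exp_ps_neon_f32 above use the classic range reduction x = n*ln2 + r: the integer part n ends up in the exponent field (svexpa/svscale on SVE, a shift into the exponent bits on NEON), a short polynomial approximates e^r, and inputs outside roughly [-86.64, 88.37] are clamped to 0 or infinity. A scalar reference of the idea only, with a coarser polynomial than either kernel uses:

    #include <cmath>

    inline float exp_reduced(float x) {
        if (x > 88.37f)  return INFINITY;        // overflow clamp, as above
        if (x < -86.64f) return 0.0f;            // underflow clamp, as above
        const float n = std::nearbyintf(x * 1.44269504f);     // round(x / ln2)
        const float r = x - n * 0.69314718f;                   // |r| <= ~ln2/2
        // short series for e^r; the kernels use tuned coefficients instead
        const float p = 1.0f + r * (1.0f + r * (0.5f + r * (1.0f / 6.0f + r * (1.0f / 24.0f))));
        return std::ldexp(p, static_cast<int>(n));             // scale by 2^n
    }
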
"transpose_kernel.hpp" #include "utils/plain_tensor.hpp" -#include "attn_memcpy.hpp" -#include "attn_quant.hpp" -#include "nodes/kernels/x64/brgemm_kernel.hpp" namespace ov { namespace Extensions { @@ -38,42 +38,43 @@ using namespace ov::intel_cpu; // currently depends on brgemm which only support x64 #ifdef OPENVINO_ARCH_X86_64 -#if defined(HAVE_AVX2) || defined(HAVE_AVX512F) +# if defined(HAVE_AVX2) || defined(HAVE_AVX512F) -#define prefetch_bytes(bytes, sel, advance, src) { \ - auto *p = reinterpret_cast(src); \ - for (size_t i = 0; i < bytes; i += 64) \ - _mm_prefetch(p + i + advance, sel); \ -} +# define prefetch_bytes(bytes, sel, advance, src) \ + { \ + auto* p = reinterpret_cast(src); \ + for (size_t i = 0; i < bytes; i += 64) \ + _mm_prefetch(p + i + advance, sel); \ + } -#else +# else -#define prefetch_bytes(bytes, sel, advance, src) +# define prefetch_bytes(bytes, sel, advance, src) -#endif +# endif -template +template void cvt_copy(TA* dst, TB* src, size_t n) { size_t i = 0; -#if defined(HAVE_AVX512F) +# if defined(HAVE_AVX512F) for (; i + vec_len_f32_avx512 <= n; i += vec_len_f32_avx512) { auto vb = mm512_uni_loadu_ps(src + i); mm512_uni_storeu_ps(dst + i, vb); } -#elif defined(HAVE_AVX2) +# elif defined(HAVE_AVX2) for (; i + vec_len_f32_avx2 <= n; i += vec_len_f32_avx2) { auto vb = mm256_uni_loadu_ps(src + i); mm256_uni_storeu_ps(dst + i, vb); } -#endif +# endif for (; i < n; i++) { dst[i] = src[i]; } } -template +template static void attn_acc_value_block(float* out, float* weight, T* v, size_t S, size_t block_size) { -#if defined(HAVE_AVX512F) +# if defined(HAVE_AVX512F) size_t j = 0; for (; j + 4 <= block_size; j += 4) { auto attn_w_vec0 = _mm512_set1_ps(weight[0]); @@ -132,7 +133,7 @@ static void attn_acc_value_block(float* out, float* weight, T* v, size_t S, size } } return; -#elif defined(HAVE_AVX2) +# elif defined(HAVE_AVX2) size_t j = 0; for (; j + 4 <= block_size; j += 4) { auto attn_w_vec0 = _mm256_set1_ps(weight[0]); @@ -191,7 +192,7 @@ static void attn_acc_value_block(float* out, float* weight, T* v, size_t S, size } } return; -#endif +# endif for (size_t j = 0; j < block_size; j++) { for (size_t i = 0; i < S; i++) { out[i] += weight[j] * v[i]; @@ -202,9 +203,9 @@ static void attn_acc_value_block(float* out, float* weight, T* v, size_t S, size static void attn_acc_value_block(float* out, float* weight, uint8_t* v, size_t S, size_t block_size) { // The layout for per token per head: - // |scale(f32)|zeropoint(f32)|quantized feature(u8,idx_1)|quantized feature(u8,idx_2)|...|quantized feature(u8,idx_S)| - // The quantized feature will start from 8bytes=sizeof(float)+sizeof(float) -#if defined(HAVE_AVX512F) + // |scale(f32)|zeropoint(f32)|quantized feature(u8,idx_1)|quantized feature(u8,idx_2)|...|quantized + // feature(u8,idx_S)| The quantized feature will start from 8bytes=sizeof(float)+sizeof(float) +# if defined(HAVE_AVX512F) size_t j = 0; for (; j + 4 <= block_size; j += 4) { auto v_f0 = reinterpret_cast(v); @@ -223,10 +224,18 @@ static void attn_acc_value_block(float* out, float* weight, uint8_t* v, size_t S v += 8; for (; i + vec_len_f32_avx512 <= S; i += vec_len_f32_avx512) { auto v_out = mm512_uni_loadu_ps(out + i); - auto v0 = _mm512_sub_ps(_mm512_cvtepi32_ps(_mm512_cvtepu8_epi32(_mm_loadu_si128(reinterpret_cast<__m128i*>(v + i)))), zp0); - auto v1 = _mm512_sub_ps(_mm512_cvtepi32_ps(_mm512_cvtepu8_epi32(_mm_loadu_si128(reinterpret_cast<__m128i*>(v + i + S + 8)))), zp1); - auto v2 = 
_mm512_sub_ps(_mm512_cvtepi32_ps(_mm512_cvtepu8_epi32(_mm_loadu_si128(reinterpret_cast<__m128i*>(v + i + 2 * (S + 8))))), zp2); - auto v3 = _mm512_sub_ps(_mm512_cvtepi32_ps(_mm512_cvtepu8_epi32(_mm_loadu_si128(reinterpret_cast<__m128i*>(v + i + 3 * (S + 8))))), zp3); + auto v0 = _mm512_sub_ps( + _mm512_cvtepi32_ps(_mm512_cvtepu8_epi32(_mm_loadu_si128(reinterpret_cast<__m128i*>(v + i)))), + zp0); + auto v1 = _mm512_sub_ps( + _mm512_cvtepi32_ps(_mm512_cvtepu8_epi32(_mm_loadu_si128(reinterpret_cast<__m128i*>(v + i + S + 8)))), + zp1); + auto v2 = _mm512_sub_ps(_mm512_cvtepi32_ps(_mm512_cvtepu8_epi32( + _mm_loadu_si128(reinterpret_cast<__m128i*>(v + i + 2 * (S + 8))))), + zp2); + auto v3 = _mm512_sub_ps(_mm512_cvtepi32_ps(_mm512_cvtepu8_epi32( + _mm_loadu_si128(reinterpret_cast<__m128i*>(v + i + 3 * (S + 8))))), + zp3); v_out = _mm512_fmadd_ps(attn_w_vec0, v0, v_out); v_out = _mm512_fmadd_ps(attn_w_vec1, v1, v_out); v_out = _mm512_fmadd_ps(attn_w_vec2, v2, v_out); @@ -251,7 +260,9 @@ static void attn_acc_value_block(float* out, float* weight, uint8_t* v, size_t S v += 8; for (; i + vec_len_f32_avx512 <= S; i += vec_len_f32_avx512) { auto v_out = mm512_uni_loadu_ps(out + i); - auto v0 = _mm512_sub_ps(_mm512_cvtepi32_ps(_mm512_cvtepu8_epi32(_mm_loadu_si128(reinterpret_cast<__m128i*>(v + i)))), zp0); + auto v0 = _mm512_sub_ps( + _mm512_cvtepi32_ps(_mm512_cvtepu8_epi32(_mm_loadu_si128(reinterpret_cast<__m128i*>(v + i)))), + zp0); v_out = _mm512_fmadd_ps(attn_w_vec0, v0, v_out); _mm512_storeu_ps(out + i, v_out); @@ -263,7 +274,7 @@ static void attn_acc_value_block(float* out, float* weight, uint8_t* v, size_t S weight++; } return; -#elif defined(HAVE_AVX2) +# elif defined(HAVE_AVX2) size_t j = 0; for (; j < block_size; j++) { auto v_f0 = reinterpret_cast(v); @@ -273,7 +284,9 @@ static void attn_acc_value_block(float* out, float* weight, uint8_t* v, size_t S v += 8; for (; i + vec_len_f32_avx2 <= S; i += vec_len_f32_avx2) { auto v_out = mm256_uni_loadu_ps(out + i); - auto v0 = _mm256_sub_ps(_mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(_mm_loadl_epi64(reinterpret_cast<__m128i*>(v + i)))), zp0); + auto v0 = _mm256_sub_ps( + _mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(_mm_loadl_epi64(reinterpret_cast<__m128i*>(v + i)))), + zp0); v_out = _mm256_fmadd_ps(attn_w_vec0, v0, v_out); mm256_uni_storeu_ps(out + i, v_out); @@ -285,7 +298,7 @@ static void attn_acc_value_block(float* out, float* weight, uint8_t* v, size_t S weight++; } return; -#endif +# endif for (size_t j = 0; j < block_size; j++) { auto v0 = reinterpret_cast(v); v += 8; @@ -296,9 +309,9 @@ static void attn_acc_value_block(float* out, float* weight, uint8_t* v, size_t S } } -template +template static void dot_product_block(TA* a, TB* b, float* c, size_t n, size_t block_size) { -#if defined(HAVE_AVX512F) +# if defined(HAVE_AVX512F) size_t j = 0; for (; j + 4 <= block_size; j += 4) { auto vsum0 = _mm512_setzero_ps(); @@ -328,7 +341,7 @@ static void dot_product_block(TA* a, TB* b, float* c, size_t n, size_t block_siz c[2] = sum2; c[3] = sum3; c += 4; - b += 4 * n; + b += 4 * n; } for (; j < block_size; j++) { auto vsum = _mm512_setzero_ps(); @@ -345,7 +358,7 @@ static void dot_product_block(TA* a, TB* b, float* c, size_t n, size_t block_siz *c++ = sum; } return; -#elif defined(HAVE_AVX2) +# elif defined(HAVE_AVX2) size_t j = 0; for (; j + 4 <= block_size; j += 4) { auto vsum0 = _mm256_set1_ps(0.0f); @@ -379,7 +392,7 @@ static void dot_product_block(TA* a, TB* b, float* c, size_t n, size_t block_siz c[2] = sum2; c[3] = sum3; c += 4; - b += 4 * n; + b += 4 * 
n; } for (; j < block_size; j++) { auto vsum = _mm256_set1_ps(0.0f); @@ -397,7 +410,7 @@ static void dot_product_block(TA* a, TB* b, float* c, size_t n, size_t block_siz *c++ = sum; } return; -#endif +# endif for (size_t j = 0; j < block_size; j++) { float sum = 0; for (size_t i = 0; i < n; i++) { @@ -408,12 +421,12 @@ static void dot_product_block(TA* a, TB* b, float* c, size_t n, size_t block_siz } } -template +template static void dot_product_block(TA* a, uint8_t* b, float* c, size_t n, size_t block_size) { // The layout for per token per head: - // |scale(f32)|zeropoint(f32)|quantized feature(u8,idx_1)|quantized feature(u8,idx_2)|...|quantized feature(u8,idx_S)| - // The quantized feature will start from 8bytes=sizeof(float)+sizeof(float) -#if defined(HAVE_AVX512F) + // |scale(f32)|zeropoint(f32)|quantized feature(u8,idx_1)|quantized feature(u8,idx_2)|...|quantized + // feature(u8,idx_S)| The quantized feature will start from 8bytes=sizeof(float)+sizeof(float) +# if defined(HAVE_AVX512F) size_t j = 0; for (; j + 4 <= block_size; j += 4) { auto vsum0 = _mm512_setzero_ps(); @@ -432,10 +445,18 @@ static void dot_product_block(TA* a, uint8_t* b, float* c, size_t n, size_t bloc b += 8; for (; i + vec_len_f32_avx512 <= n; i += vec_len_f32_avx512) { auto va = mm512_uni_loadu_ps(a + i); - auto vb0 = _mm512_sub_ps(_mm512_cvtepi32_ps(_mm512_cvtepu8_epi32(_mm_loadu_si128(reinterpret_cast<__m128i*>(b + i)))), v_zp0); - auto vb1 = _mm512_sub_ps(_mm512_cvtepi32_ps(_mm512_cvtepu8_epi32(_mm_loadu_si128(reinterpret_cast<__m128i*>(b + i + n + 8)))), v_zp1); - auto vb2 = _mm512_sub_ps(_mm512_cvtepi32_ps(_mm512_cvtepu8_epi32(_mm_loadu_si128(reinterpret_cast<__m128i*>(b + i + 2 * (n + 8))))), v_zp2); - auto vb3 = _mm512_sub_ps(_mm512_cvtepi32_ps(_mm512_cvtepu8_epi32(_mm_loadu_si128(reinterpret_cast<__m128i*>(b + i + 3 * (n + 8))))), v_zp3); + auto vb0 = _mm512_sub_ps( + _mm512_cvtepi32_ps(_mm512_cvtepu8_epi32(_mm_loadu_si128(reinterpret_cast<__m128i*>(b + i)))), + v_zp0); + auto vb1 = _mm512_sub_ps( + _mm512_cvtepi32_ps(_mm512_cvtepu8_epi32(_mm_loadu_si128(reinterpret_cast<__m128i*>(b + i + n + 8)))), + v_zp1); + auto vb2 = _mm512_sub_ps(_mm512_cvtepi32_ps(_mm512_cvtepu8_epi32( + _mm_loadu_si128(reinterpret_cast<__m128i*>(b + i + 2 * (n + 8))))), + v_zp2); + auto vb3 = _mm512_sub_ps(_mm512_cvtepi32_ps(_mm512_cvtepu8_epi32( + _mm_loadu_si128(reinterpret_cast<__m128i*>(b + i + 3 * (n + 8))))), + v_zp3); vsum0 = _mm512_fmadd_ps(va, vb0, vsum0); vsum1 = _mm512_fmadd_ps(va, vb1, vsum1); @@ -457,7 +478,7 @@ static void dot_product_block(TA* a, uint8_t* b, float* c, size_t n, size_t bloc c[2] = sum2 * b2[0]; c[3] = sum3 * b3[0]; c += 4; - b += 4 * (n + 8) - 8; + b += 4 * (n + 8) - 8; } for (; j < block_size; j++) { auto vsum = _mm512_setzero_ps(); @@ -467,7 +488,9 @@ static void dot_product_block(TA* a, uint8_t* b, float* c, size_t n, size_t bloc b += 8; for (; i + vec_len_f32_avx512 <= n; i += vec_len_f32_avx512) { auto va = mm512_uni_loadu_ps(a + i); - auto vb = _mm512_sub_ps(_mm512_cvtepi32_ps(_mm512_cvtepu8_epi32(_mm_loadu_si128(reinterpret_cast<__m128i*>(b + i)))), v_zp); + auto vb = _mm512_sub_ps( + _mm512_cvtepi32_ps(_mm512_cvtepu8_epi32(_mm_loadu_si128(reinterpret_cast<__m128i*>(b + i)))), + v_zp); vsum = _mm512_fmadd_ps(va, vb, vsum); } float sum = _mm512_reduce_add_ps(vsum); @@ -478,7 +501,7 @@ static void dot_product_block(TA* a, uint8_t* b, float* c, size_t n, size_t bloc *c++ = sum * b0[0]; } return; -#elif defined(HAVE_AVX2) +# elif defined(HAVE_AVX2) size_t j = 0; for (; j + 4 <= block_size; j += 
4) { auto vsum0 = _mm256_setzero_ps(); @@ -497,10 +520,18 @@ static void dot_product_block(TA* a, uint8_t* b, float* c, size_t n, size_t bloc b += 8; for (; i + vec_len_f32_avx2 <= n; i += vec_len_f32_avx2) { auto va = mm256_uni_loadu_ps(a + i); - auto vb0 = _mm256_sub_ps(_mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(_mm_loadl_epi64(reinterpret_cast<__m128i*>(b + i)))), v_zp0); - auto vb1 = _mm256_sub_ps(_mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(_mm_loadl_epi64(reinterpret_cast<__m128i*>(b + i + n + 8)))), v_zp1); - auto vb2 = _mm256_sub_ps(_mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(_mm_loadl_epi64(reinterpret_cast<__m128i*>(b + i + 2 * (n + 8))))), v_zp2); - auto vb3 = _mm256_sub_ps(_mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(_mm_loadl_epi64(reinterpret_cast<__m128i*>(b + i + 3 * (n + 8))))), v_zp3); + auto vb0 = _mm256_sub_ps( + _mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(_mm_loadl_epi64(reinterpret_cast<__m128i*>(b + i)))), + v_zp0); + auto vb1 = _mm256_sub_ps( + _mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(_mm_loadl_epi64(reinterpret_cast<__m128i*>(b + i + n + 8)))), + v_zp1); + auto vb2 = _mm256_sub_ps(_mm256_cvtepi32_ps(_mm256_cvtepu8_epi32( + _mm_loadl_epi64(reinterpret_cast<__m128i*>(b + i + 2 * (n + 8))))), + v_zp2); + auto vb3 = _mm256_sub_ps(_mm256_cvtepi32_ps(_mm256_cvtepu8_epi32( + _mm_loadl_epi64(reinterpret_cast<__m128i*>(b + i + 3 * (n + 8))))), + v_zp3); vsum0 = _mm256_fmadd_ps(va, vb0, vsum0); vsum1 = _mm256_fmadd_ps(va, vb1, vsum1); @@ -526,7 +557,7 @@ static void dot_product_block(TA* a, uint8_t* b, float* c, size_t n, size_t bloc c[2] = sum2 * b2[0]; c[3] = sum3 * b3[0]; c += 4; - b += 4 * (n + 8) - 8; + b += 4 * (n + 8) - 8; } for (; j < block_size; j++) { auto vsum = _mm256_setzero_ps(); @@ -536,7 +567,9 @@ static void dot_product_block(TA* a, uint8_t* b, float* c, size_t n, size_t bloc b += 8; for (; i + vec_len_f32_avx2 <= n; i += vec_len_f32_avx2) { auto va = mm256_uni_loadu_ps(a + i); - auto vb = _mm256_sub_ps(_mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(_mm_loadl_epi64(reinterpret_cast<__m128i*>(b + i)))), v_zp); + auto vb = _mm256_sub_ps( + _mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(_mm_loadl_epi64(reinterpret_cast<__m128i*>(b + i)))), + v_zp); vsum = _mm256_fmadd_ps(va, vb, vsum); } hsum(vsum); @@ -548,7 +581,7 @@ static void dot_product_block(TA* a, uint8_t* b, float* c, size_t n, size_t bloc *c++ = sum * b0[0]; } return; -#endif +# endif for (size_t j = 0; j < block_size; j++) { float sum = 0; auto b0 = reinterpret_cast(b); @@ -561,11 +594,11 @@ static void dot_product_block(TA* a, uint8_t* b, float* c, size_t n, size_t bloc } } -template +template static void attn_reduce(T* dst, float* temp, size_t M, size_t S, size_t temp_stride) { size_t i = 0; -#if defined(HAVE_AVX512F) - for (; i + vec_len_f32_avx512 <= S; i+= vec_len_f32_avx512) { +# if defined(HAVE_AVX512F) + for (; i + vec_len_f32_avx512 <= S; i += vec_len_f32_avx512) { auto* src = temp + i; auto result_vec_fp32 = _mm512_setzero_ps(); for (size_t m = 0; m < M; m++) { @@ -576,7 +609,7 @@ static void attn_reduce(T* dst, float* temp, size_t M, size_t S, size_t temp_str // save to bf16 mm512_uni_storeu_ps(dst + i, result_vec_fp32); } -#elif defined(HAVE_AVX2) +# elif defined(HAVE_AVX2) for (; i + vec_len_f32_avx2 <= S; i += vec_len_f32_avx2) { auto* src = temp + i; auto result_vec_fp32 = _mm256_set1_ps(0.0f); @@ -587,7 +620,7 @@ static void attn_reduce(T* dst, float* temp, size_t M, size_t S, size_t temp_str } mm256_uni_storeu_ps(dst + i, result_vec_fp32); } -#endif +# endif for (; i < S; i++) { auto* src = temp + i; float sum = 0.0f; 
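The dot-product and accumulation kernels above all read the same per-token layout for the u8-quantized KV cache: each row stores a float scale and a float zero point followed by the S (or n) quantized bytes, so the payload starts at an 8-byte offset and consecutive rows sit S + 2 * sizeof(float) bytes apart, which is why the pointers advance by S + 8 / n + 8. A minimal scalar sketch of reading one such row, assuming only the layout and the (q - zp) * scale dequantization visible in these kernels (the function name and buffers are illustrative):

#include <cstddef>
#include <cstdint>
#include <cstring>

// Reference-only dequantization of one per-token row laid out as
// |scale(f32)|zeropoint(f32)|u8 x S|; the vector kernels above do the
// same math with AVX2/AVX-512 (or via attn_dequant_u8_kernel).
inline void dequant_u8_row_ref(const uint8_t* row, float* dst, size_t S) {
    float scale = 0.f, zp = 0.f;
    std::memcpy(&scale, row, sizeof(float));               // bytes [0, 4)
    std::memcpy(&zp, row + sizeof(float), sizeof(float));  // bytes [4, 8)
    const uint8_t* q = row + 2 * sizeof(float);            // payload starts at 8 bytes
    for (size_t i = 0; i < S; i++) {
        dst[i] = (static_cast<float>(q[i]) - zp) * scale;  // (q - zp) * scale
    }
}

In dot_product_block the per-row scale is applied after the reduction (sum * b0[0]), while the u8 accumulation paths fold it into the attention weight up front (weight *= *scale); both follow the same row layout.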
@@ -601,7 +634,7 @@ static void attn_reduce(T* dst, float* temp, size_t M, size_t S, size_t temp_str } // N must be multiple of 16 -template +template void transpose_16NxK(TDST* dst, TSRC* src, TDST* tmp, size_t N, size_t K, size_t dst_stride, size_t src_stride) { size_t k = 0; for (; k + 16 <= K; k += 16) { @@ -619,24 +652,26 @@ void transpose_16NxK(TDST* dst, TSRC* src, TDST* tmp, size_t N, size_t K, size_t } } -#if defined(HAVE_AVX512F) -template::value || std::is_same::value), bool>::type> +# if defined(HAVE_AVX512F) +template ::value || std::is_same::value), bool>::type> static void transpose_16NxK(T* dst, T* src, T* tmp, size_t N, size_t K, size_t dst_stride, size_t src_stride) { // will treat as uint32_t transpose auto s = reinterpret_cast(src); auto d = reinterpret_cast(dst); transpose_16NxK(d, s, reinterpret_cast(0), N, K >> 1, dst_stride, src_stride >> 1); } -#endif +# endif -template +template void transpose_16NxK(TDST* dst, uint8_t* src, TDST* tmp, size_t N, size_t K, size_t dst_stride, size_t src_stride) { // The layout for per token per head: - // |scale(f32)|zeropoint(f32)|quantized feature(u8,idx_1)|quantized feature(u8,idx_2)|...|quantized feature(u8,idx_S)| - // The quantized feature will start from 8bytes=sizeof(float)+sizeof(float) + // |scale(f32)|zeropoint(f32)|quantized feature(u8,idx_1)|quantized feature(u8,idx_2)|...|quantized + // feature(u8,idx_S)| The quantized feature will start from 8bytes=sizeof(float)+sizeof(float) auto s = src; auto t = tmp; - for (size_t n = 0; n < N; n ++) { + for (size_t n = 0; n < N; n++) { auto f = reinterpret_cast(s); attn_dequant_u8_kernel(s + 2 * sizeof(float), t, K, f[0], f[1]); s += src_stride + 2 * sizeof(float); @@ -646,7 +681,7 @@ void transpose_16NxK(TDST* dst, uint8_t* src, TDST* tmp, size_t N, size_t K, siz } // dequant f16/u8 to float -template +template static inline void dequant(T* dst, T* src, size_t N, size_t K) { // never called OPENVINO_THROW("dequant: should not be called."); @@ -656,13 +691,13 @@ static inline void dequant(float* dst, ov::float16* src, size_t N, size_t K) { cvt_copy(dst, src, K * N); } -template +template void dequant(TDST* dst, uint8_t* src, size_t N, size_t K) { // The layout for per token per head: - // |scale(f32)|zeropoint(f32)|quantized feature(u8,idx_1)|quantized feature(u8,idx_2)|...|quantized feature(u8,idx_S)| - // The quantized feature will start from 8bytes=sizeof(float)+sizeof(float) + // |scale(f32)|zeropoint(f32)|quantized feature(u8,idx_1)|quantized feature(u8,idx_2)|...|quantized + // feature(u8,idx_S)| The quantized feature will start from 8bytes=sizeof(float)+sizeof(float) auto s = src; - for (size_t n = 0; n < N; n ++) { + for (size_t n = 0; n < N; n++) { auto f = reinterpret_cast(s); attn_dequant_u8_kernel(s + 2 * sizeof(float), dst, K, f[0], f[1]); s += K + 2 * sizeof(float); @@ -670,18 +705,24 @@ void dequant(TDST* dst, uint8_t* src, size_t N, size_t K) { } } -#if defined(HAVE_AVX512F) -template::value || std::is_same::value), bool>::type> +# if defined(HAVE_AVX512F) +template ::value || std::is_same::value), bool>::type> static void pack_32x32_kernel(T* dst, T* src, size_t dst_stride, size_t src_stride) { static const uint64_t idx[8] = {0, 4, 1, 5, 2, 6, 3, 7}; auto midx = _mm512_loadu_si512(idx); for (size_t i = 0; i < 16; i++) { - auto a = _mm512_loadu_si512(src); // [a1 a2 a3 a4 | a5 a6 a7 a8] total 512-bits in 8 64bits unit + auto a = _mm512_loadu_si512(src); // [a1 a2 a3 a4 | a5 a6 a7 a8] total 512-bits in 8 64bits unit auto b = _mm512_loadu_si512(src + src_stride); // 
[b1 b2 b3 b4 | b5 b6 b7 b8] total 512-bits a = _mm512_permutexvar_epi64(midx, a); // [a1 a5 | a2 a6 | a3 a7 | a4 a8] b = _mm512_permutexvar_epi64(midx, b); // [b1 b5 | b2 b6 | b3 b7 | b4 b8] - auto B0 = _mm512_unpacklo_epi16(a, b); // [ a1&b1 a2&b2 a3&b3 a4&b4] for each 128-bits lane, interleave word in low 64 bits - auto B1 = _mm512_unpackhi_epi16(a, b); // [ a5&b5 a6&b6 a7&b7 a8&b8] for each 128-bits lane, interleave word in high 64 bits + auto B0 = _mm512_unpacklo_epi16( + a, + b); // [ a1&b1 a2&b2 a3&b3 a4&b4] for each 128-bits lane, interleave word in low 64 bits + auto B1 = _mm512_unpackhi_epi16( + a, + b); // [ a5&b5 a6&b6 a7&b7 a8&b8] for each 128-bits lane, interleave word in high 64 bits _mm512_storeu_si512(dst, B0); _mm512_storeu_si512(dst + 32, B1); src += 2 * src_stride; @@ -689,17 +730,20 @@ static void pack_32x32_kernel(T* dst, T* src, size_t dst_stride, size_t src_stri } } -template::value || std::is_same::value), bool>::type> +template ::value || std::is_same::value), bool>::type> static void pack_32x16_kernel(T* dst, T* src, size_t dst_stride, size_t src_stride) { static const uint64_t idx[8] = {0, 4, 1, 5, 2, 6, 3, 7}; auto midx = _mm512_loadu_si512(idx); for (size_t i = 0; i < 16; i++) { - auto x = _mm256_loadu_si256(reinterpret_cast<__m256i*>(src)); // [a1 a2 a3 a4] total 256-bits in 4 64bits unit + auto x = + _mm256_loadu_si256(reinterpret_cast<__m256i*>(src)); // [a1 a2 a3 a4] total 256-bits in 4 64bits unit auto y = _mm256_loadu_si256(reinterpret_cast<__m256i*>(src + src_stride)); // [b1 b2 b3 b4] total 256-bits auto a = _mm512_castsi256_si512(x); auto b = _mm512_castsi256_si512(y); - a = _mm512_permutexvar_epi64(midx, a); // [a1 x | a2 x | a3 x | a4 x] - b = _mm512_permutexvar_epi64(midx, b); // [b1 x | b2 x | b3 x | b4 x] + a = _mm512_permutexvar_epi64(midx, a); // [a1 x | a2 x | a3 x | a4 x] + b = _mm512_permutexvar_epi64(midx, b); // [b1 x | b2 x | b3 x | b4 x] auto B0 = _mm512_unpacklo_epi16(a, b); _mm512_storeu_si512(dst, B0); src += 2 * src_stride; @@ -707,18 +751,20 @@ static void pack_32x16_kernel(T* dst, T* src, size_t dst_stride, size_t src_stri } } -template::value || std::is_same::value), bool>::type> +template ::value || std::is_same::value), bool>::type> static void pack_32xK_kernel(T* dst, T* src, size_t dst_stride, size_t src_stride, size_t K) { static const uint64_t idx[8] = {0, 4, 1, 5, 2, 6, 3, 7}; auto midx = _mm512_loadu_si512(idx); __mmask16 mask = (1 << K) - 1; for (size_t i = 0; i < K; i++) { - auto x = _mm256_maskz_loadu_epi16(mask, src); // [a1 a2 a3 a4] total 256-bits in 4 64bits unit - auto y = _mm256_maskz_loadu_epi16(mask, src + src_stride); // [b1 b2 b3 b4] total 256-bits + auto x = _mm256_maskz_loadu_epi16(mask, src); // [a1 a2 a3 a4] total 256-bits in 4 64bits unit + auto y = _mm256_maskz_loadu_epi16(mask, src + src_stride); // [b1 b2 b3 b4] total 256-bits auto a = _mm512_castsi256_si512(x); auto b = _mm512_castsi256_si512(y); - a = _mm512_permutexvar_epi64(midx, a); // [a1 x | a2 x | a3 x | a4 x] - b = _mm512_permutexvar_epi64(midx, b); // [b1 x | b2 x | b3 x | b4 x] + a = _mm512_permutexvar_epi64(midx, a); // [a1 x | a2 x | a3 x | a4 x] + b = _mm512_permutexvar_epi64(midx, b); // [b1 x | b2 x | b3 x | b4 x] auto B0 = _mm512_unpacklo_epi16(a, b); _mm512_mask_storeu_epi32(dst, mask, B0); src += 2 * src_stride; @@ -726,7 +772,9 @@ static void pack_32xK_kernel(T* dst, T* src, size_t dst_stride, size_t src_strid } } -template::value || std::is_same::value), bool>::type> +template ::value || std::is_same::value), bool>::type> static 
void pack_32NxK(T* dst, T* src, T* tmp, size_t N, size_t K, size_t dst_stride, size_t src_stride) { for (size_t n = 0; n < N; n += 32) { size_t k = 0; @@ -746,14 +794,16 @@ static void pack_32NxK(T* dst, T* src, T* tmp, size_t N, size_t K, size_t dst_st } } -template::value || std::is_same::value), bool>::type> +template ::value || std::is_same::value), bool>::type> static void pack_32NxK(T* dst, uint8_t* src, T* tmp, size_t N, size_t K, size_t dst_stride, size_t src_stride) { // The layout for per token per head: - // |scale(f32)|zeropoint(f32)|quantized feature(u8,idx_1)|quantized feature(u8,idx_2)|...|quantized feature(u8,idx_S)| - // The quantized feature will start from 8bytes=sizeof(float)+sizeof(float) + // |scale(f32)|zeropoint(f32)|quantized feature(u8,idx_1)|quantized feature(u8,idx_2)|...|quantized + // feature(u8,idx_S)| The quantized feature will start from 8bytes=sizeof(float)+sizeof(float) auto s = src; auto t = tmp; - for (size_t n = 0; n < N; n ++) { + for (size_t n = 0; n < N; n++) { auto f = reinterpret_cast(s); attn_dequant_u8_kernel(s + 2 * sizeof(float), t, K, f[0], f[1]); s += src_stride + 2 * sizeof(float); @@ -761,9 +811,9 @@ static void pack_32NxK(T* dst, uint8_t* src, T* tmp, size_t N, size_t K, size_t } pack_32NxK(dst, tmp, reinterpret_cast(0), N, K, dst_stride, src_stride); } -#endif +# endif -template +template static void pack_32NxK(float* dst, T* src, float* tmp, size_t N, size_t K, size_t dst_stride, size_t src_stride) { // never called OPENVINO_THROW("pack_32NxK: should not be called."); @@ -782,10 +832,10 @@ struct MHAHelper { size_t _sliding_window; float _d_scale; - PlainTensor _weight; // [nthr, H, 32, rnd_up(kv_len, block_size)], shared by first and second loop along bh - PlainTensor _output; // [nthr, 32, H, S], shared by first and second loop along bh - PlainTensor _qk_scratch_a; // [nthr, scratch_a_size] - PlainTensor _qk_scratch_b; // [B, rnd_up(kv_len, block_size), Hk, scratch_b_size] + PlainTensor _weight; // [nthr, H, 32, rnd_up(kv_len, block_size)], shared by first and second loop along bh + PlainTensor _output; // [nthr, 32, H, S], shared by first and second loop along bh + PlainTensor _qk_scratch_a; // [nthr, scratch_a_size] + PlainTensor _qk_scratch_b; // [B, rnd_up(kv_len, block_size), Hk, scratch_b_size] PlainTensor _wv_scratch_a; PlainTensor _wv_scratch_b; PlainTensor _alibi_lookup; @@ -810,8 +860,16 @@ struct MHAHelper { _weight.resize({size_t{1}, size_t{1}, size_t{1}, size_t{1}}); } - void init(size_t H, size_t S, size_t SV, size_t Hk, size_t h_each_group_len, size_t block_size, size_t sliding_window, - float d_scale, size_t kv_len, bool init_alibi_lookup) { + void init(size_t H, + size_t S, + size_t SV, + size_t Hk, + size_t h_each_group_len, + size_t block_size, + size_t sliding_window, + float d_scale, + size_t kv_len, + bool init_alibi_lookup) { // query shape: [B, H, L, S] // present_key shape: [block, H, 32, S] // Q*K': [M1, S] * [M2, S]' @@ -853,25 +911,27 @@ struct MHAHelper { _weight.stride(2), false, in_type); - _wv_gemm[i] = std::make_shared(i + 1, - _SV, - _block_size, - // if it's bf16, the stride needs double due to reuse float buffer - (in_type == ov::element::Type_t::f32 ? 1 : 2) * _weight.stride(2), - _SV, - _output.stride(1), - false, - in_type); - _wv_gemm_acc[i] = std::make_shared(i + 1, - _SV, - _block_size, - // if it's bf16, the stride needs double due to reuse float buffer - (in_type == ov::element::Type_t::f32 ? 
1 : 2) * _weight.stride(2), - _SV, - _output.stride(1), - false, - in_type, - true); + _wv_gemm[i] = + std::make_shared(i + 1, + _SV, + _block_size, + // if it's bf16, the stride needs double due to reuse float buffer + (in_type == ov::element::Type_t::f32 ? 1 : 2) * _weight.stride(2), + _SV, + _output.stride(1), + false, + in_type); + _wv_gemm_acc[i] = + std::make_shared(i + 1, + _SV, + _block_size, + // if it's bf16, the stride needs double due to reuse float buffer + (in_type == ov::element::Type_t::f32 ? 1 : 2) * _weight.stride(2), + _SV, + _output.stride(1), + false, + in_type, + true); } // wsp is used to compute beta when K is blocked @@ -879,8 +939,10 @@ struct MHAHelper { _wsp.resize(_nthr * _wsp_size_per_thread); // allocate scratch a/b, notice get_scratch_a_size/get_scratch_b_size returns in bytes - _qk_scratch_a.resize({_nthr, _qk_gemm[_block_size - 1]->get_scratch_a_size() / sizeof(DATA_TYPE)}); - _wv_scratch_a.resize({_nthr, _wv_gemm[_block_size - 1]->get_scratch_a_size() / sizeof(DATA_TYPE)}); + _qk_scratch_a.resize( + {_nthr, _qk_gemm[_block_size - 1]->get_scratch_a_size() / sizeof(DATA_TYPE)}); + _wv_scratch_a.resize( + {_nthr, _wv_gemm[_block_size - 1]->get_scratch_a_size() / sizeof(DATA_TYPE)}); if ((S % 32 == 0) && (block_size % 16 == 0) && (S <= 32 * 6)) { if (dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::amx_bf16) && @@ -894,14 +956,16 @@ struct MHAHelper { } } if (one_of(_fastpath_valid_prec, ov::element::bf16, ov::element::f16) && !_gemv) { - _gemv = std::make_shared(static_cast(S), static_cast(block_size), _fastpath_valid_prec); + _gemv = std::make_shared(static_cast(S), + static_cast(block_size), + _fastpath_valid_prec); } } if (init_alibi_lookup && (!_alibi_lookup || _alibi_lookup.m_dims[0] < kv_len)) { _alibi_lookup.resize({kv_len * 2}); for (size_t i = 0; i < _alibi_lookup.m_dims[0]; i++) - _alibi_lookup.ptr()[i] = - static_cast((_alibi_lookup.m_dims[0] - 1 - i)); + _alibi_lookup.ptr()[i] = -static_cast((_alibi_lookup.m_dims[0] - 1 - i)); } } @@ -937,16 +1001,28 @@ struct MHAHelper { // output_emb: [L, H * S] // qk_scratch_b: [rnd_up(kv_len, block_size), Hk, scratch_b_size] // wv_scratch_b: [rnd_up(kv_len, block_size), Hk, scratch_b_size] - void exec_kernel_multiple(const PlainTensor& query, const PlainTensor& present_value, const PlainTensor& output_emb, - const PlainTensor& qk_scratch_b, const PlainTensor& wv_scratch_b, const int32_t* block_table, size_t ithr, size_t q_blk, - size_t hk, size_t q_len, size_t cur_kv_len, const PlainTensor& alibi_slopes, float* score_output) { + void exec_kernel_multiple(const PlainTensor& query, + const PlainTensor& present_value, + const PlainTensor& output_emb, + const PlainTensor& qk_scratch_b, + const PlainTensor& wv_scratch_b, + const int32_t* block_table, + size_t ithr, + size_t q_blk, + size_t hq_beg, + size_t hq_end, + size_t hk, + size_t q_len, + size_t cur_kv_len, + const PlainTensor& alibi_slopes, + float* score_output) { auto q_start = q_blk * _block_size; auto q_end = std::min(q_start + _block_size, q_len); auto q_cnt = q_end - q_start; constexpr bool q_is_xf16 = one_of(precision_of::value, ov::element::bf16, ov::element::f16); constexpr bool q_cache_is_same = precision_of::value == precision_of::value; auto cur_kv_len_blocks = div_up(cur_kv_len, _block_size); - for (size_t h = hk * _h_each_group_len; h < (hk + 1) * _h_each_group_len; h++) { + for (size_t h = hq_beg; h < hq_end; h++) { auto* q_ptr = query.ptr(h, q_start, 0); float* c_ptr = _weight.ptr(ithr, h, 0, 0); // for each query block, loop through all 
key block @@ -1012,13 +1088,16 @@ struct MHAHelper { alibi_slope); } if (score_output) { - cvt_copy(score_output + h * rnd_up(cur_kv_len, 16), reinterpret_cast(score), cur_kv_len); + cvt_copy(score_output + h * rnd_up(cur_kv_len, 16), + reinterpret_cast(score), + cur_kv_len); } } // reuse float buffer, need to use float to compute offset auto* w_ptr = reinterpret_cast(_weight.ptr(ithr, h, 0, 0)); - float* fp32_out_ptr = q_is_xf16 ? _output.ptr(ithr, 0, h, 0) : output_emb.ptr(q_start, h * _SV); + float* fp32_out_ptr = + q_is_xf16 ? _output.ptr(ithr, 0, h, 0) : output_emb.ptr(q_start, h * _SV); // for each weight block, loop through all value block for (size_t v_blk = 0; v_blk < cur_kv_len_blocks; v_blk++) { @@ -1036,12 +1115,13 @@ struct MHAHelper { _wsp.data() + ithr * _wsp_size_per_thread, _wv_scratch_a ? _wv_scratch_a.ptr(ithr, 0) : nullptr); } else { - _wv_gemm_acc[q_cnt - 1]->executeGemm(q_cnt < _block_size, - w_ptr + v_blk * _block_size, - v_ptr, - fp32_out_ptr, - _wsp.data() + ithr * _wsp_size_per_thread, - _wv_scratch_a ? _wv_scratch_a.ptr(ithr, 0) : nullptr); + _wv_gemm_acc[q_cnt - 1]->executeGemm( + q_cnt < _block_size, + w_ptr + v_blk * _block_size, + v_ptr, + fp32_out_ptr, + _wsp.data() + ithr * _wsp_size_per_thread, + _wv_scratch_a ? _wv_scratch_a.ptr(ithr, 0) : nullptr); } } if (q_is_xf16) { @@ -1064,16 +1144,28 @@ struct MHAHelper { // output_emb: [L, H * S] // weight: [nthr, H, 32, rnd_up(kv_len, block_size)] // output: [nthr, 32, H, S] - void exec_kernel_one_bh(const PlainTensor& query, const PlainTensor& present_key, const PlainTensor& present_value, const PlainTensor& output_emb, - const int32_t* block_table, size_t ithr, size_t hk, size_t q_len, size_t cur_kv_len, const PlainTensor& alibi_slopes, float* score_output) { + void exec_kernel_one_bh(const PlainTensor& query, + const PlainTensor& present_key, + const PlainTensor& present_value, + const PlainTensor& output_emb, + const int32_t* block_table, + size_t ithr, + size_t hq_beg, + size_t hq_end, + size_t hk, + size_t q_len, + size_t cur_kv_len, + const PlainTensor& alibi_slopes, + float* score_output) { if (one_of(_fastpath_valid_prec, ov::element::bf16, ov::element::f16)) { _gemv->tile_config(); for (size_t pk = 0, i = 0; pk < cur_kv_len; pk += _block_size, i++) { auto block_number = block_table[i]; for (size_t pq = 0; pq < q_len; pq++) { - for (size_t h = hk * _h_each_group_len; h < (hk + 1) * _h_each_group_len; h++) { - (*_gemv)(query.ptr(h, pq), present_key.ptr(block_number, hk), - _weight.ptr(ithr, h, pq) + pk); + for (size_t h = hq_beg; h < hq_end; h++) { + (*_gemv)(query.ptr(h, pq), + present_key.ptr(block_number, hk), + _weight.ptr(ithr, h, pq) + pk); } } } @@ -1082,16 +1174,19 @@ struct MHAHelper { for (size_t pk = 0, i = 0; pk < cur_kv_len; pk += _block_size, i++) { auto block_number = block_table[i]; for (size_t pq = 0; pq < q_len; pq++) { - for (size_t h = hk * _h_each_group_len; h < (hk + 1) * _h_each_group_len; h++) { - dot_product_block(query.ptr(h, pq), present_key.ptr(block_number, hk), - _weight.ptr(ithr, h, pq) + pk, _S, std::min(_block_size, cur_kv_len - pk)); + for (size_t h = hq_beg; h < hq_end; h++) { + dot_product_block(query.ptr(h, pq), + present_key.ptr(block_number, hk), + _weight.ptr(ithr, h, pq) + pk, + _S, + std::min(_block_size, cur_kv_len - pk)); } } } } for (size_t pq = 0; pq < q_len; pq++) { - for (size_t h = hk * _h_each_group_len; h < (hk + 1) * _h_each_group_len; h++) { + for (size_t h = hq_beg; h < hq_end; h++) { // apply attention mask & sofmax float* alibi_lookup = nullptr; 
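Both exec_kernel_multiple and exec_kernel_one_bh now take an explicit query-head range [hq_beg, hq_end) instead of deriving it from the KV-head index hk, so callers can split work either per KV head (one work item covers the whole group of query heads sharing that KV head) or per individual query head. A small sketch of that mapping, assuming the usual grouped-query layout where each KV head serves _h_each_group_len query heads; it mirrors the get_h_params / loop_hk selection that appears further down in this patch rather than adding anything new:

#include <cstddef>

// Illustrative only: map a parallel loop index hx to the query-head range
// [hq_beg, hq_end) and the KV head hk that range reads from.
inline void head_range_for_workitem(bool loop_hk,            // true: iterate over KV heads
                                    size_t hx,               // parallel loop index
                                    size_t h_each_group_len, // query heads per KV head
                                    size_t& hq_beg,
                                    size_t& hq_end,
                                    size_t& hk) {
    if (loop_hk) {
        hk = hx;                                 // one work item per KV head...
        hq_beg = hk * h_each_group_len;          // ...covering its whole query group
        hq_end = (hk + 1) * h_each_group_len;
    } else {
        hq_beg = hx;                             // one work item per query head
        hq_end = hx + 1;
        hk = hx / h_each_group_len;              // KV head that query head belongs to
    }
}

Iterating over KV heads keeps one K/V block resident while every query head of the group consumes it, whereas iterating over query heads yields roughly _h_each_group_len times more work items; the loop_hk heuristics added below in exec_loop_bhl and exec_loop_mixed pick between the two based on how many work items per thread would result.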
float alibi_slope = 0.f; @@ -1112,7 +1207,9 @@ struct MHAHelper { ov::element::f32, alibi_slope); if (score_output) { - memcpy(score_output + h * rnd_up(cur_kv_len, 16), _weight.ptr(ithr, h, pq), cur_kv_len * sizeof(float)); + memcpy(score_output + h * rnd_up(cur_kv_len, 16), + _weight.ptr(ithr, h, pq), + cur_kv_len * sizeof(float)); } } } @@ -1122,7 +1219,7 @@ struct MHAHelper { auto block_number = block_table[i]; auto* v = present_value.ptr(block_number, hk); for (size_t pq = 0; pq < q_len; pq++) { - for (size_t h = hk * _h_each_group_len; h < (hk + 1) * _h_each_group_len; h++) { + for (size_t h = hq_beg; h < hq_end; h++) { attn_acc_value_block(_output.ptr(ithr, pq, h), _weight.ptr(ithr, h, pq) + pv, v, @@ -1133,13 +1230,13 @@ struct MHAHelper { } // convert to dst for (size_t pq = 0; pq < q_len; pq++) - for (size_t h = hk * _h_each_group_len; h < (hk + 1) * _h_each_group_len; h++) + for (size_t h = hq_beg; h < hq_end; h++) cvt_copy(output_emb.ptr(pq, h * _SV), _output.ptr(ithr, pq, h), _SV); } - // compute one token, loop along batch, head dimensions and kv_len, it's special for very long kv_len with small batch tokens. - // It will assume NO mixture execution of first and second token. - // all tensors such as query... have batch dimension which is DIFFERENT from above + // compute one token, loop along batch, head dimensions and kv_len, it's special for very long kv_len with small + // batch tokens. It will assume NO mixture execution of first and second token. all tensors such as query... have + // batch dimension which is DIFFERENT from above // query: [B, H, L, S] // present_*: [block_number, H, 32, S] // output_emb: [B, L, H * S] @@ -1162,8 +1259,39 @@ struct MHAHelper { // aligned to cache line (64bytes=16*sizeof(float)) to avoid false sharing _weight_bhl.resize({B, _H, q_len, rnd_up(max_context_len, std::max(_block_size, size_t{16}))}); - parallel_for3d_dynamic(B, kv_len_in_blocks, _Hk, [&](size_t b, size_t pk_in_blocks, size_t hk) { + // for small batches dynamic scheduler has notable overhead + bool prefer_static_loop; + // if less than 2 work items per thread, loop H + bool loop_hk = B * kv_len_in_blocks * _Hk <= 2 * _nthr ? 
false : true; + if (B <= 32) { + prefer_static_loop = true; + // small batch and all batch size is same(like SDPA case) + auto kv_len = past_lens.ptr()[0]; + for (size_t b = 1; b < B; b++) { + if (past_lens.ptr()[b] != kv_len) + prefer_static_loop = false; + } + } else { + // for bigger batch skip the test to save the cost + prefer_static_loop = false; + } + auto get_h_params = + [](bool loop_hk, size_t hx, size_t h_each_group_len, size_t& hq_beg, size_t& hq_end, size_t& hk) { + if (loop_hk) { + hk = hx; + hq_beg = hk * h_each_group_len; + hq_end = (hk + 1) * h_each_group_len; + } else { + hq_beg = hx; + hq_end = hx + 1; + hk = hx / h_each_group_len; + } + }; + auto loop_qk = [&](size_t b, size_t pk_in_blocks, size_t hx) { auto context_len = static_cast(past_lens.ptr()[b]) + 1; + size_t hk, hq_beg, hq_end; + get_h_params(loop_hk, hx, _h_each_group_len, hq_beg, hq_end, hk); + // kv_len must be valid auto pk = pk_in_blocks * _block_size; if (pk < context_len) { @@ -1171,24 +1299,28 @@ struct MHAHelper { if (one_of(_fastpath_valid_prec, ov::element::bf16, ov::element::f16)) { _gemv->tile_config(); for (size_t pq = 0; pq < q_len; pq++) { - for (size_t h = hk * _h_each_group_len; h < (hk + 1) * _h_each_group_len; h++) { - (*_gemv)(query.ptr(b, h, pq), present_key.ptr(block_number, hk), - _weight_bhl.ptr(b, h, pq) + pk); + for (size_t h = hq_beg; h < hq_end; h++) { + (*_gemv)(query.ptr(b, h, pq), + present_key.ptr(block_number, hk), + _weight_bhl.ptr(b, h, pq) + pk); } } _gemv->tile_release(); } else { for (size_t pq = 0; pq < q_len; pq++) { - for (size_t h = hk * _h_each_group_len; h < (hk + 1) * _h_each_group_len; h++) { - dot_product_block(query.ptr(b, h, pq), present_key.ptr(block_number, hk), - _weight_bhl.ptr(b, h, pq) + pk, _S, std::min(_block_size, context_len - pk)); + for (size_t h = hq_beg; h < hq_end; h++) { + dot_product_block(query.ptr(b, h, pq), + present_key.ptr(block_number, hk), + _weight_bhl.ptr(b, h, pq) + pk, + _S, + std::min(_block_size, context_len - pk)); } } } } - }); + }; - parallel_for3d_dynamic(B, _H, q_len, [&](size_t b, size_t h, size_t pq) { + auto loop_softmax = [&](size_t b, size_t h, size_t pq) { auto cur_kv_len = static_cast(past_lens.ptr()[b]) + 1; auto ncausal = cur_kv_len; // apply attention mask & sofmax @@ -1210,7 +1342,16 @@ struct MHAHelper { ov::element::f32, ov::element::f32, alibi_slope); - }); + }; + + size_t h_dims = loop_hk ? 
_Hk : _H; + if (prefer_static_loop) { + parallel_for3d(B, kv_len_in_blocks, h_dims, loop_qk); + parallel_for3d(B, _H, q_len, loop_softmax); + } else { + parallel_for3d_dynamic(B, kv_len_in_blocks, h_dims, loop_qk); + parallel_for3d_dynamic(B, _H, q_len, loop_softmax); + } if (output_score) { parallel_for2d_dynamic(B, q_len, [&](size_t b, size_t pq) { @@ -1229,16 +1370,19 @@ struct MHAHelper { memset(_output_bhl.ptr(ithr, 0, 0, 0, 0), 0, _output_bhl.stride(0) * sizeof(float)); }); - parallel_for3d_dynamic(B, kv_len_in_blocks, _Hk, [&](size_t b, size_t pv_in_blocks, size_t hk) { + auto loop_wk = [&](size_t b, size_t pv_in_blocks, size_t hx) { auto ithr = parallel_get_thread_num(); auto context_len = static_cast(past_lens.ptr()[b]) + 1; auto pv = pv_in_blocks * _block_size; + size_t hk, hq_beg, hq_end; + get_h_params(loop_hk, hx, _h_each_group_len, hq_beg, hq_end, hk); + // kv_len must be valid if (pv < context_len) { auto block_number = block_indices.ptr()[block_indices_begins.ptr()[b] + pv_in_blocks]; auto* v = present_value.ptr(block_number, hk); for (size_t pq = 0; pq < q_len; pq++) { - for (size_t h = hk * _h_each_group_len; h < (hk + 1) * _h_each_group_len; h++) { + for (size_t h = hq_beg; h < hq_end; h++) { attn_acc_value_block(_output_bhl.ptr(ithr, b, pq, h), _weight_bhl.ptr(b, h, pq) + pv, v, @@ -1247,7 +1391,13 @@ struct MHAHelper { } } } - }); + }; + + if (prefer_static_loop) { + parallel_for3d(B, kv_len_in_blocks, loop_hk ? _Hk : _H, loop_wk); + } else { + parallel_for3d_dynamic(B, kv_len_in_blocks, loop_hk ? _Hk : _H, loop_wk); + } parallel_for3d(B, _H, q_len, [&](size_t b, size_t h, size_t pq) { auto* temp = _output_bhl.ptr(0, b, pq, h); @@ -1262,26 +1412,29 @@ template struct MHA { MHAHelper& _helper; struct AttnWorkItem { - int32_t batch_in_reorder; // which batch in reorder buffer will be used - int32_t batch_in_seq; // batch idx in sequence - int32_t q_len; // current sequence length, 1 for second token, 2+ for first token - int32_t q_block_id; // block id in this seq, valid at first token + int32_t batch_in_reorder; // which batch in reorder buffer will be used + int32_t batch_in_seq; // batch idx in sequence + int32_t q_len; // current sequence length, 1 for second token, 2+ for first token + int32_t q_block_id; // block id in this seq, valid at first token }; struct ReorderWorkItem { - int32_t batch_in_seq; // batch idx in sequence - int32_t batch_in_reorder; // which batch in reorder buffer will be used - int32_t kv_block_id; // block id in this kv cache seq + int32_t batch_in_seq; // batch idx in sequence + int32_t batch_in_reorder; // which batch in reorder buffer will be used + int32_t kv_block_id; // block id in this kv cache seq }; struct WorkItems { private: std::vector attn_items; std::vector reorder_items; - int32_t max_kv_len_in_reorder; // max kv len between first tokens + int32_t max_kv_len_in_reorder; // max kv len between first tokens int32_t max_batch_in_reorder; int32_t total_kv_len; public: - void reset(const PlainTensor& query, const PlainTensor& past_lens, const PlainTensor& subsequence_begins, size_t block_size) { + void reset(const PlainTensor& query, + const PlainTensor& past_lens, + const PlainTensor& subsequence_begins, + size_t block_size) { attn_items.clear(); reorder_items.clear(); max_kv_len_in_reorder = 0; @@ -1294,21 +1447,19 @@ struct MHA { auto kv_len = past_lens.ptr()[i] + q_len; auto kv_len_in_block = static_cast(div_up(kv_len, block_size)); if (q_len == 1) { - attn_items.emplace_back(AttnWorkItem{ - 0, // batch_in_reorder - i, // 
batch_in_seq - 1ull, // q_len - // kv_len in blocks, used in the sort function - kv_len_in_block - 1 - }); + attn_items.emplace_back(AttnWorkItem{0, // batch_in_reorder + i, // batch_in_seq + 1ull, // q_len + // kv_len in blocks, used in the sort function + kv_len_in_block - 1}); } else { auto reorder_sub_work_count = kv_len_in_block; max_kv_len_in_reorder = std::max(max_kv_len_in_reorder, kv_len); for (int32_t block_id = 0; block_id < reorder_sub_work_count; block_id++) { reorder_items.emplace_back(ReorderWorkItem{ - i, // batch_in_seq - max_batch_in_reorder, // batch_in_reorder - block_id // kv_block_id + i, // batch_in_seq + max_batch_in_reorder, // batch_in_reorder + block_id // kv_block_id }); } @@ -1316,17 +1467,18 @@ struct MHA { auto attn_sub_work_count = static_cast(div_up(q_len, block_size)); for (int32_t block_id = 0; block_id < attn_sub_work_count; block_id++) { attn_items.emplace_back(AttnWorkItem{ - max_batch_in_reorder, // batch_in_reorder - i, // batch_in_seq - q_len, // q_len - block_id // q_block_id + max_batch_in_reorder, // batch_in_reorder + i, // batch_in_seq + q_len, // q_len + block_id // q_block_id }); } max_batch_in_reorder++; } total_kv_len += kv_len; } - // std::sort(attn_items.begin(), attn_items.end(), [] (const AttnWorkItem& left, const AttnWorkItem& right) { + // std::sort(attn_items.begin(), attn_items.end(), [] (const AttnWorkItem& left, const AttnWorkItem& right) + // { // // kv block number which will be acessed later // auto left_kv_blocks = left.q_block_id; // auto right_kv_blocks = right.q_block_id; @@ -1380,7 +1532,8 @@ struct MHA { auto reorder_work_count = _workitems.reorder_work_size(); // buffer for transpose and repack - _helper.init_reorder_buffers(_workitems.get_reorder_max_batch_size(), div_up(_workitems.get_reorder_max_kv_len(), _helper._block_size)); + _helper.init_reorder_buffers(_workitems.get_reorder_max_batch_size(), + div_up(_workitems.get_reorder_max_kv_len(), _helper._block_size)); // packed k, v parallel_for2d_dynamic(reorder_work_count, Hk, [&](size_t w, size_t hk) { @@ -1388,7 +1541,8 @@ struct MHA { const auto batch_in_seq = item.batch_in_seq; const auto batch_in_reorder = item.batch_in_reorder; const auto kv_block = item.kv_block_id; - auto block_number = block_indices.ptr()[block_indices_begins.ptr()[batch_in_seq] + kv_block]; + auto block_number = + block_indices.ptr()[block_indices_begins.ptr()[batch_in_seq] + kv_block]; if (block_number < 0) return; @@ -1396,10 +1550,12 @@ struct MHA { auto* k_ptr = k_cache.ptr(block_number, hk); auto* v_ptr = v_cache.ptr(block_number, hk); transpose_16NxK(_helper._qk_scratch_b.template ptr(batch_in_reorder, kv_block, hk), - k_ptr, - _helper._output.template ptr(ithr), - _helper._block_size, - _helper._S, _helper._block_size, _helper._S); + k_ptr, + _helper._output.template ptr(ithr), + _helper._block_size, + _helper._S, + _helper._block_size, + _helper._S); if (q_is_xf16) { pack_32NxK(_helper._wv_scratch_b.template ptr(batch_in_reorder, kv_block, hk), v_ptr, @@ -1411,12 +1567,34 @@ struct MHA { } else { // need to decompress if (!q_cache_is_same) { - dequant(_helper._wv_scratch_b.template ptr(batch_in_reorder, kv_block, hk), v_ptr, _helper._block_size, _helper._SV); + dequant(_helper._wv_scratch_b.template ptr(batch_in_reorder, kv_block, hk), + v_ptr, + _helper._block_size, + _helper._SV); } } }); - parallel_for2d_dynamic(attn_work_count, Hk, [&](size_t w, size_t hk) { + // loop along HK dimension: if mixed first/second token and elements count is enough, loop HK to reuse KV in the + // CPU 
cache + // else if elements count is small, prefer to loop H to get more work to avoid thread imbalance + bool loop_hk = _workitems.get_reorder_max_batch_size() == past_lens.m_dims[0] || // if only first token, loop H + attn_work_count * Hk <= 2 * _helper._nthr + ? false + : true; // or less than 2 work items per thread, loop H + + parallel_for2d_dynamic(attn_work_count, loop_hk ? Hk : _helper._H, [&](size_t w, size_t hx) { + size_t hk, hq_beg, hq_end; + if (loop_hk) { + hk = hx; + hq_beg = hk * _helper._h_each_group_len; + hq_end = (hk + 1) * _helper._h_each_group_len; + } else { + hq_beg = hx; + hq_end = hx + 1; + hk = hx / _helper._h_each_group_len; + } + const auto& item = _workitems.get_attn_work_item(w); const auto batch_in_seq = item.batch_in_seq; const auto batch_in_token = subsequence_begins.ptr()[batch_in_seq]; @@ -1431,16 +1609,26 @@ struct MHA { score_output = _helper._score_output.template ptr() + score_offset * _helper._H; } - _helper.exec_kernel_one_bh(q.slice(0, batch_in_token, batch_in_token), k_cache, v_cache, + _helper.exec_kernel_one_bh( + q.slice(0, batch_in_token, batch_in_token), + k_cache, + v_cache, output_emb.slice(0, batch_in_token, batch_in_token), block_indices.ptr() + block_indices_begins.ptr()[batch_in_seq], - ithr, hk, 1ul, cur_kv_len, alibi_slopes, + ithr, + hq_beg, + hq_end, + hk, + 1ul, + cur_kv_len, + alibi_slopes, score_output); } else { const auto batch_in_reorder = item.batch_in_reorder; const auto q_blk = item.q_block_id; const auto q_cnt = std::min(_helper._block_size, q_len - q_blk * _helper._block_size); - const auto cur_kv_len = static_cast(past_lens.ptr()[batch_in_seq]) + q_blk * _helper._block_size + q_cnt; + const auto cur_kv_len = + static_cast(past_lens.ptr()[batch_in_seq]) + q_blk * _helper._block_size + q_cnt; float* score_output = nullptr; if (output_score) { // last block @@ -1453,14 +1641,18 @@ struct MHA { PlainTensor sub_query; sub_query.resize({q_len, _helper._H, _helper._S}, q.ptr(batch_in_token)); sub_query = sub_query.permute({1, 0, 2}); - _helper.exec_kernel_multiple(sub_query, + _helper.exec_kernel_multiple( + sub_query, v_cache, - output_emb.slice(0, batch_in_token, batch_in_token + q_len).reshape({q_len, _helper._H * _helper._SV}), + output_emb.slice(0, batch_in_token, batch_in_token + q_len) + .reshape({q_len, _helper._H * _helper._SV}), _helper._qk_scratch_b.slice(0, batch_in_reorder, batch_in_reorder), _helper._wv_scratch_b.slice(0, batch_in_reorder, batch_in_reorder), block_indices.ptr() + block_indices_begins.ptr()[batch_in_seq], ithr, q_blk, + hq_beg, + hq_end, hk, q_len, cur_kv_len, @@ -1470,7 +1662,8 @@ struct MHA { }); if (output_score) { parallel_for2d_dynamic(past_lens.m_dims[0], 1, [&](size_t b, size_t pq) { - auto seq_len = static_cast(subsequence_begins.ptr()[b + 1] - subsequence_begins.ptr()[b]); + auto seq_len = static_cast(subsequence_begins.ptr()[b + 1] - + subsequence_begins.ptr()[b]); auto cur_kv_len = static_cast(past_lens.ptr()[b]) + seq_len; auto src_offset = _helper._score_offsets_aligned.template ptr()[b]; auto* src = _helper._score_output.template ptr() + src_offset * _helper._H; @@ -1501,11 +1694,29 @@ struct MHA { auto nthr = static_cast(parallel_get_max_threads()); if (past_lens.m_dims[0] >= nthr || _workitems.get_reorder_max_batch_size() > 0) { - exec_loop_mixed(query, present_key, present_value, output_emb, output_score, max_context_len, past_lens, subsequence_begins, - block_indices, block_indices_begins, alibi_slopes); + exec_loop_mixed(query, + present_key, + present_value, + output_emb, + 
output_score, + max_context_len, + past_lens, + subsequence_begins, + block_indices, + block_indices_begins, + alibi_slopes); } else { - _helper.exec_loop_bhl(query, present_key, present_value, output_emb, output_score, max_context_len, past_lens, subsequence_begins, - block_indices, block_indices_begins, alibi_slopes); + _helper.exec_loop_bhl(query, + present_key, + present_value, + output_emb, + output_score, + max_context_len, + past_lens, + subsequence_begins, + block_indices, + block_indices_begins, + alibi_slopes); } } }; @@ -1518,18 +1729,32 @@ struct AttentionExecutor : public PagedAttentionExecutor { AttentionExecutor() : _kernel(_helper) {} - void init(const std::vector& inputs, const std::vector& outputs, PlainTensor& q, PlainTensor& k, PlainTensor& v, PlainTensor& k_cache, - PlainTensor& v_cache, PlainTensor& past_lens, PlainTensor& subsequence_begins, PlainTensor& block_indices, PlainTensor& block_indices_begins, - float& scale, size_t& sliding_window, PlainTensor& alibi_slopes, size_t& max_context_len, PlainTensor& output_emb, PlainTensor& output_score) { - q.reset(inputs[ID_Q]); // [B_token, H * S] + void init(const std::vector& inputs, + const std::vector& outputs, + PlainTensor& q, + PlainTensor& k, + PlainTensor& v, + PlainTensor& k_cache, + PlainTensor& v_cache, + PlainTensor& past_lens, + PlainTensor& subsequence_begins, + PlainTensor& block_indices, + PlainTensor& block_indices_begins, + float& scale, + size_t& sliding_window, + PlainTensor& alibi_slopes, + size_t& max_context_len, + PlainTensor& output_emb, + PlainTensor& output_score) { + q.reset(inputs[ID_Q]); // [B_token, H * S] k.reset(inputs[ID_K]); v.reset(inputs[ID_V]); - k_cache.reset(inputs[ID_KCACHE]); // [NUM_BLOCKS, H, 32, S] - v_cache.reset(inputs[ID_VCACHE]); // [NUM_BLOCKS, H, 32, S] - past_lens.reset(inputs[ID_PAST_LENS]); // [B_seq] - subsequence_begins.reset(inputs[ID_SUBSEQUENCE_BEGINS]); // [B_seq+1] - block_indices.reset(inputs[ID_BLOCK_INDICES]); // [num_blocks] - block_indices_begins.reset(inputs[ID_BLOCK_INDICES_BEGINS]);// [B_seq+1] + k_cache.reset(inputs[ID_KCACHE]); // [NUM_BLOCKS, H, 32, S] + v_cache.reset(inputs[ID_VCACHE]); // [NUM_BLOCKS, H, 32, S] + past_lens.reset(inputs[ID_PAST_LENS]); // [B_seq] + subsequence_begins.reset(inputs[ID_SUBSEQUENCE_BEGINS]); // [B_seq+1] + block_indices.reset(inputs[ID_BLOCK_INDICES]); // [num_blocks] + block_indices_begins.reset(inputs[ID_BLOCK_INDICES_BEGINS]); // [B_seq+1] scale = *inputs[ID_SCALE]->getDataAs(); sliding_window = static_cast(*inputs[ID_SLIDING_WINDOW]->getDataAs()); if (!inputs[ID_ALIBI_SLOPES]->getShape().hasZeroDims()) @@ -1542,8 +1767,8 @@ struct AttentionExecutor : public PagedAttentionExecutor { auto B_token = q.size(0); auto Hk = k_cache.size(1); // The layout for per token per head for u8 kv cache: - // |scale(f32)|zeropoint(f32)|quantized feature(u8,idx_1)|quantized feature(u8,idx_2)|...|quantized feature(u8,idx_S)| - // The actual size needs to deduct scale and zeropoint. + // |scale(f32)|zeropoint(f32)|quantized feature(u8,idx_1)|quantized feature(u8,idx_2)|...|quantized + // feature(u8,idx_S)| The actual size needs to deduct scale and zeropoint. auto S = k_cache.size(3) - (k_cache.m_dt == ov::element::Type_t::u8 ? sizeof(float) * 2 : 0); auto SV = v_cache.size(3) - (k_cache.m_dt == ov::element::Type_t::u8 ? 
sizeof(float) * 2 : 0); auto block_size = k_cache.size(2); @@ -1585,8 +1810,14 @@ struct AttentionExecutor : public PagedAttentionExecutor { _helper.init(H, S, SV, Hk, h_each_group_len, block_size, sliding_window, scale, max_context_len, alibi_slopes); } - void concat_pastkv(const PlainTensor& k, const PlainTensor& v, const PlainTensor& k_cache, const PlainTensor& v_cache, - const PlainTensor& past_lens, const PlainTensor& subsequence_begins, const PlainTensor& block_indices, const PlainTensor& block_indices_begins) { + void concat_pastkv(const PlainTensor& k, + const PlainTensor& v, + const PlainTensor& k_cache, + const PlainTensor& v_cache, + const PlainTensor& past_lens, + const PlainTensor& subsequence_begins, + const PlainTensor& block_indices, + const PlainTensor& block_indices_begins) { auto B_token = k.size(0); _slot_mapping.resize({B_token}); @@ -1598,8 +1829,10 @@ struct AttentionExecutor : public PagedAttentionExecutor { auto block_offset_start = kv_len - q_len; for (int32_t j = 0; j < q_len; j++) { auto block_offset = block_offset_start + j; - auto block_number = block_indices.ptr()[block_number_start + block_offset / _helper._block_size]; - _slot_mapping.ptr()[idx++] = block_number * _helper._block_size + block_offset % _helper._block_size; + auto block_number = + block_indices.ptr()[block_number_start + block_offset / _helper._block_size]; + _slot_mapping.ptr()[idx++] = + block_number * _helper._block_size + block_offset % _helper._block_size; } } @@ -1620,12 +1853,36 @@ struct AttentionExecutor : public PagedAttentionExecutor { PlainTensor output_emb; PlainTensor output_score; - init(inputs, outputs, q, k, v, k_cache, v_cache, past_lens, subsequence_begins, block_indices, block_indices_begins, - scale, sliding_window, alibi_slopes, max_context_len, output_emb, output_score); + init(inputs, + outputs, + q, + k, + v, + k_cache, + v_cache, + past_lens, + subsequence_begins, + block_indices, + block_indices_begins, + scale, + sliding_window, + alibi_slopes, + max_context_len, + output_emb, + output_score); concat_pastkv(k, v, k_cache, v_cache, past_lens, subsequence_begins, block_indices, block_indices_begins); - _kernel(q, k_cache, v_cache, output_emb, output_score, max_context_len, past_lens, subsequence_begins, block_indices, - block_indices_begins, alibi_slopes); + _kernel(q, + k_cache, + v_cache, + output_emb, + output_score, + max_context_len, + past_lens, + subsequence_begins, + block_indices, + block_indices_begins, + alibi_slopes); } }; #endif @@ -1635,27 +1892,27 @@ std::shared_ptr make_pa_executor(ov::element::Type data_ #ifdef OPENVINO_ARCH_X86_64 if (data_type == ov::element::bf16) { -#if defined(HAVE_AVX512F) +# if defined(HAVE_AVX512F) if (kvcache_type == ov::element::u8) { executor = std::make_shared>(); } else { OPENVINO_ASSERT(kvcache_type == ov::element::bf16, "expect kvcache type bf16, current: ", kvcache_type); executor = std::make_shared>(); } -#else +# else OPENVINO_THROW("make_pa_executor: bf16 needs avx512+ hardware."); -#endif +# endif } else if (data_type == ov::element::f16) { -#if defined(HAVE_AVX512F) +# if defined(HAVE_AVX512F) if (kvcache_type == ov::element::u8) { executor = std::make_shared>(); } else { OPENVINO_ASSERT(kvcache_type == ov::element::f16, "expect kvcache type f16, current: ", kvcache_type); executor = std::make_shared>(); } -#else - OPENVINO_THROW("make_pa_executor: f16 needs avx512+ hardware."); -#endif +# else + OPENVINO_THROW("make_pa_executor: f16 needs avx512+ hardware."); +# endif } else if (data_type == ov::element::f32) { 
if (kvcache_type == ov::element::u8) { executor = std::make_shared>(); diff --git a/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/executor_pa.hpp b/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/executor_pa.hpp index ed779dee13c96d..d28125b3898460 100644 --- a/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/executor_pa.hpp +++ b/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/executor_pa.hpp @@ -6,8 +6,9 @@ #include #include #include -#include #include +#include + #include "cpu_memory.h" #include "executor_pa_common.hpp" diff --git a/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/executor_pa_common.cpp b/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/executor_pa_common.cpp index 70723a577b0c2b..8a7fa211f8f4ce 100644 --- a/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/executor_pa_common.cpp +++ b/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/executor_pa_common.cpp @@ -1,6 +1,8 @@ // Copyright (C) 2018-2024 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // +#include "executor_pa_common.hpp" + #include #include @@ -9,10 +11,9 @@ #include #include +#include "openvino/core/parallel.hpp" #include "openvino/core/type/bfloat16.hpp" #include "openvino/core/type/float16.hpp" -#include "openvino/core/parallel.hpp" -#include "executor_pa_common.hpp" #include "utils/plain_tensor.hpp" namespace ov { @@ -58,20 +59,23 @@ void TileConfiger::generate() { ret(); } -JitMatMulVecAMX::JitMatMulVecAMX(int head_size, int block_size, ov::element::Type amx_prec) : - jit_generator(jit_name()), m_head_size(head_size), m_block_size(block_size), m_amx_prec(amx_prec) { +JitMatMulVecAMX::JitMatMulVecAMX(int head_size, int block_size, ov::element::Type amx_prec) + : jit_generator(jit_name()), + m_head_size(head_size), + m_block_size(block_size), + m_amx_prec(amx_prec) { create_kernel(); m_tile_cfg.reset(1, 0, { - {16, 4}, // C:0 M x 1 (4b) - {16, 64}, // A:1 M x 32/64 (64b) - {16, 4}, // B:2 32/64 x 1 (4b) - {16, 4}, // B:3 - {16, 4}, // B:4 - {16, 4}, // B:5 - {16, 4}, // B:6 - {16, 4}, // B:7 + {16, 4}, // C:0 M x 1 (4b) + {16, 64}, // A:1 M x 32/64 (64b) + {16, 4}, // B:2 32/64 x 1 (4b) + {16, 4}, // B:3 + {16, 4}, // B:4 + {16, 4}, // B:5 + {16, 4}, // B:6 + {16, 4}, // B:7 }); } diff --git a/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/executor_pa_common.hpp b/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/executor_pa_common.hpp index bc21457a3285b4..81c54c84d9453a 100644 --- a/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/executor_pa_common.hpp +++ b/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/executor_pa_common.hpp @@ -6,11 +6,12 @@ #include #include #include -#include #include -#include "cpu_memory.h" +#include + #include "cpu/x64/cpu_isa_traits.hpp" #include "cpu/x64/jit_generator.hpp" +#include "cpu_memory.h" namespace ov { namespace Extensions { @@ -20,20 +21,21 @@ namespace Cpu { struct PagedAttentionExecutor { // PagedAttention input index - static const size_t ID_Q = 0; // [B_token, H * S], float - static const size_t ID_K = 1; // [B_token, Hk * S], float - static const size_t ID_V = 2; // [B_token, Hk * S], float - static const size_t ID_KCACHE = 3; // [block_number, H, block_size, S], float - static const size_t ID_VCACHE = 4; // [block_number, H, block_size, S], float - static const size_t ID_PAST_LENS = 5; // [B_seq] - static const size_t ID_SUBSEQUENCE_BEGINS = 6; // [B_seq+1] - static const size_t ID_BLOCK_INDICES = 7; // [num_blocks] - static const size_t ID_BLOCK_INDICES_BEGINS = 8; // [B_seq+1] - static const size_t 
ID_SCALE = 9; // [], float - static const size_t ID_SLIDING_WINDOW = 10; // [] - static const size_t ID_ALIBI_SLOPES = 11; // [H|0], float - static const size_t ID_MAX_CONTEXT_LEN = 12; // [] - virtual void execute(const std::vector& inputs, const std::vector outputs) = 0; + static const size_t ID_Q = 0; // [B_token, H * S], float + static const size_t ID_K = 1; // [B_token, Hk * S], float + static const size_t ID_V = 2; // [B_token, Hk * S], float + static const size_t ID_KCACHE = 3; // [block_number, H, block_size, S], float + static const size_t ID_VCACHE = 4; // [block_number, H, block_size, S], float + static const size_t ID_PAST_LENS = 5; // [B_seq] + static const size_t ID_SUBSEQUENCE_BEGINS = 6; // [B_seq+1] + static const size_t ID_BLOCK_INDICES = 7; // [num_blocks] + static const size_t ID_BLOCK_INDICES_BEGINS = 8; // [B_seq+1] + static const size_t ID_SCALE = 9; // [], float + static const size_t ID_SLIDING_WINDOW = 10; // [] + static const size_t ID_ALIBI_SLOPES = 11; // [H|0], float + static const size_t ID_MAX_CONTEXT_LEN = 12; // [] + virtual void execute(const std::vector& inputs, + const std::vector outputs) = 0; virtual ~PagedAttentionExecutor() = default; }; diff --git a/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/mha_single_token.cpp b/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/mha_single_token.cpp index 25ddbb1b4246b1..5a6f0d66f1f221 100644 --- a/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/mha_single_token.cpp +++ b/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/mha_single_token.cpp @@ -4,6 +4,7 @@ #include #include +#include #include #include #include @@ -13,14 +14,16 @@ # include #endif - -#include "openvino/core/type/bfloat16.hpp" -#include "openvino/core/parallel.hpp" -#include "mha_single_token.hpp" #include "common.hpp" +#include "mha_single_token.hpp" +#include "openvino/core/parallel.hpp" +#include "openvino/core/type/bfloat16.hpp" #include "softmax_kernel.hpp" #if defined(OPENVINO_ARCH_ARM64) +# if defined(HAVE_SVE) +# include +# endif # include #endif @@ -33,19 +36,20 @@ using namespace ov; #if defined(HAVE_AVX2) -#define prefetch_bytes(bytes, sel, advance, src) { \ - auto *p = reinterpret_cast(src); \ - for (size_t i = 0; i < bytes; i += 64) \ - _mm_prefetch(p + i + advance, sel); \ -} +# define prefetch_bytes(bytes, sel, advance, src) \ + { \ + auto* p = reinterpret_cast(src); \ + for (size_t i = 0; i < bytes; i += 64) \ + _mm_prefetch(p + i + advance, sel); \ + } #else -#define prefetch_bytes(bytes, sel, advance, src) +# define prefetch_bytes(bytes, sel, advance, src) #endif -template +template void cvt_copy(TA* dst, TB* src, size_t n) { size_t i = 0; #if defined(HAVE_AVX512F) @@ -59,27 +63,43 @@ void cvt_copy(TA* dst, TB* src, size_t n) { mm256_uni_storeu_ps(dst + i, vb); } #elif defined(OPENVINO_ARCH_ARM64) +# if defined(HAVE_SVE) + auto _dst = reinterpret_cast(dst); + size_t inc = vec_len_f32_sve; + svbool_t pg = svptrue_b32(); + + while (i < n) { + if (n - i < vec_len_f32_sve) { + inc = n - i; + pg = svwhilelt_b32(0, static_cast(inc)); + } + svfloat32_t b1 = svld1_f32(pg, src + i); + svst1_f32(pg, _dst + i, b1); + i += inc; + } +# else if (std::is_same::value && std::is_same::value) { for (; i + vec_len_f32_neon <= n; i += vec_len_f32_neon) { float32x4_t vb1 = __vld1q_f32(src + i); __vst1q_f32(dst + i, vb1); } } -#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) +# if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) if (std::is_same::value && std::is_same::value) { for (; i + vec_len_f16_neon <= n; i += vec_len_f16_neon) { 
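// [Editor's note — illustrative sketch, not part of the patch.] The SVE paths introduced
// above all follow the same shape: run full-width iterations under an all-true predicate,
// then shrink the predicate with svwhilelt for the final partial vector instead of
// dropping to a scalar tail. A standalone copy kernel showing just that pattern
// (assumes an SVE-enabled toolchain; the patch itself gates these paths on HAVE_SVE):
#if defined(__ARM_FEATURE_SVE)
#    include <arm_sve.h>
#    include <cstddef>
#    include <cstdint>

inline void copy_f32_sve(float* dst, const float* src, size_t n) {
    size_t i = 0;
    size_t step = svcntw();       // f32 lanes per SVE vector
    svbool_t pg = svptrue_b32();  // all lanes active for the main body
    while (i < n) {
        if (n - i < step) {       // last, partial vector: predicate only the remainder
            step = n - i;
            pg = svwhilelt_b32(0, static_cast<int32_t>(step));
        }
        svfloat32_t v = svld1_f32(pg, src + i);
        svst1_f32(pg, dst + i, v);
        i += step;
    }
}
#endif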
auto vb1 = vld1q_f16(reinterpret_cast(src + i)); vst1q_f16(reinterpret_cast(dst + i), vb1); } } -#endif +# endif +# endif #endif for (; i < n; i++) { dst[i] = src[i]; } } -template +template static void attn_acc_value(float* out, float weight, T* v, size_t S, float* scale, float* zp) { size_t i = 0; #if defined(HAVE_AVX512F) @@ -99,6 +119,27 @@ static void attn_acc_value(float* out, float weight, T* v, size_t S, float* scal mm256_uni_storeu_ps(out + i, v_out); } #elif defined(OPENVINO_ARCH_ARM64) +# if defined(HAVE_SVE) + auto _v = reinterpret_cast(v); + svfloat32_t attn_w_vec_fp32 = svdup_n_f32(weight); + size_t inc = vec_len_f32_sve; + svbool_t pg = svptrue_b32(); + + while (i < S) { + if (S - i < vec_len_f32_sve) { + inc = S - i; + pg = svwhilelt_b32(0, static_cast(inc)); + } + svfloat32_t v_value = svld1_f32(pg, _v + i); + svfloat32_t v_out = svld1_f32(pg, out + i); + + // svmla with merging to preserve inactive lane values when there's ... + // fewer than vec_len elements left + v_out = svmla_f32_m(pg, v_out, attn_w_vec_fp32, v_value); + svst1_f32(pg, out + i, v_out); + i += inc; + } +# else float32x4_t attn_w_vec_fp32 = vdupq_n_f32(weight); for (; i + vec_len_f32_neon <= S; i += vec_len_f32_neon) { float32x4_t v_value = __vld1q_f32(v + i); @@ -106,6 +147,7 @@ static void attn_acc_value(float* out, float weight, T* v, size_t S, float* scal v_out = vmlaq_f32(v_out, attn_w_vec_fp32, v_value); __vst1q_f32(out + i, v_out); } +# endif #endif for (; i < S; i++) { out[i] += weight * v[i]; @@ -113,12 +155,12 @@ static void attn_acc_value(float* out, float weight, T* v, size_t S, float* scal } #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) -template +template static void attn_acc_value(ov::float16* out, ov::float16 weight, T* v, size_t S, float* scale, float* zp) { size_t i = 0; auto attn_w_vec_fp16 = vdupq_n_f16(weight); - auto _v = reinterpret_cast(v); - auto _out = reinterpret_cast(out); + auto _v = reinterpret_cast(v); + auto _out = reinterpret_cast(out); for (; i + vec_len_f16_neon <= S; i += vec_len_f16_neon) { auto v_value = vld1q_f16(_v + i); auto v_out = vld1q_f16(_out + i); @@ -131,7 +173,6 @@ static void attn_acc_value(ov::float16* out, ov::float16 weight, T* v, size_t S, } #endif - static void attn_acc_value(float* out, float weight, uint8_t* v, size_t S, float* scale, float* zp) { size_t i = 0; weight *= *scale; @@ -285,7 +326,7 @@ static void attn_acc_value(float* out, float weight, uint8_t* v, size_t S, float } } -template +template static float sum_q_head(T* a, size_t n) { float sum = 0.0f; size_t i = 0; @@ -357,7 +398,50 @@ static float sum_q_head(T* a, size_t n) { hsum(vsum0); sum = _mm256_cvtss_f32(vsum0); #elif defined(OPENVINO_ARCH_ARM64) - size_t vec_len_f32_neon = 4; +# if defined(HAVE_SVE) + svfloat32_t sum0 = svdup_n_f32(0.0f); + svfloat32_t sum1 = svdup_n_f32(0.0f); + svfloat32_t sum2 = svdup_n_f32(0.0f); + svfloat32_t sum3 = svdup_n_f32(0.0f); + svbool_t pg = svptrue_b32(); + + for (; i + 4 * vec_len_f32_sve <= n; i += 4 * vec_len_f32_sve) { + svfloat32_t a0 = svld1_f32(pg, a + i); + svfloat32_t a1 = svld1_f32(pg, a + i + vec_len_f32_sve); + svfloat32_t a2 = svld1_f32(pg, a + i + vec_len_f32_sve * 2); + svfloat32_t a3 = svld1_f32(pg, a + i + vec_len_f32_sve * 3); + + sum0 = svadd_f32_z(pg, a0, sum0); + sum1 = svadd_f32_z(pg, a1, sum1); + sum2 = svadd_f32_z(pg, a2, sum2); + sum3 = svadd_f32_z(pg, a3, sum3); + } + if (i + 2 * vec_len_f32_sve <= n) { + svfloat32_t a0 = svld1_f32(pg, a + i); + svfloat32_t a1 = svld1_f32(pg, a + i + vec_len_f32_sve); + + sum0 = 
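// [Editor's note — reference semantics, not part of the patch.] Every attn_acc_value
// variant above (AVX-512, AVX2, NEON, SVE) accumulates one weighted value row into the
// fp32 output; the u8 overloads additionally dequantize v with its per-row scale and
// zero-point first. The merging (_m) SVE multiply-add is used, per the comment above, so
// lanes masked off by the tail predicate keep their previous contents instead of being
// zeroed as the _z form would do. Scalar equivalent of the accumulation:
#include <cstddef>

inline void attn_acc_value_ref(float* out, float weight, const float* v, size_t S) {
    for (size_t i = 0; i < S; ++i)
        out[i] += weight * v[i];  // what all vectorized bodies compute
}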
svadd_f32_z(pg, a0, sum0); + sum1 = svadd_f32_z(pg, a1, sum1); + i += 2 * vec_len_f32_sve; + } + if (i + vec_len_f32_sve <= n) { + svfloat32_t a0 = svld1_f32(pg, a + i); + sum0 = svadd_f32_z(pg, a0, sum0); + i += vec_len_f32_sve; + } + // Process tail elements parallely as well (if any) + if (i != n) { + svbool_t pg_rem = svwhilelt_b32(0, static_cast(n - i)); + svfloat32_t a0 = svld1_f32(pg_rem, a + i); + sum0 = svadd_f32_m(pg_rem, sum0, a0); + i = n; + } + float32_t sum_0 = svaddv_f32(pg, sum0); + float32_t sum_1 = svaddv_f32(pg, sum1); + float32_t sum_2 = svaddv_f32(pg, sum2); + float32_t sum_3 = svaddv_f32(pg, sum3); + sum = static_cast(sum_0 + sum_1 + sum_2 + sum_3); +# else float32x4_t vsum0 = vdupq_n_f32(0.0f); float32x4_t vsum1 = vdupq_n_f32(0.0f); float32x4_t vsum2 = vdupq_n_f32(0.0f); @@ -397,8 +481,8 @@ static float sum_q_head(T* a, size_t n) { sum_low = vadd_f32(sum_low, sum_high); sum_low = vpadd_f32(sum_low, sum_low); sum = vget_lane_f32(sum_low, 0); +# endif #endif - for (; i < n; i++) { float tmp = a[i]; sum += tmp; @@ -406,7 +490,7 @@ static float sum_q_head(T* a, size_t n) { return sum; } -template +template static float dot_product(TA* a, TB* b, size_t n, float* scale, float* zp, float* head_sum) { size_t i = 0; float sum = 0.0f; @@ -497,6 +581,63 @@ static float dot_product(TA* a, TB* b, size_t n, float* scale, float* zp, float* sum = _mm256_cvtss_f32(vsum0); #elif defined(OPENVINO_ARCH_ARM64) +# if defined(HAVE_SVE) + svbool_t pg = svptrue_b32(); + svfloat32_t sum0 = svdup_n_f32(0.0f); + svfloat32_t sum1 = svdup_n_f32(0.0f); + svfloat32_t sum2 = svdup_n_f32(0.0f); + svfloat32_t sum3 = svdup_n_f32(0.0f); + + auto _a = reinterpret_cast(a); + auto _b = reinterpret_cast(b); + + for (; i + 4 * vec_len_f32_sve <= n; i += 4 * vec_len_f32_sve) { + svfloat32_t a0 = svld1_f32(pg, _a + i); + svfloat32_t a1 = svld1_f32(pg, _a + i + vec_len_f32_sve); + svfloat32_t a2 = svld1_f32(pg, _a + i + vec_len_f32_sve * 2); + svfloat32_t a3 = svld1_f32(pg, _a + i + vec_len_f32_sve * 3); + + svfloat32_t b0 = svld1_f32(pg, _b + i); + svfloat32_t b1 = svld1_f32(pg, _b + i + vec_len_f32_sve); + svfloat32_t b2 = svld1_f32(pg, _b + i + vec_len_f32_sve * 2); + svfloat32_t b3 = svld1_f32(pg, _b + i + vec_len_f32_sve * 3); + + sum0 = svmla_f32_z(pg, sum0, a0, b0); + sum1 = svmla_f32_z(pg, sum1, a1, b1); + sum2 = svmla_f32_z(pg, sum2, a2, b2); + sum3 = svmla_f32_z(pg, sum3, a3, b3); + } + if (i + 2 * vec_len_f32_sve <= n) { + svfloat32_t a0 = svld1_f32(pg, _a + i); + svfloat32_t a1 = svld1_f32(pg, _a + i + vec_len_f32_sve); + + svfloat32_t b0 = svld1_f32(pg, _b + i); + svfloat32_t b1 = svld1_f32(pg, _b + i + vec_len_f32_sve); + + sum0 = svmla_f32_z(pg, sum0, a0, b0); + sum1 = svmla_f32_z(pg, sum1, a1, b1); + i += 2 * vec_len_f32_sve; + } + if (i + vec_len_f32_sve <= n) { + svfloat32_t a0 = svld1_f32(pg, _a + i); + svfloat32_t b0 = svld1_f32(pg, _b + i); + sum0 = svmla_f32_z(pg, sum0, a0, b0); + i += vec_len_f32_sve; + } + // Process the tail elements parallely as well (if any) + if (i != n) { + svbool_t pg_rem = svwhilelt_b32(0, static_cast(n - i)); + svfloat32_t a0 = svld1_f32(pg_rem, _a + i); + svfloat32_t b0 = svld1_f32(pg_rem, _b + i); + sum0 = svmla_f32_m(pg_rem, sum0, a0, b0); + i = n; + } + float32_t sum_0 = svaddv_f32(pg, sum0); + float32_t sum_1 = svaddv_f32(pg, sum1); + float32_t sum_2 = svaddv_f32(pg, sum2); + float32_t sum_3 = svaddv_f32(pg, sum3); + sum = static_cast(sum_0 + sum_1 + sum_2 + sum_3); +# else float32x4_t vsum0 = vdupq_n_f32(0.0f); float32x4_t vsum1 = vdupq_n_f32(0.0f); 
float32x4_t vsum2 = vdupq_n_f32(0.0f); @@ -543,8 +684,8 @@ static float dot_product(TA* a, TB* b, size_t n, float* scale, float* zp, float* float32x2_t temp_sum = vadd_f32(vget_low_f32(vsum0), vget_high_f32(vsum0)); temp_sum = vpadd_f32(temp_sum, temp_sum); sum = vget_lane_f32(temp_sum, 0); +# endif #endif - for (; i < n; i++) { sum += a[i] * b[i]; } @@ -552,7 +693,12 @@ static float dot_product(TA* a, TB* b, size_t n, float* scale, float* zp, float* } #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) -static ov::float16 dot_product_fp16(ov::float16* a, ov::float16* b, size_t n, float* scale, float* zp, float* head_sum) { +static ov::float16 dot_product_fp16(ov::float16* a, + ov::float16* b, + size_t n, + float* scale, + float* zp, + float* head_sum) { size_t i = 0; ov::float16 sum = 0.0f; auto vsum0 = vdupq_n_f16(0.0f); @@ -609,7 +755,7 @@ static ov::float16 dot_product_fp16(ov::float16* a, ov::float16* b, size_t n, fl } #endif -template +template static float dot_product(TA* a, uint8_t* b, size_t n, float* scale, float* zp, float* head_sum) { size_t i = 0; float sum = 0.0f; @@ -763,11 +909,11 @@ static float dot_product(TA* a, uint8_t* b, size_t n, float* scale, float* zp, f #endif } -template +template static void attn_reduce(T* dst, float* temp, size_t M, size_t S, size_t temp_stride) { size_t i = 0; #if defined(HAVE_AVX512F) - for (; i + vec_len_f32_avx512 <= S; i+= vec_len_f32_avx512) { + for (; i + vec_len_f32_avx512 <= S; i += vec_len_f32_avx512) { auto* src = temp + i; auto result_vec_fp32 = _mm512_setzero_ps(); for (size_t m = 0; m < M; m++) { @@ -790,6 +936,28 @@ static void attn_reduce(T* dst, float* temp, size_t M, size_t S, size_t temp_str mm256_uni_storeu_ps(dst + i, result_vec_fp32); } #elif defined(OPENVINO_ARCH_ARM64) +# if defined(HAVE_SVE) + auto _dst = reinterpret_cast(dst); + size_t inc = vec_len_f32_sve; + svbool_t pg = svptrue_b32(); + + while (i < S) { + if (S - i < vec_len_f32_sve) { + inc = S - i; + pg = svwhilelt_b32(0, static_cast(inc)); + } + auto* src = temp + i; + auto result_vec_fp32 = svdup_n_f32(0.0f); + + for (size_t m = 0; m < M; m++) { + auto o_vec_fp32 = svld1_f32(pg, src); + result_vec_fp32 = svadd_f32_m(pg, result_vec_fp32, o_vec_fp32); + src += temp_stride; + } + svst1_f32(pg, _dst + i, result_vec_fp32); + i += inc; + } +# else for (; i + vec_len_f32_neon <= S; i += vec_len_f32_neon) { auto* src = temp + i; auto result_vec_fp32 = vdupq_n_f32(0.0f); @@ -800,6 +968,7 @@ static void attn_reduce(T* dst, float* temp, size_t M, size_t S, size_t temp_str } __vst1q_f32(dst + i, result_vec_fp32); } +# endif #endif for (; i < S; i++) { auto* src = temp + i; @@ -903,11 +1072,16 @@ static void mha_single_token_kernel(const ov::intel_cpu::PlainTensor& query, for (size_t iwork = start; iwork < end; ++iwork) { auto p = past_k_scale_zp.ptr(pk, 0, h_group); #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) - if (std::is_same::value && std::is_same::value && std::is_same::value) { + if (std::is_same::value && std::is_same::value && + std::is_same::value) { auto p_k = present_key.ptr(0, h_group, pk); prefetch_bytes(S, _MM_HINT_T0, 4096, p_k); - auto _qk = dot_product_fp16(query.ptr(0, h_group), p_k, - S, p, p + 1, head_sum.ptr(0, h_group)); + auto _qk = dot_product_fp16(query.ptr(0, h_group), + p_k, + S, + p, + p + 1, + head_sum.ptr(0, h_group)); buf_attn_w.ptr(0, h_group, 0)[pk] = _qk; parallel_it_step(pk, kv_len, b, B, h_group, h_group_num); continue; @@ -915,8 +1089,9 @@ static void mha_single_token_kernel(const ov::intel_cpu::PlainTensor& query, #endif auto p_k = 
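// [Editor's note — illustrative sketch, not part of the patch.] The u8 dot_product
// overload takes a precomputed head_sum (produced by sum_q_head above); presumably this
// is the usual dequantization identity
//     sum_i a[i] * scale * (b[i] - zp) = scale * (sum_i a[i]*b[i] - zp * sum_i a[i])
// which lets the quantized bytes be multiplied directly and the scale/zero-point applied
// once per row. A scalar reference of that identity, with hypothetical names:
#include <cstddef>
#include <cstdint>

inline float dot_product_u8_ref(const float* a, const uint8_t* b, size_t n,
                                float scale, float zp, float a_sum /* precomputed sum of a */) {
    float acc = 0.0f;
    for (size_t i = 0; i < n; ++i)
        acc += a[i] * static_cast<float>(b[i]);
    return scale * (acc - zp * a_sum);
}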
present_key.ptr(0, h_group, pk); prefetch_bytes(S, _MM_HINT_T0, 4096, p_k); - buf_attn_w.ptr(0, h_group, 0)[pk] = dot_product(query.ptr(0, h_group), p_k, - S, p, p + 1, head_sum.ptr(0, h_group));; + buf_attn_w.ptr(0, h_group, 0)[pk] = + dot_product(query.ptr(0, h_group), p_k, S, p, p + 1, head_sum.ptr(0, h_group)); + ; parallel_it_step(pk, kv_len, b, B, h_group, h_group_num); } } else { @@ -924,10 +1099,15 @@ static void mha_single_token_kernel(const ov::intel_cpu::PlainTensor& query, auto b_kv = beams ? beams.ptr(b)[pk] : b; auto p = past_k_scale_zp.ptr(pk, b_kv, h_group); #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) - if (std::is_same::value && std::is_same::value && std::is_same::value) { + if (std::is_same::value && std::is_same::value && + std::is_same::value) { auto p_k = present_key.ptr(b_kv, h_group, pk); - auto _qk = dot_product_fp16(query.ptr(b, h_group), p_k, - S, p, p + 1, head_sum.ptr(b, h_group)); + auto _qk = dot_product_fp16(query.ptr(b, h_group), + p_k, + S, + p, + p + 1, + head_sum.ptr(b, h_group)); buf_attn_w.ptr(b, h_group, 0)[pk] = _qk; parallel_it_step(pk, kv_len, b, B, h_group, h_group_num); continue; @@ -935,8 +1115,7 @@ static void mha_single_token_kernel(const ov::intel_cpu::PlainTensor& query, #endif auto p_k = present_key.ptr(b_kv, h_group, pk); buf_attn_w.ptr(b, h_group, 0)[pk] = - dot_product(query.ptr(b, h_group), p_k, - S, p, p + 1, head_sum.ptr(b, h_group)); + dot_product(query.ptr(b, h_group), p_k, S, p, p + 1, head_sum.ptr(b, h_group)); parallel_it_step(pk, kv_len, b, B, h_group, h_group_num); } } @@ -947,17 +1126,25 @@ static void mha_single_token_kernel(const ov::intel_cpu::PlainTensor& query, auto p = past_k_scale_zp.ptr(pk, b_kv, h_group); for (size_t h = h_group * h_each_group_len; h < (h_group + 1) * h_each_group_len; h++) { #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) - if (std::is_same::value && std::is_same::value && std::is_same::value) { + if (std::is_same::value && std::is_same::value && + std::is_same::value) { auto p_k = present_key.ptr(b_kv, h_group, pk); - auto _qk = dot_product_fp16(query.ptr(b, h, pq), p_k, - S, p, p + 1, head_sum.ptr(b, h, pq)); + auto _qk = dot_product_fp16(query.ptr(b, h, pq), + p_k, + S, + p, + p + 1, + head_sum.ptr(b, h, pq)); buf_attn_w.ptr(b, h, pq)[pk] = _qk; continue; } #endif - buf_attn_w.ptr(b, h, pq)[pk] = - dot_product(query.ptr(b, h, pq), present_key.ptr(b_kv, h_group, pk), - S, p, p + 1, head_sum.ptr(b, h, pq)); + buf_attn_w.ptr(b, h, pq)[pk] = dot_product(query.ptr(b, h, pq), + present_key.ptr(b_kv, h_group, pk), + S, + p, + p + 1, + head_sum.ptr(b, h, pq)); } } parallel_it_step(pk, kv_len, b, B, h_group, h_group_num); @@ -1001,7 +1188,8 @@ static void mha_single_token_kernel(const ov::intel_cpu::PlainTensor& query, auto* v = present_value.ptr(b_kv, h_group, pv); auto p = past_v_scale_zp.ptr(pv, b_kv, h_group); for (size_t pq = 0; pq < q_len; pq++) { - for (size_t h = h_group * h_each_group_len, group_idx = 0; h < (h_group + 1) * h_each_group_len; h++, group_idx++) { + for (size_t h = h_group * h_each_group_len, group_idx = 0; h < (h_group + 1) * h_each_group_len; + h++, group_idx++) { attn_acc_value(buf_attn_score.ptr(ithr, pq, group_idx), buf_attn_w.ptr(b, h, pq)[pv], v, @@ -1014,7 +1202,7 @@ static void mha_single_token_kernel(const ov::intel_cpu::PlainTensor& query, // convert to dst for (size_t pq = 0; pq < q_len; pq++) { for (size_t h = h_group * h_each_group_len, group_idx = 0; h < (h_group + 1) * h_each_group_len; - h++, group_idx++) { + h++, group_idx++) { auto* dst = has_out_transpose 
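// [Editor's note — illustrative sketch, not part of the patch.] The h_group /
// h_each_group_len loops above implement grouped-query attention: a block of consecutive
// query heads shares one key/value head. Assuming h_each_group_len == H / Hk (query heads
// per KV head), the mapping from a query head to its shared KV head is simply:
#include <cstddef>

inline size_t kv_head_of(size_t query_head, size_t h_each_group_len) {
    return query_head / h_each_group_len;  // == h_group in the loops above
}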
? output_emb.ptr(b, pq, h * SV) : output_emb.ptr(b, h, pq); cvt_copy(dst, buf_attn_score.ptr(ithr, pq, group_idx), SV); } @@ -1239,6 +1427,7 @@ void mha_single_token(const ov::intel_cpu::PlainTensor& query, OPENVINO_THROW("Unsupported precision: ", query.get_precision()); } } + } // namespace XARCH } // namespace Cpu } // namespace Extensions diff --git a/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/mha_single_token.hpp b/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/mha_single_token.hpp index e29e2bae0aa07a..2ef0f62d7e0df0 100644 --- a/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/mha_single_token.hpp +++ b/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/mha_single_token.hpp @@ -6,8 +6,9 @@ #include #include #include -#include #include +#include + #include "utils/plain_tensor.hpp" namespace ov { diff --git a/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/softmax.cpp b/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/softmax.cpp index 28755e69eaf589..c02f9770a37be9 100644 --- a/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/softmax.cpp +++ b/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/softmax.cpp @@ -13,10 +13,10 @@ # include #endif +#include "common.hpp" #include "openvino/core/type/bfloat16.hpp" #include "softmax.hpp" #include "softmax_kernel.hpp" -#include "common.hpp" namespace ov { namespace Extensions { @@ -39,13 +39,33 @@ void attn_softmax(void* a, if (precision == ov::element::f16) { auto _a = reinterpret_cast(a); auto _alibi = reinterpret_cast(alibi); - attn_softmax_kernel(_a, a_dst, scale, _alibi, attn_mask, causal_mask, select_nfltmax_at_0, len, total_size, attn_mask_prec, dst_precision); + attn_softmax_kernel(_a, + a_dst, + scale, + _alibi, + attn_mask, + causal_mask, + select_nfltmax_at_0, + len, + total_size, + attn_mask_prec, + dst_precision); return; } #endif auto _a = reinterpret_cast(a); auto _alibi = reinterpret_cast(alibi); - attn_softmax_kernel(_a, a_dst, scale, _alibi, attn_mask, causal_mask, select_nfltmax_at_0, len, total_size, attn_mask_prec, dst_precision); + attn_softmax_kernel(_a, + a_dst, + scale, + _alibi, + attn_mask, + causal_mask, + select_nfltmax_at_0, + len, + total_size, + attn_mask_prec, + dst_precision); } } // namespace XARCH diff --git a/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/softmax.hpp b/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/softmax.hpp index ee264924e8f256..d620a01e221788 100644 --- a/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/softmax.hpp +++ b/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/softmax.hpp @@ -6,8 +6,8 @@ #include #include #include -#include #include +#include namespace ov { namespace Extensions { diff --git a/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/softmax_kernel.hpp b/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/softmax_kernel.hpp index 60c6a24ec5f2fa..35aab5b59c7d0e 100644 --- a/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/softmax_kernel.hpp +++ b/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/softmax_kernel.hpp @@ -3,16 +3,19 @@ // #pragma once -#include "common.hpp" -#include "openvino/core/type/element_type.hpp" - #include #include #include #include +#include "common.hpp" +#include "openvino/core/type/element_type.hpp" + #if defined(OPENVINO_ARCH_ARM64) -#include "arm_neon.h" +# if defined(HAVE_SVE) +# include "arm_sve.h" +# endif +# include "arm_neon.h" #endif namespace ov { @@ -22,7 +25,7 @@ namespace XARCH { #if defined(HAVE_AVX2) inline void exp_ps_avx2(__m256& src) { -#define REPEAT8(x) x, x, x, x, x, x, x, x 
+# define REPEAT8(x) x, x, x, x, x, x, x, x static const uint32_t c_min[] = {REPEAT8(0xc2aeac50)}; static const uint32_t c_max[] = {REPEAT8(0x42b17218)}; static const uint32_t c_e[] = {REPEAT8(0x3fb8aa3b)}; @@ -36,21 +39,21 @@ inline void exp_ps_avx2(__m256& src) { static const uint32_t c_p4[] = {REPEAT8(0x3d2b9d0d)}; static const uint32_t c_p5[] = {REPEAT8(0x3c07cfce)}; static const uint32_t c_2[] = {REPEAT8(0x40000000)}; -#undef REPEAT8 +# undef REPEAT8 static constexpr int n_mantissa_bits = 23; - __m256 exp_ln_flt_min_f = _mm256_loadu_ps(reinterpret_cast(c_min)); // log(FLT_MIN) - __m256 exp_ln_flt_max_f = _mm256_loadu_ps(reinterpret_cast(c_max)); // log(FLT_MAX) - __m256 exp_log2ef = _mm256_loadu_ps(reinterpret_cast(c_e)); // log2(e) - __m256 half = _mm256_loadu_ps(reinterpret_cast(c_half)); // 0.5f - __m256 ln2f = _mm256_loadu_ps(reinterpret_cast(c_ln2)); // ln(2) - __m256 one = _mm256_loadu_ps(reinterpret_cast(c_1)); // 1.0f - __m256i exponent_bias = _mm256_loadu_si256(reinterpret_cast(c_bias));// 127 - __m256 exp_pol1 = _mm256_loadu_ps(reinterpret_cast(c_p1)); // p1 = 0.999999701f - __m256 exp_pol2 = _mm256_loadu_ps(reinterpret_cast(c_p2)); // p2 = 0.499991506f - __m256 exp_pol3 = _mm256_loadu_ps(reinterpret_cast(c_p3)); // p3 = 0.166676521f - __m256 exp_pol4 = _mm256_loadu_ps(reinterpret_cast(c_p4)); // p4 = 0.0418978221f - __m256 exp_pol5 = _mm256_loadu_ps(reinterpret_cast(c_p5)); // p5 = 0.00828929059f - __m256 two = _mm256_loadu_ps(reinterpret_cast(c_2)); // 2 + __m256 exp_ln_flt_min_f = _mm256_loadu_ps(reinterpret_cast(c_min)); // log(FLT_MIN) + __m256 exp_ln_flt_max_f = _mm256_loadu_ps(reinterpret_cast(c_max)); // log(FLT_MAX) + __m256 exp_log2ef = _mm256_loadu_ps(reinterpret_cast(c_e)); // log2(e) + __m256 half = _mm256_loadu_ps(reinterpret_cast(c_half)); // 0.5f + __m256 ln2f = _mm256_loadu_ps(reinterpret_cast(c_ln2)); // ln(2) + __m256 one = _mm256_loadu_ps(reinterpret_cast(c_1)); // 1.0f + __m256i exponent_bias = _mm256_loadu_si256(reinterpret_cast(c_bias)); // 127 + __m256 exp_pol1 = _mm256_loadu_ps(reinterpret_cast(c_p1)); // p1 = 0.999999701f + __m256 exp_pol2 = _mm256_loadu_ps(reinterpret_cast(c_p2)); // p2 = 0.499991506f + __m256 exp_pol3 = _mm256_loadu_ps(reinterpret_cast(c_p3)); // p3 = 0.166676521f + __m256 exp_pol4 = _mm256_loadu_ps(reinterpret_cast(c_p4)); // p4 = 0.0418978221f + __m256 exp_pol5 = _mm256_loadu_ps(reinterpret_cast(c_p5)); // p5 = 0.00828929059f + __m256 two = _mm256_loadu_ps(reinterpret_cast(c_2)); // 2 // exp(x) = // = exp(n * ln(2) + r) // divide x by ln(2) and get quot and rem // = 2^n * exp(r) // simplify the exp(n*ln(2)) expression @@ -195,32 +198,33 @@ inline void scale_add2_reduce_max(float* a, // process vector body // unroll to avoid dependency caused by _mm256_max_ps for (; i + 4 * vec_len_f32_avx512 <= size; i += 4 * vec_len_f32_avx512) { - #define ITEM(n) \ - v_a = _mm512_loadu_ps(a + i + n * vec_len_f32_avx512); \ - v_a = _mm512_mul_ps(v_a, v_scale); \ - if (has_alibi) { \ - auto v_lookup = _mm512_loadu_ps(alibi_lookup + i + n * vec_len_f32_avx512); \ - v_a = _mm512_fmadd_ps(v_lookup, v_alibi_slope, v_a); \ - } \ - if (has_attn_mask) { \ - auto v_mask = mm512_uni_loadu_ps(attn_mask + i + n * vec_len_f32_avx512); \ - v_a = _mm512_add_ps(v_a, v_mask); \ - } \ - if (has_causal_mask) { \ - auto v_maski8 = _mm_loadu_si128(reinterpret_cast<__m128i const*>(causal_mask + i + n * vec_len_f32_avx512)); \ - auto v_maski32 = _mm512_cvtepi8_epi32(v_maski8); \ - auto kmask = _mm512_cmp_epi32_mask(v_maski32, v_zeroi32, _MM_CMPINT_NE); \ - kmask = 
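// [Editor's note — illustrative sketch, not part of the patch.] The exp_ps_avx2 /
// exp_ps_avx512 helpers above use the range reduction noted in their comments,
// exp(x) = 2^n * exp(r) with n = round(x * log2(e)) and r = x - n*ln(2), approximating
// exp(r) with the degree-5 polynomial whose coefficients p1..p5 are listed there. A scalar
// reference of the same decomposition (the vector code builds 2^n by writing n + 127 into
// the float exponent bits; ldexp is used here for clarity):
#include <algorithm>
#include <cmath>

inline float exp_ref(float x) {
    x = std::min(x, 88.3762626647950f);            // log(FLT_MAX), 0x42b17218
    x = std::max(x, -87.3365478515625f);           // log(FLT_MIN), 0xc2aeac50
    const float n = std::floor(x * 1.44269502f + 0.5f);  // round(x * log2(e))
    const float r = x - n * 0.693147182f;                 // x - n * ln(2)
    const float p = 1.0f + r * (0.999999701f + r * (0.499991506f + r * (0.166676521f
                        + r * (0.0418978221f + r * 0.00828929059f))));
    return std::ldexp(p, static_cast<int>(n));     // 2^n * exp(r)
}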
_kxor_mask16(kmask, kmask_xor); \ - v_a = _mm512_mask_blend_ps(kmask, v_a, v_nfltmax); \ - } \ - v_max##n = _mm512_max_ps(v_max##n, v_a); \ +# define ITEM(n) \ + v_a = _mm512_loadu_ps(a + i + n * vec_len_f32_avx512); \ + v_a = _mm512_mul_ps(v_a, v_scale); \ + if (has_alibi) { \ + auto v_lookup = _mm512_loadu_ps(alibi_lookup + i + n * vec_len_f32_avx512); \ + v_a = _mm512_fmadd_ps(v_lookup, v_alibi_slope, v_a); \ + } \ + if (has_attn_mask) { \ + auto v_mask = mm512_uni_loadu_ps(attn_mask + i + n * vec_len_f32_avx512); \ + v_a = _mm512_add_ps(v_a, v_mask); \ + } \ + if (has_causal_mask) { \ + auto v_maski8 = \ + _mm_loadu_si128(reinterpret_cast<__m128i const*>(causal_mask + i + n * vec_len_f32_avx512)); \ + auto v_maski32 = _mm512_cvtepi8_epi32(v_maski8); \ + auto kmask = _mm512_cmp_epi32_mask(v_maski32, v_zeroi32, _MM_CMPINT_NE); \ + kmask = _kxor_mask16(kmask, kmask_xor); \ + v_a = _mm512_mask_blend_ps(kmask, v_a, v_nfltmax); \ + } \ + v_max##n = _mm512_max_ps(v_max##n, v_a); \ _mm512_storeu_ps(a + i + n * vec_len_f32_avx512, v_a); ITEM(0); ITEM(1); ITEM(2); ITEM(3); - #undef ITEM +# undef ITEM } while (i + vec_len_f32_avx512 <= size) { v_a = _mm512_loadu_ps(a + i); @@ -295,32 +299,32 @@ inline void scale_add2_reduce_max(float* a, // process vector body // unroll to avoid dependency caused by _mm512_max_ps for (; i + 4 * vec_len_f32_avx2 <= size; i += 4 * vec_len_f32_avx2) { - #define ITEM(n) \ - v_a = _mm256_loadu_ps(a + i + n * vec_len_f32_avx2); \ - v_a = _mm256_mul_ps(v_a, v_scale); \ - if (has_alibi) { \ - auto v_lookup = _mm256_loadu_ps(alibi_lookup + i + n * vec_len_f32_avx2); \ - v_a = _mm256_fmadd_ps(v_lookup, v_alibi_slope, v_a); \ - } \ - if (has_attn_mask) { \ - auto v_mask = mm256_uni_loadu_ps(attn_mask + i + n * vec_len_f32_avx2); \ - v_a = _mm256_add_ps(v_a, v_mask); \ - } \ - if (has_causal_mask) { \ - auto v_maski8 = _mm_loadu_si128(reinterpret_cast<__m128i const*>(causal_mask + i + n * vec_len_f32_avx2)); \ - auto v_maski32 = _mm256_cvtepi8_epi32(v_maski8); \ - v_maski32 = _mm256_cmpeq_epi32(v_maski32, v_zeroi32);\ - v_maski32 = _mm256_xor_si256(v_maski32, v_mask_xor);\ - v_a = _mm256_blendv_ps(v_nfltmax, v_a, _mm256_castsi256_ps(v_maski32)); \ - } \ - v_max##n = _mm256_max_ps(v_max##n, v_a); \ +# define ITEM(n) \ + v_a = _mm256_loadu_ps(a + i + n * vec_len_f32_avx2); \ + v_a = _mm256_mul_ps(v_a, v_scale); \ + if (has_alibi) { \ + auto v_lookup = _mm256_loadu_ps(alibi_lookup + i + n * vec_len_f32_avx2); \ + v_a = _mm256_fmadd_ps(v_lookup, v_alibi_slope, v_a); \ + } \ + if (has_attn_mask) { \ + auto v_mask = mm256_uni_loadu_ps(attn_mask + i + n * vec_len_f32_avx2); \ + v_a = _mm256_add_ps(v_a, v_mask); \ + } \ + if (has_causal_mask) { \ + auto v_maski8 = _mm_loadu_si128(reinterpret_cast<__m128i const*>(causal_mask + i + n * vec_len_f32_avx2)); \ + auto v_maski32 = _mm256_cvtepi8_epi32(v_maski8); \ + v_maski32 = _mm256_cmpeq_epi32(v_maski32, v_zeroi32); \ + v_maski32 = _mm256_xor_si256(v_maski32, v_mask_xor); \ + v_a = _mm256_blendv_ps(v_nfltmax, v_a, _mm256_castsi256_ps(v_maski32)); \ + } \ + v_max##n = _mm256_max_ps(v_max##n, v_a); \ _mm256_storeu_ps(a + i + n * vec_len_f32_avx2, v_a); ITEM(0); ITEM(1); ITEM(2); ITEM(3); - #undef ITEM +# undef ITEM } while (i + vec_len_f32_avx2 <= size) { @@ -415,7 +419,7 @@ inline void scale_add2_reduce_max(float* a, uint32x4_t v_maski32[2] = {v_maski32_low, v_maski32_high}; for (int j = 0; j < 2; ++j) { uint32x4_t kmask = vceqq_u32(v_maski32[j], v_zeroi32); // ==0 - v_a = vbslq_f32(kmask, v_nfltmax, v_a); // mask => -FLT_MAX + v_a = 
vbslq_f32(kmask, v_nfltmax, v_a); // mask => -FLT_MAX } } @@ -521,7 +525,7 @@ inline void scale_add2_reduce_max(ov::float16* a, #if defined(HAVE_AVX512F) static inline void exp_ps_avx512(__m512& src) { -#define REPEAT16(x) x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x +# define REPEAT16(x) x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x static const uint32_t c_min[] = {REPEAT16(0xc2aeac50)}; static const uint32_t c_max[] = {REPEAT16(0x42b17218)}; static const uint32_t c_e[] = {REPEAT16(0x3fb8aa3b)}; @@ -535,21 +539,21 @@ static inline void exp_ps_avx512(__m512& src) { static const uint32_t c_p4[] = {REPEAT16(0x3d2b9d0d)}; static const uint32_t c_p5[] = {REPEAT16(0x3c07cfce)}; static const uint32_t c_2[] = {REPEAT16(0x40000000)}; -#undef REPEAT16 +# undef REPEAT16 static constexpr int n_mantissa_bits = 23; - __m512 exp_ln_flt_min_f = _mm512_loadu_ps(reinterpret_cast(c_min)); // log(FLT_MIN) - __m512 exp_ln_flt_max_f = _mm512_loadu_ps(reinterpret_cast(c_max)); // log(FLT_MAX) - __m512 exp_log2ef = _mm512_loadu_ps(reinterpret_cast(c_e)); // log2(e) - __m512 half = _mm512_loadu_ps(reinterpret_cast(c_half)); // 0.5f - __m512 ln2f = _mm512_loadu_ps(reinterpret_cast(c_ln2)); // ln(2) - __m512 one = _mm512_loadu_ps(reinterpret_cast(c_1)); // 1.0f - __m512i exponent_bias = _mm512_loadu_si512(c_bias); // 127 - __m512 exp_pol1 = _mm512_loadu_ps(reinterpret_cast(c_p1)); // p1 = 0.999999701f - __m512 exp_pol2 = _mm512_loadu_ps(reinterpret_cast(c_p2)); // p2 = 0.499991506f - __m512 exp_pol3 = _mm512_loadu_ps(reinterpret_cast(c_p3)); // p3 = 0.166676521f - __m512 exp_pol4 = _mm512_loadu_ps(reinterpret_cast(c_p4)); // p4 = 0.0418978221f - __m512 exp_pol5 = _mm512_loadu_ps(reinterpret_cast(c_p5)); // p5 = 0.00828929059f - __m512 two = _mm512_loadu_ps(reinterpret_cast(c_2)); // 2 + __m512 exp_ln_flt_min_f = _mm512_loadu_ps(reinterpret_cast(c_min)); // log(FLT_MIN) + __m512 exp_ln_flt_max_f = _mm512_loadu_ps(reinterpret_cast(c_max)); // log(FLT_MAX) + __m512 exp_log2ef = _mm512_loadu_ps(reinterpret_cast(c_e)); // log2(e) + __m512 half = _mm512_loadu_ps(reinterpret_cast(c_half)); // 0.5f + __m512 ln2f = _mm512_loadu_ps(reinterpret_cast(c_ln2)); // ln(2) + __m512 one = _mm512_loadu_ps(reinterpret_cast(c_1)); // 1.0f + __m512i exponent_bias = _mm512_loadu_si512(c_bias); // 127 + __m512 exp_pol1 = _mm512_loadu_ps(reinterpret_cast(c_p1)); // p1 = 0.999999701f + __m512 exp_pol2 = _mm512_loadu_ps(reinterpret_cast(c_p2)); // p2 = 0.499991506f + __m512 exp_pol3 = _mm512_loadu_ps(reinterpret_cast(c_p3)); // p3 = 0.166676521f + __m512 exp_pol4 = _mm512_loadu_ps(reinterpret_cast(c_p4)); // p4 = 0.0418978221f + __m512 exp_pol5 = _mm512_loadu_ps(reinterpret_cast(c_p5)); // p5 = 0.00828929059f + __m512 two = _mm512_loadu_ps(reinterpret_cast(c_2)); // 2 // exp(x) = // = exp(n * ln(2) + r) // divide x by ln(2) and get quot and rem // = 2^n * exp(r) // simplify the exp(n*ln(2)) expression @@ -656,6 +660,28 @@ inline void exp_reduce_sum(float* a, const float max, const size_t size, float& hsum(v_sum); sum = _mm256_cvtss_f32(v_sum); #elif defined(OPENVINO_ARCH_ARM64) +# if defined(HAVE_SVE) + svfloat32_t v_a; + svfloat32_t v_max = svdup_n_f32(max); + svfloat32_t v_sum = svdup_n_f32(0.0f); + size_t vec_len_f32_sve = svcntw(); + size_t inc = vec_len_f32_sve; + svbool_t pg = svptrue_b32(); + + while (i < size) { + if (size - i < vec_len_f32_sve) { + inc = size - i; + pg = svwhilelt_b32(0, static_cast(inc)); + } + v_a = svld1_f32(pg, a + i); + v_a = svsub_f32_z(pg, v_a, v_max); + v_a = exp_ps_sve(pg, v_a); + v_sum = svadd_f32_m(pg, 
v_sum, v_a); + svst1_f32(pg, a + i, v_a); + i += inc; + } + sum = svaddv_f32(svptrue_b32(), v_sum); +# else float32x4_t v_a; float32x4_t v_max = vdupq_n_f32(max); float32x4_t v_sum = vdupq_n_f32(0.0f); @@ -669,7 +695,7 @@ inline void exp_reduce_sum(float* a, const float max, const size_t size, float& i += vec_len_f32_neon; } sum = vaddvq_f32(v_sum); - +# endif #endif for (; i < size; i++) { a[i] = exp(a[i] - max); @@ -780,6 +806,22 @@ inline void multiply_scalar(float* a, float* a_dst, const float val, const size_ i += (size - i); } #elif defined(OPENVINO_ARCH_ARM64) +# if defined(HAVE_SVE) + svfloat32_t v_scale = svdup_n_f32(val); + size_t inc = vec_len_f32_sve; + svbool_t pg = svptrue_b32(); + + while (i < size) { + if (size - i < vec_len_f32_sve) { + inc = size - i; + pg = svwhilelt_b32(0, static_cast(inc)); + } + svfloat32_t v_a = svld1_f32(pg, a + i); + v_a = svmul_f32_z(pg, v_a, v_scale); + svst1_f32(pg, a_dst + i, v_a); + i += inc; + } +# else float32x4_t v_scale = vdupq_n_f32(val); while (i + vec_len_f32_neon <= size) { float32x4_t v_a = vld1q_f32(a + i); @@ -787,13 +829,16 @@ inline void multiply_scalar(float* a, float* a_dst, const float val, const size_ vst1q_f32(a_dst + i, v_a); i += vec_len_f32_neon; } +# endif #endif for (; i < size; i++) { a_dst[i] = a[i] * val; } } -template::value || std::is_same::value), bool>::type> +template ::value || std::is_same::value), bool>::type> inline void multiply_scalar(float* a, T* a_dst, const float val, const size_t size) { size_t i = 0; #if defined(HAVE_AVX512F) @@ -899,47 +944,68 @@ inline void attn_softmax_kernel(float* a, ov::element::Type attn_mask_prec, ov::element::Type dst_precision, float alibi_slope) { - using func_fp32_type = void (*)(float*, float, const float*, const float*, const uint8_t*, bool, size_t, float, float&); - using func_bf16_type = void (*)(float*, float, const float*, const ov::bfloat16*, const uint8_t*, bool, size_t, float, float&); - using func_f16_type = void (*)(float*, float, const float*, const ov::float16*, const uint8_t*, bool, size_t, float, float&); - static constexpr func_fp32_type funcs_fp32[] = { - scale_add2_reduce_max, - scale_add2_reduce_max, - scale_add2_reduce_max, - scale_add2_reduce_max, - scale_add2_reduce_max, - scale_add2_reduce_max, - scale_add2_reduce_max, - scale_add2_reduce_max - }; - static constexpr func_bf16_type funcs_bf16[] = { - scale_add2_reduce_max, - scale_add2_reduce_max, - scale_add2_reduce_max, - scale_add2_reduce_max, - scale_add2_reduce_max, - scale_add2_reduce_max, - scale_add2_reduce_max, - scale_add2_reduce_max - }; - static constexpr func_f16_type funcs_f16[] = { - scale_add2_reduce_max, - scale_add2_reduce_max, - scale_add2_reduce_max, - scale_add2_reduce_max, - scale_add2_reduce_max, - scale_add2_reduce_max, - scale_add2_reduce_max, - scale_add2_reduce_max - }; + using func_fp32_type = + void (*)(float*, float, const float*, const float*, const uint8_t*, bool, size_t, float, float&); + using func_bf16_type = + void (*)(float*, float, const float*, const ov::bfloat16*, const uint8_t*, bool, size_t, float, float&); + using func_f16_type = + void (*)(float*, float, const float*, const ov::float16*, const uint8_t*, bool, size_t, float, float&); + static constexpr func_fp32_type funcs_fp32[] = {scale_add2_reduce_max, + scale_add2_reduce_max, + scale_add2_reduce_max, + scale_add2_reduce_max, + scale_add2_reduce_max, + scale_add2_reduce_max, + scale_add2_reduce_max, + scale_add2_reduce_max}; + static constexpr func_bf16_type funcs_bf16[] = {scale_add2_reduce_max, + 
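// [Editor's note — illustrative sketch, not part of the patch.] attn_softmax_kernel
// composes the helpers above into the usual numerically stable softmax: scale/mask plus a
// running maximum (scale_add2_reduce_max), exp(x - max) with a running sum
// (exp_reduce_sum), then normalization by 1/sum (multiply_scalar). A scalar reference of
// that pipeline, without the alibi/mask terms:
#include <cmath>
#include <cstddef>
#include <limits>

inline void softmax_ref(float* a, size_t len, float scale) {
    float max = std::numeric_limits<float>::lowest();
    for (size_t i = 0; i < len; ++i) {   // scale + reduce max
        a[i] *= scale;
        max = std::max(max, a[i]);
    }
    float sum = 0.0f;
    for (size_t i = 0; i < len; ++i) {   // exp(x - max) + reduce sum
        a[i] = std::exp(a[i] - max);
        sum += a[i];
    }
    const float inv = 1.0f / sum;
    for (size_t i = 0; i < len; ++i)     // normalize
        a[i] *= inv;
}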
scale_add2_reduce_max, + scale_add2_reduce_max, + scale_add2_reduce_max, + scale_add2_reduce_max, + scale_add2_reduce_max, + scale_add2_reduce_max, + scale_add2_reduce_max}; + static constexpr func_f16_type funcs_f16[] = {scale_add2_reduce_max, + scale_add2_reduce_max, + scale_add2_reduce_max, + scale_add2_reduce_max, + scale_add2_reduce_max, + scale_add2_reduce_max, + scale_add2_reduce_max, + scale_add2_reduce_max}; int dispatch = (alibi ? 0b100 : 0) | (attn_mask ? 0b010 : 0) | (causal_mask ? 0b001 : 0); float max = std::numeric_limits::lowest(); if (attn_mask_prec == ov::element::f32) { - funcs_fp32[dispatch](a, scale, alibi, static_cast(attn_mask), causal_mask, select_nfltmax_at_0, len, alibi_slope, max); + funcs_fp32[dispatch](a, + scale, + alibi, + static_cast(attn_mask), + causal_mask, + select_nfltmax_at_0, + len, + alibi_slope, + max); } else if (attn_mask_prec == ov::element::bf16) { - funcs_bf16[dispatch](a, scale, alibi, static_cast(attn_mask), causal_mask, select_nfltmax_at_0, len, alibi_slope, max); + funcs_bf16[dispatch](a, + scale, + alibi, + static_cast(attn_mask), + causal_mask, + select_nfltmax_at_0, + len, + alibi_slope, + max); } else { - funcs_f16[dispatch](a, scale, alibi, static_cast(attn_mask), causal_mask, select_nfltmax_at_0, len, alibi_slope, max); + funcs_f16[dispatch](a, + scale, + alibi, + static_cast(attn_mask), + causal_mask, + select_nfltmax_at_0, + len, + alibi_slope, + max); } float sum = 0.0f; @@ -948,7 +1014,7 @@ inline void attn_softmax_kernel(float* a, // divide sum float scalar = 1.0f / sum; if (dst_precision == ov::element::f32) { - multiply_scalar(a, static_cast(a_dst), scalar, len); + multiply_scalar(a, reinterpret_cast(a_dst), scalar, len); // apply causual mask to final result instead of attn_score if (total_size > len) memset(static_cast(a_dst) + len, 0, sizeof(float) * (total_size - len)); @@ -978,47 +1044,89 @@ inline void attn_softmax_kernel(ov::float16* a, ov::element::Type attn_mask_prec, ov::element::Type dst_precision, float alibi_slope) { - using func_fp32_type = void (*)(ov::float16*, float, const ov::float16*, const float*, const uint8_t*, bool, size_t, float, ov::float16&); - using func_bf16_type = void (*)(ov::float16*, float, const ov::float16*, const ov::bfloat16*, const uint8_t*, bool, size_t, float, ov::float16&); - using func_fp16_type = void (*)(ov::float16*, float, const ov::float16*, const ov::float16*, const uint8_t*, bool, size_t, float, ov::float16&); - static constexpr func_fp32_type funcs_fp32[] = { - scale_add2_reduce_max, - scale_add2_reduce_max, - scale_add2_reduce_max, - scale_add2_reduce_max, - scale_add2_reduce_max, - scale_add2_reduce_max, - scale_add2_reduce_max, - scale_add2_reduce_max - }; - static constexpr func_bf16_type funcs_bf16[] = { - scale_add2_reduce_max, - scale_add2_reduce_max, - scale_add2_reduce_max, - scale_add2_reduce_max, - scale_add2_reduce_max, - scale_add2_reduce_max, - scale_add2_reduce_max, - scale_add2_reduce_max - }; - static constexpr func_fp16_type funcs_fp16[] = { - scale_add2_reduce_max, - scale_add2_reduce_max, - scale_add2_reduce_max, - scale_add2_reduce_max, - scale_add2_reduce_max, - scale_add2_reduce_max, - scale_add2_reduce_max, - scale_add2_reduce_max - }; + using func_fp32_type = void (*)(ov::float16*, + float, + const ov::float16*, + const float*, + const uint8_t*, + bool, + size_t, + float, + ov::float16&); + using func_bf16_type = void (*)(ov::float16*, + float, + const ov::float16*, + const ov::bfloat16*, + const uint8_t*, + bool, + size_t, + float, + ov::float16&); + 
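// [Editor's note — illustrative sketch, not part of the patch.] The funcs_fp32 /
// funcs_bf16 / funcs_f16 tables above hold one specialization of scale_add2_reduce_max per
// combination of its three boolean template flags (alibi, attention mask, causal mask);
// the runtime bitmask then picks the matching instantiation without per-element branches.
// The same pattern in miniature, with hypothetical names:
#include <cstdio>

template <bool HAS_ALIBI, bool HAS_MASK, bool HAS_CAUSAL>
void kernel(float /*x*/) {
    std::printf("alibi=%d mask=%d causal=%d\n", HAS_ALIBI, HAS_MASK, HAS_CAUSAL);
}

using kernel_fn = void (*)(float);
static constexpr kernel_fn kernels[] = {
    kernel<false, false, false>, kernel<false, false, true>,
    kernel<false, true, false>,  kernel<false, true, true>,
    kernel<true, false, false>,  kernel<true, false, true>,
    kernel<true, true, false>,   kernel<true, true, true>,
};

inline void dispatch(bool alibi, bool mask, bool causal, float x) {
    const int idx = (alibi ? 0b100 : 0) | (mask ? 0b010 : 0) | (causal ? 0b001 : 0);
    kernels[idx](x);  // index layout matches the table ordering above
}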
using func_fp16_type = void (*)(ov::float16*, + float, + const ov::float16*, + const ov::float16*, + const uint8_t*, + bool, + size_t, + float, + ov::float16&); + static constexpr func_fp32_type funcs_fp32[] = {scale_add2_reduce_max, + scale_add2_reduce_max, + scale_add2_reduce_max, + scale_add2_reduce_max, + scale_add2_reduce_max, + scale_add2_reduce_max, + scale_add2_reduce_max, + scale_add2_reduce_max}; + static constexpr func_bf16_type funcs_bf16[] = {scale_add2_reduce_max, + scale_add2_reduce_max, + scale_add2_reduce_max, + scale_add2_reduce_max, + scale_add2_reduce_max, + scale_add2_reduce_max, + scale_add2_reduce_max, + scale_add2_reduce_max}; + static constexpr func_fp16_type funcs_fp16[] = {scale_add2_reduce_max, + scale_add2_reduce_max, + scale_add2_reduce_max, + scale_add2_reduce_max, + scale_add2_reduce_max, + scale_add2_reduce_max, + scale_add2_reduce_max, + scale_add2_reduce_max}; int dispatch = (alibi ? 0b100 : 0) | (attn_mask ? 0b010 : 0) | (causal_mask ? 0b001 : 0); ov::float16 max = std::numeric_limits::lowest(); if (attn_mask_prec == ov::element::f32) { - funcs_fp32[dispatch](a, scale, alibi, static_cast(attn_mask), causal_mask, select_nfltmax_at_0, len, alibi_slope, max); + funcs_fp32[dispatch](a, + scale, + alibi, + static_cast(attn_mask), + causal_mask, + select_nfltmax_at_0, + len, + alibi_slope, + max); } else if (attn_mask_prec == ov::element::f16) { - funcs_fp16[dispatch](a, scale, alibi, static_cast(attn_mask), causal_mask, select_nfltmax_at_0, len, alibi_slope, max); + funcs_fp16[dispatch](a, + scale, + alibi, + static_cast(attn_mask), + causal_mask, + select_nfltmax_at_0, + len, + alibi_slope, + max); } else { - funcs_bf16[dispatch](a, scale, alibi, static_cast(attn_mask), causal_mask, select_nfltmax_at_0, len, alibi_slope, max); + funcs_bf16[dispatch](a, + scale, + alibi, + static_cast(attn_mask), + causal_mask, + select_nfltmax_at_0, + len, + alibi_slope, + max); } ov::float16 sum = 0.0f; diff --git a/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/transpose_kernel.hpp b/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/transpose_kernel.hpp index b719246e4976a1..93d7db55107951 100644 --- a/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/transpose_kernel.hpp +++ b/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/transpose_kernel.hpp @@ -3,96 +3,108 @@ // #pragma once -#include "common.hpp" -#include "openvino/core/type/element_type.hpp" - #include #include #include #include +#include "common.hpp" +#include "openvino/core/type/element_type.hpp" + namespace ov { namespace Extensions { namespace Cpu { namespace XARCH { #if defined(HAVE_AVX512F) -inline void transpose_m512i_16x16(__m512i& r0, __m512i& r1, __m512i& r2, __m512i& r3, - __m512i& r4, __m512i& r5, __m512i& r6, __m512i& r7, - __m512i& r8, __m512i& r9, __m512i& ra, __m512i& rb, - __m512i& rc, __m512i& rd, __m512i& re, __m512i& rf) { +inline void transpose_m512i_16x16(__m512i& r0, + __m512i& r1, + __m512i& r2, + __m512i& r3, + __m512i& r4, + __m512i& r5, + __m512i& r6, + __m512i& r7, + __m512i& r8, + __m512i& r9, + __m512i& ra, + __m512i& rb, + __m512i& rc, + __m512i& rd, + __m512i& re, + __m512i& rf) { __m512i t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, ta, tb, tc, td, te, tf; - t0 = _mm512_unpacklo_epi32(r0, r1); // 0 16 1 17 4 20 5 21 8 24 9 25 12 28 13 29 - t1 = _mm512_unpackhi_epi32(r0, r1); // 2 18 3 19 6 22 7 23 10 26 11 27 14 30 15 31 - t2 = _mm512_unpacklo_epi32(r2, r3); // 32 48 33 49 ... - t3 = _mm512_unpackhi_epi32(r2, r3); // 34 50 35 51 ... 
- t4 = _mm512_unpacklo_epi32(r4, r5); // 64 80 65 81 ... - t5 = _mm512_unpackhi_epi32(r4, r5); // 66 82 67 83 ... - t6 = _mm512_unpacklo_epi32(r6, r7); // 96 112 97 113 ... - t7 = _mm512_unpackhi_epi32(r6, r7); // 98 114 99 115 ... - t8 = _mm512_unpacklo_epi32(r8, r9); // 128 ... - t9 = _mm512_unpackhi_epi32(r8, r9); // 130 ... - ta = _mm512_unpacklo_epi32(ra, rb); // 160 ... - tb = _mm512_unpackhi_epi32(ra, rb); // 162 ... - tc = _mm512_unpacklo_epi32(rc, rd); // 196 ... - td = _mm512_unpackhi_epi32(rc, rd); // 198 ... - te = _mm512_unpacklo_epi32(re, rf); // 228 ... - tf = _mm512_unpackhi_epi32(re, rf); // 230 ... - - r0 = _mm512_unpacklo_epi64(t0, t2); // 0 16 32 48 ... - r1 = _mm512_unpackhi_epi64(t0, t2); // 1 17 33 49 ... - r2 = _mm512_unpacklo_epi64(t1, t3); // 2 18 34 49 ... - r3 = _mm512_unpackhi_epi64(t1, t3); // 3 19 35 51 ... - r4 = _mm512_unpacklo_epi64(t4, t6); // 64 80 96 112 ... - r5 = _mm512_unpackhi_epi64(t4, t6); // 65 81 97 114 ... - r6 = _mm512_unpacklo_epi64(t5, t7); // 66 82 98 113 ... - r7 = _mm512_unpackhi_epi64(t5, t7); // 67 83 99 115 ... - r8 = _mm512_unpacklo_epi64(t8, ta); // 128 144 160 176 ... - r9 = _mm512_unpackhi_epi64(t8, ta); // 129 145 161 178 ... - ra = _mm512_unpacklo_epi64(t9, tb); // 130 146 162 177 ... - rb = _mm512_unpackhi_epi64(t9, tb); // 131 147 163 179 ... - rc = _mm512_unpacklo_epi64(tc, te); // 192 208 228 240 ... - rd = _mm512_unpackhi_epi64(tc, te); // 193 209 229 241 ... - re = _mm512_unpacklo_epi64(td, tf); // 194 210 230 242 ... - rf = _mm512_unpackhi_epi64(td, tf); // 195 211 231 243 ... - - t0 = _mm512_shuffle_i32x4(r0, r4, 0x88); // 0 16 32 48 8 24 40 56 64 80 96 112 ... - t1 = _mm512_shuffle_i32x4(r1, r5, 0x88); // 1 17 33 49 ... - t2 = _mm512_shuffle_i32x4(r2, r6, 0x88); // 2 18 34 50 ... - t3 = _mm512_shuffle_i32x4(r3, r7, 0x88); // 3 19 35 51 ... - t4 = _mm512_shuffle_i32x4(r0, r4, 0xdd); // 4 20 36 52 ... - t5 = _mm512_shuffle_i32x4(r1, r5, 0xdd); // 5 21 37 53 ... - t6 = _mm512_shuffle_i32x4(r2, r6, 0xdd); // 6 22 38 54 ... - t7 = _mm512_shuffle_i32x4(r3, r7, 0xdd); // 7 23 39 55 ... - t8 = _mm512_shuffle_i32x4(r8, rc, 0x88); // 128 144 160 176 ... - t9 = _mm512_shuffle_i32x4(r9, rd, 0x88); // 129 145 161 177 ... - ta = _mm512_shuffle_i32x4(ra, re, 0x88); // 130 146 162 178 ... - tb = _mm512_shuffle_i32x4(rb, rf, 0x88); // 131 147 163 179 ... - tc = _mm512_shuffle_i32x4(r8, rc, 0xdd); // 132 148 164 180 ... - td = _mm512_shuffle_i32x4(r9, rd, 0xdd); // 133 149 165 181 ... - te = _mm512_shuffle_i32x4(ra, re, 0xdd); // 134 150 166 182 ... - tf = _mm512_shuffle_i32x4(rb, rf, 0xdd); // 135 151 167 183 ... - - r0 = _mm512_shuffle_i32x4(t0, t8, 0x88); // 0 16 32 48 64 80 96 112 ... 240 - r1 = _mm512_shuffle_i32x4(t1, t9, 0x88); // 1 17 33 49 66 81 97 113 ... 241 - r2 = _mm512_shuffle_i32x4(t2, ta, 0x88); // 2 18 34 50 67 82 98 114 ... 242 - r3 = _mm512_shuffle_i32x4(t3, tb, 0x88); // 3 19 35 51 68 83 99 115 ... 243 - r4 = _mm512_shuffle_i32x4(t4, tc, 0x88); // 4 ... - r5 = _mm512_shuffle_i32x4(t5, td, 0x88); // 5 ... - r6 = _mm512_shuffle_i32x4(t6, te, 0x88); // 6 ... - r7 = _mm512_shuffle_i32x4(t7, tf, 0x88); // 7 ... - r8 = _mm512_shuffle_i32x4(t0, t8, 0xdd); // 8 ... - r9 = _mm512_shuffle_i32x4(t1, t9, 0xdd); // 9 ... - ra = _mm512_shuffle_i32x4(t2, ta, 0xdd); // 10 ... - rb = _mm512_shuffle_i32x4(t3, tb, 0xdd); // 11 ... - rc = _mm512_shuffle_i32x4(t4, tc, 0xdd); // 12 ... - rd = _mm512_shuffle_i32x4(t5, td, 0xdd); // 13 ... - re = _mm512_shuffle_i32x4(t6, te, 0xdd); // 14 ... 
- rf = _mm512_shuffle_i32x4(t7, tf, 0xdd); // 15 31 47 63 79 96 111 127 ... 255 + t0 = _mm512_unpacklo_epi32(r0, r1); // 0 16 1 17 4 20 5 21 8 24 9 25 12 28 13 29 + t1 = _mm512_unpackhi_epi32(r0, r1); // 2 18 3 19 6 22 7 23 10 26 11 27 14 30 15 31 + t2 = _mm512_unpacklo_epi32(r2, r3); // 32 48 33 49 ... + t3 = _mm512_unpackhi_epi32(r2, r3); // 34 50 35 51 ... + t4 = _mm512_unpacklo_epi32(r4, r5); // 64 80 65 81 ... + t5 = _mm512_unpackhi_epi32(r4, r5); // 66 82 67 83 ... + t6 = _mm512_unpacklo_epi32(r6, r7); // 96 112 97 113 ... + t7 = _mm512_unpackhi_epi32(r6, r7); // 98 114 99 115 ... + t8 = _mm512_unpacklo_epi32(r8, r9); // 128 ... + t9 = _mm512_unpackhi_epi32(r8, r9); // 130 ... + ta = _mm512_unpacklo_epi32(ra, rb); // 160 ... + tb = _mm512_unpackhi_epi32(ra, rb); // 162 ... + tc = _mm512_unpacklo_epi32(rc, rd); // 196 ... + td = _mm512_unpackhi_epi32(rc, rd); // 198 ... + te = _mm512_unpacklo_epi32(re, rf); // 228 ... + tf = _mm512_unpackhi_epi32(re, rf); // 230 ... + + r0 = _mm512_unpacklo_epi64(t0, t2); // 0 16 32 48 ... + r1 = _mm512_unpackhi_epi64(t0, t2); // 1 17 33 49 ... + r2 = _mm512_unpacklo_epi64(t1, t3); // 2 18 34 49 ... + r3 = _mm512_unpackhi_epi64(t1, t3); // 3 19 35 51 ... + r4 = _mm512_unpacklo_epi64(t4, t6); // 64 80 96 112 ... + r5 = _mm512_unpackhi_epi64(t4, t6); // 65 81 97 114 ... + r6 = _mm512_unpacklo_epi64(t5, t7); // 66 82 98 113 ... + r7 = _mm512_unpackhi_epi64(t5, t7); // 67 83 99 115 ... + r8 = _mm512_unpacklo_epi64(t8, ta); // 128 144 160 176 ... + r9 = _mm512_unpackhi_epi64(t8, ta); // 129 145 161 178 ... + ra = _mm512_unpacklo_epi64(t9, tb); // 130 146 162 177 ... + rb = _mm512_unpackhi_epi64(t9, tb); // 131 147 163 179 ... + rc = _mm512_unpacklo_epi64(tc, te); // 192 208 228 240 ... + rd = _mm512_unpackhi_epi64(tc, te); // 193 209 229 241 ... + re = _mm512_unpacklo_epi64(td, tf); // 194 210 230 242 ... + rf = _mm512_unpackhi_epi64(td, tf); // 195 211 231 243 ... + + t0 = _mm512_shuffle_i32x4(r0, r4, 0x88); // 0 16 32 48 8 24 40 56 64 80 96 112 ... + t1 = _mm512_shuffle_i32x4(r1, r5, 0x88); // 1 17 33 49 ... + t2 = _mm512_shuffle_i32x4(r2, r6, 0x88); // 2 18 34 50 ... + t3 = _mm512_shuffle_i32x4(r3, r7, 0x88); // 3 19 35 51 ... + t4 = _mm512_shuffle_i32x4(r0, r4, 0xdd); // 4 20 36 52 ... + t5 = _mm512_shuffle_i32x4(r1, r5, 0xdd); // 5 21 37 53 ... + t6 = _mm512_shuffle_i32x4(r2, r6, 0xdd); // 6 22 38 54 ... + t7 = _mm512_shuffle_i32x4(r3, r7, 0xdd); // 7 23 39 55 ... + t8 = _mm512_shuffle_i32x4(r8, rc, 0x88); // 128 144 160 176 ... + t9 = _mm512_shuffle_i32x4(r9, rd, 0x88); // 129 145 161 177 ... + ta = _mm512_shuffle_i32x4(ra, re, 0x88); // 130 146 162 178 ... + tb = _mm512_shuffle_i32x4(rb, rf, 0x88); // 131 147 163 179 ... + tc = _mm512_shuffle_i32x4(r8, rc, 0xdd); // 132 148 164 180 ... + td = _mm512_shuffle_i32x4(r9, rd, 0xdd); // 133 149 165 181 ... + te = _mm512_shuffle_i32x4(ra, re, 0xdd); // 134 150 166 182 ... + tf = _mm512_shuffle_i32x4(rb, rf, 0xdd); // 135 151 167 183 ... + + r0 = _mm512_shuffle_i32x4(t0, t8, 0x88); // 0 16 32 48 64 80 96 112 ... 240 + r1 = _mm512_shuffle_i32x4(t1, t9, 0x88); // 1 17 33 49 66 81 97 113 ... 241 + r2 = _mm512_shuffle_i32x4(t2, ta, 0x88); // 2 18 34 50 67 82 98 114 ... 242 + r3 = _mm512_shuffle_i32x4(t3, tb, 0x88); // 3 19 35 51 68 83 99 115 ... 243 + r4 = _mm512_shuffle_i32x4(t4, tc, 0x88); // 4 ... + r5 = _mm512_shuffle_i32x4(t5, td, 0x88); // 5 ... + r6 = _mm512_shuffle_i32x4(t6, te, 0x88); // 6 ... + r7 = _mm512_shuffle_i32x4(t7, tf, 0x88); // 7 ... + r8 = _mm512_shuffle_i32x4(t0, t8, 0xdd); // 8 ... 
+ r9 = _mm512_shuffle_i32x4(t1, t9, 0xdd); // 9 ... + ra = _mm512_shuffle_i32x4(t2, ta, 0xdd); // 10 ... + rb = _mm512_shuffle_i32x4(t3, tb, 0xdd); // 11 ... + rc = _mm512_shuffle_i32x4(t4, tc, 0xdd); // 12 ... + rd = _mm512_shuffle_i32x4(t5, td, 0xdd); // 13 ... + re = _mm512_shuffle_i32x4(t6, te, 0xdd); // 14 ... + rf = _mm512_shuffle_i32x4(t7, tf, 0xdd); // 15 31 47 63 79 96 111 127 ... 255 } -template +template inline void transpose_16x16_kernel(float* _dst, T* src, size_t dst_stride, size_t src_stride) { auto* dst = reinterpret_cast(_dst); __m512i r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, ra, rb, rc, rd, re, rf; @@ -133,7 +145,7 @@ inline void transpose_16x16_kernel(float* _dst, T* src, size_t dst_stride, size_ _mm512_storeu_si512(dst + 15 * dst_stride, rf); } -template +template inline void transpose_16xK_kernel(float* _dst, T* src, size_t K, size_t dst_stride, size_t src_stride) { auto* dst = reinterpret_cast(_dst); __m512i r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, ra, rb, rc, rd, re, rf; @@ -156,24 +168,110 @@ inline void transpose_16xK_kernel(float* _dst, T* src, size_t K, size_t dst_stri transpose_m512i_16x16(r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, ra, rb, rc, rd, re, rf); -#define S(m) _mm512_storeu_si512(dst + 0x##m * dst_stride, r##m) -#define S8() S(0); S(1); S(2); S(3); S(4); S(5); S(6); S(7); +# define S(m) _mm512_storeu_si512(dst + 0x##m * dst_stride, r##m) +# define S8() \ + S(0); \ + S(1); \ + S(2); \ + S(3); \ + S(4); \ + S(5); \ + S(6); \ + S(7); switch (K) { - case 8: S8(); break; - case 9: S8() S(8); break; - case 10: S8(); S(8); S(9); break; - case 11: S8(); S(8); S(9); S(a); break; - case 12: S8(); S(8); S(9); S(a); S(b); break; - case 13: S8(); S(8); S(9); S(a); S(b); S(c); break; - case 14: S8(); S(8); S(9); S(a); S(b); S(c); S(d); break; - case 15: S8(); S(8); S(9); S(a); S(b); S(c); S(d); S(e); break; - case 1: S(0); break; - case 2: S(0); S(1); break; - case 3: S(0); S(1); S(2); break; - case 4: S(0); S(1); S(2); S(3); break; - case 5: S(0); S(1); S(2); S(3); S(4); break; - case 6: S(0); S(1); S(2); S(3); S(4); S(5); break; - case 7: S(0); S(1); S(2); S(3); S(4); S(5); S(6); break; + case 8: + S8(); + break; + case 9: + S8() S(8); + break; + case 10: + S8(); + S(8); + S(9); + break; + case 11: + S8(); + S(8); + S(9); + S(a); + break; + case 12: + S8(); + S(8); + S(9); + S(a); + S(b); + break; + case 13: + S8(); + S(8); + S(9); + S(a); + S(b); + S(c); + break; + case 14: + S8(); + S(8); + S(9); + S(a); + S(b); + S(c); + S(d); + break; + case 15: + S8(); + S(8); + S(9); + S(a); + S(b); + S(c); + S(d); + S(e); + break; + case 1: + S(0); + break; + case 2: + S(0); + S(1); + break; + case 3: + S(0); + S(1); + S(2); + break; + case 4: + S(0); + S(1); + S(2); + S(3); + break; + case 5: + S(0); + S(1); + S(2); + S(3); + S(4); + break; + case 6: + S(0); + S(1); + S(2); + S(3); + S(4); + S(5); + break; + case 7: + S(0); + S(1); + S(2); + S(3); + S(4); + S(5); + S(6); + break; } } @@ -240,30 +338,109 @@ inline void transpose_16xK_kernel(uint32_t* dst, uint32_t* src, size_t K, size_t transpose_m512i_16x16(r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, ra, rb, rc, rd, re, rf); switch (K) { - case 8: S8(); break; - case 9: S8() S(8); break; - case 10: S8(); S(8); S(9); break; - case 11: S8(); S(8); S(9); S(a); break; - case 12: S8(); S(8); S(9); S(a); S(b); break; - case 13: S8(); S(8); S(9); S(a); S(b); S(c); break; - case 14: S8(); S(8); S(9); S(a); S(b); S(c); S(d); break; - case 15: S8(); S(8); S(9); S(a); S(b); S(c); S(d); S(e); break; - case 1: S(0); break; - case 2: S(0); S(1); 
break; - case 3: S(0); S(1); S(2); break; - case 4: S(0); S(1); S(2); S(3); break; - case 5: S(0); S(1); S(2); S(3); S(4); break; - case 6: S(0); S(1); S(2); S(3); S(4); S(5); break; - case 7: S(0); S(1); S(2); S(3); S(4); S(5); S(6); break; + case 8: + S8(); + break; + case 9: + S8() S(8); + break; + case 10: + S8(); + S(8); + S(9); + break; + case 11: + S8(); + S(8); + S(9); + S(a); + break; + case 12: + S8(); + S(8); + S(9); + S(a); + S(b); + break; + case 13: + S8(); + S(8); + S(9); + S(a); + S(b); + S(c); + break; + case 14: + S8(); + S(8); + S(9); + S(a); + S(b); + S(c); + S(d); + break; + case 15: + S8(); + S(8); + S(9); + S(a); + S(b); + S(c); + S(d); + S(e); + break; + case 1: + S(0); + break; + case 2: + S(0); + S(1); + break; + case 3: + S(0); + S(1); + S(2); + break; + case 4: + S(0); + S(1); + S(2); + S(3); + break; + case 5: + S(0); + S(1); + S(2); + S(3); + S(4); + break; + case 6: + S(0); + S(1); + S(2); + S(3); + S(4); + S(5); + break; + case 7: + S(0); + S(1); + S(2); + S(3); + S(4); + S(5); + S(6); + break; } -#undef S -#undef S8 +# undef S +# undef S8 } #elif defined(HAVE_AVX2) // https://stackoverflow.com/questions/25622745/transpose-an-8x8-float-using-avx-avx2 -inline void transpose_8x8(__m256& r0, __m256& r1, __m256& r2, __m256& r3, __m256& r4, __m256& r5, __m256& r6, __m256& r7) { +inline void +transpose_8x8(__m256& r0, __m256& r1, __m256& r2, __m256& r3, __m256& r4, __m256& r5, __m256& r6, __m256& r7) { __m256 t0, t1, t2, t3, t4, t5, t6, t7; __m256 tt0, tt1, tt2, tt3, tt4, tt5, tt6, tt7; t0 = _mm256_unpacklo_ps(r0, r1); @@ -292,7 +469,7 @@ inline void transpose_8x8(__m256& r0, __m256& r1, __m256& r2, __m256& r3, __m256 r7 = _mm256_permute2f128_ps(tt3, tt7, 0x31); } -template +template inline void transpose_16x16_kernel(float* dst, T* src, size_t dst_stride, size_t src_stride) { __m256 r0, r1, r2, r3, r4, r5, r6, r7; @@ -323,7 +500,7 @@ inline void transpose_16x16_kernel(float* dst, T* src, size_t dst_stride, size_t } } -template +template inline void transpose_16xK_kernel(float* dst, T* src, size_t K, size_t dst_stride, size_t src_stride) { __m256 r0, r1, r2, r3, r4, r5, r6, r7; @@ -366,24 +543,59 @@ inline void transpose_16xK_kernel(float* dst, T* src, size_t K, size_t dst_strid transpose_8x8(r0, r1, r2, r3, r4, r5, r6, r7); -#define S(m) _mm256_storeu_ps(dst + j + m * dst_stride, r##m) +# define S(m) _mm256_storeu_ps(dst + j + m * dst_stride, r##m) switch (K) { - case 1: S(0); break; - case 2: S(0); S(1); break; - case 3: S(0); S(1); S(2); break; - case 4: S(0); S(1); S(2); S(3); break; - case 5: S(0); S(1); S(2); S(3); S(4); break; - case 6: S(0); S(1); S(2); S(3); S(4); S(5); break; - case 7: S(0); S(1); S(2); S(3); S(4); S(5); S(6); break; + case 1: + S(0); + break; + case 2: + S(0); + S(1); + break; + case 3: + S(0); + S(1); + S(2); + break; + case 4: + S(0); + S(1); + S(2); + S(3); + break; + case 5: + S(0); + S(1); + S(2); + S(3); + S(4); + break; + case 6: + S(0); + S(1); + S(2); + S(3); + S(4); + S(5); + break; + case 7: + S(0); + S(1); + S(2); + S(3); + S(4); + S(5); + S(6); + break; } -#undef S +# undef S } } } #else -template +template inline void transpose_16x16_kernel(TDST* dst, TSRC* src, size_t dst_stride, size_t src_stride) { for (size_t i = 0; i < 16; i++) { for (size_t j = 0; j < 16; j++) { @@ -392,7 +604,7 @@ inline void transpose_16x16_kernel(TDST* dst, TSRC* src, size_t dst_stride, size } } -template +template inline void transpose_16xK_kernel(TDST* dst, TSRC* src, size_t K, size_t dst_stride, size_t src_stride) { for (size_t i = 0; i < K; 
i++) { for (size_t j = 0; j < 16; j++) { diff --git a/src/plugins/intel_cpu/src/nodes/kernels/x64/brgemm_kernel.cpp b/src/plugins/intel_cpu/src/nodes/kernels/x64/brgemm_kernel.cpp index 2895a272b982b5..7df2e2371a843a 100644 --- a/src/plugins/intel_cpu/src/nodes/kernels/x64/brgemm_kernel.cpp +++ b/src/plugins/intel_cpu/src/nodes/kernels/x64/brgemm_kernel.cpp @@ -4,11 +4,12 @@ #include "brgemm_kernel.hpp" -#include "dnnl_extension_utils.h" -#include "utils/cpu_utils.hpp" #include #include +#include "dnnl_extension_utils.h" +#include "utils/cpu_utils.hpp" + using namespace dnnl::impl::cpu::x64; using namespace dnnl::impl; using namespace dnnl::impl::cpu::x64::matmul; @@ -100,8 +101,9 @@ BrgemmKernel::BrgemmKernel(size_t M, brgemmCtx.M = M_; brgemmCtx.N = N_; brgemmCtx.K = K_; - brgemmCtx.LDA = k ? K_blk : (is_avx_f16_only ? K : lda); // f16 use f32 internally - brgemmCtx.LDB = (!is_f32 || b_transposed) ? rnd_up(N, N_blk) : ldb; // bf16/fp16/b_transposed needs copy + brgemmCtx.LDA = k ? K_blk : (is_avx_f16_only ? K : lda); // f16 use f32 internally + brgemmCtx.LDB = + (!is_f32 || b_transposed) ? rnd_up(N, N_blk) : ldb; // bf16/fp16/b_transposed needs copy brgemmCtx.LDC = ldc; brgemmCtx.dt_in0 = static_cast(DnnlExtensionUtils::ElementTypeToDataType(srcType)); brgemmCtx.dt_in1 = static_cast(DnnlExtensionUtils::ElementTypeToDataType(weiType)); @@ -158,8 +160,8 @@ const size_t BrgemmKernel::get_scratch_b_size() const { } void BrgemmKernel::init_brgemm(brgemmCtx& ctx, - std::unique_ptr& brgKernel, - bool use_amx) { + std::unique_ptr& brgKernel, + bool use_amx) { brgemm_desc_t brgDesc; const bool is_int8 = @@ -208,7 +210,8 @@ void BrgemmKernel::init_brgemm(brgemmCtx& ctx, brgattr.max_bs = 1; brgattr.wary_tail_read = false; brgattr.hint_innermost_loop = brgemm_innermost_undef; - // if b_accumulate is true, it means we want c+=a*b. jit_brgemm_amx_uker_base_t::load_accumulators can support this using tileload(c) without postops + // if b_accumulate is true, it means we want c+=a*b. jit_brgemm_amx_uker_base_t::load_accumulators can support + // this using tileload(c) without postops brgattr.use_uker = true; brgattr.use_interleave_stores = true; brgattr.hint_prefetching = brgemm_kernel_prefetching_t::brgemm_prf1; @@ -248,7 +251,7 @@ void BrgemmKernel::init_brgemm_copy_a( brgCopyKernelConf.K_tail = K_tail; brgCopyKernelConf.K_blk = K_blk; brgCopyKernelConf.use_buffer_a_tail_only = false; - //padding K tail to K_blk, LDA is the stride for target tensor + // padding K tail to K_blk, LDA is the stride for target tensor brgCopyKernelConf.LDA = LDA; brgCopyKernelConf.has_zero_point_b = false; brgCopyKernelConf.s8s8_compensation_required = false; @@ -258,9 +261,13 @@ void BrgemmKernel::init_brgemm_copy_a( brgCopyKernelConf.copy_A_src_stride = copy_A_src_stride; // copy_a_kernel assumes that in/out tensor has same data type except f16 // copy_a_kernel has special path for f16: assuming input(f16) -> output(f32) - brgCopyKernelConf.a_dt_sz = is_avx_f16_only ? sizeof(ov::float16) : DnnlExtensionUtils::sizeOfDataType(static_cast(dt_in0)); + brgCopyKernelConf.a_dt_sz = is_avx_f16_only + ? sizeof(ov::float16) + : DnnlExtensionUtils::sizeOfDataType(static_cast(dt_in0)); // copied A has the same precision of original - brgCopyKernelConf.tr_a_dt_sz = is_avx_f16_only ? sizeof(float) : DnnlExtensionUtils::sizeOfDataType(static_cast(dt_in0)); + brgCopyKernelConf.tr_a_dt_sz = + is_avx_f16_only ? 
sizeof(float) + : DnnlExtensionUtils::sizeOfDataType(static_cast(dt_in0)); brgCopyKernelConf.transposed_A = transpose; brgCopyKernelConf.isa = is_avx_f16_only ? avx512_core_fp16 : avx512_core_amx; @@ -284,7 +291,7 @@ void BrgemmKernel::init_brgemm_copy_b( brgCopyKernelConf.wei_dt = is_avx_f16_only ? dnnl_data_type_t::dnnl_f32 : dt_in1; brgCopyKernelConf.orig_wei_dt = dt_in1; brgCopyKernelConf.wei_n_blk = N_blk; - brgCopyKernelConf.wei_tag = transpose ? dnnl_ba : dnnl_ab; + brgCopyKernelConf.wei_tag = transpose ? dnnl_ba : dnnl_ab; brgCopyKernelConf.copy_B_wei_stride = copy_B_wei_stride; brgCopyKernelConf.transposed_B = transpose; @@ -298,10 +305,14 @@ void BrgemmKernel::init_brgemm_copy_b( brgCopyKernelConf.K_tail = 0; brgCopyKernelConf.N_chunk_elems = brgCopyKernelConf.N_blk; // f16 is computed by upconverting. in(f16) -> out(f32) - brgCopyKernelConf.b_dt_sz = is_avx_f16_only ? sizeof(ov::float16) : - DnnlExtensionUtils::sizeOfDataType(static_cast(brgCopyKernelConf.src_dt)); - brgCopyKernelConf.tr_b_dt_sz = is_avx_f16_only ? sizeof(float) : - DnnlExtensionUtils::sizeOfDataType(static_cast(brgCopyKernelConf.src_dt)); + brgCopyKernelConf.b_dt_sz = + is_avx_f16_only + ? sizeof(ov::float16) + : DnnlExtensionUtils::sizeOfDataType(static_cast(brgCopyKernelConf.src_dt)); + brgCopyKernelConf.tr_b_dt_sz = + is_avx_f16_only + ? sizeof(float) + : DnnlExtensionUtils::sizeOfDataType(static_cast(brgCopyKernelConf.src_dt)); brgCopyKernelConf.req_wei_vnni_downconvert = false; if (is_with_amx) { @@ -390,12 +401,7 @@ void BrgemmKernel::executeGemm(bool is_M_tail, void* a, void* b, void* c, void* auto weight_ptr = ptr_scartch_b + B_stride; auto C_stride = n * count_N * ov::element::f32.size(); auto out_ptr = ptr_C + C_stride; - callBrgemm(brgemmCtx, - brgKernels[getBrgIdx(mIdx, k, n)], - local_a_ptr, - weight_ptr, - out_ptr, - wsp); + callBrgemm(brgemmCtx, brgKernels[getBrgIdx(mIdx, k, n)], local_a_ptr, weight_ptr, out_ptr, wsp); // stride K, N if body kernel is executed. 
if (k == 0) { count_K = brgemmCtx.K * brgemmCtx.LDB; diff --git a/src/plugins/intel_cpu/src/nodes/kernels/x64/dft_uni_kernel.cpp b/src/plugins/intel_cpu/src/nodes/kernels/x64/dft_uni_kernel.cpp index f8b0df611258a7..1d5e81410a0bf3 100644 --- a/src/plugins/intel_cpu/src/nodes/kernels/x64/dft_uni_kernel.cpp +++ b/src/plugins/intel_cpu/src/nodes/kernels/x64/dft_uni_kernel.cpp @@ -4,7 +4,6 @@ #include "dft_uni_kernel.hpp" - using namespace dnnl::impl; using namespace dnnl::impl::utils; using namespace dnnl::impl::cpu::x64; @@ -16,7 +15,8 @@ namespace ov { namespace intel_cpu { template -jit_uni_dft_kernel_f32::jit_uni_dft_kernel_f32() : jit_uni_dft_kernel(), jit_generator(jit_name()) {} +jit_uni_dft_kernel_f32::jit_uni_dft_kernel_f32() : jit_uni_dft_kernel(), + jit_generator(jit_name()) {} template void jit_uni_dft_kernel_f32::create_ker() { @@ -115,11 +115,9 @@ template struct jit_uni_dft_kernel_f32; template struct jit_uni_dft_kernel_f32; template struct jit_uni_dft_kernel_f32; - template -jit_uni_fft_kernel_f32::jit_uni_fft_kernel_f32() - : jit_uni_fft_kernel(), - jit_generator(jit_name()) {} +jit_uni_fft_kernel_f32::jit_uni_fft_kernel_f32() : jit_uni_fft_kernel(), + jit_generator(jit_name()) {} template void jit_uni_fft_kernel_f32::create_ker() { diff --git a/src/plugins/intel_cpu/src/nodes/kernels/x64/dft_uni_kernel.hpp b/src/plugins/intel_cpu/src/nodes/kernels/x64/dft_uni_kernel.hpp index b70c99e5f8a527..095a3db97d2a64 100644 --- a/src/plugins/intel_cpu/src/nodes/kernels/x64/dft_uni_kernel.hpp +++ b/src/plugins/intel_cpu/src/nodes/kernels/x64/dft_uni_kernel.hpp @@ -130,7 +130,6 @@ struct jit_uni_fft_kernel_f32 : public jit_uni_fft_kernel, public dnnl::impl::cp Vmm vmm_data_result = vmm_data_odd_2; - template void loop_process(int step); @@ -138,5 +137,5 @@ struct jit_uni_fft_kernel_f32 : public jit_uni_fft_kernel, public dnnl::impl::cp void move_data(const Xbyak::Xmm& x, const Xbyak::Address& addr, int count); }; -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/kernels/x64/gather_uni_kernel.cpp b/src/plugins/intel_cpu/src/nodes/kernels/x64/gather_uni_kernel.cpp index 5aaefb086f119c..c0de6520b7099c 100644 --- a/src/plugins/intel_cpu/src/nodes/kernels/x64/gather_uni_kernel.cpp +++ b/src/plugins/intel_cpu/src/nodes/kernels/x64/gather_uni_kernel.cpp @@ -3,6 +3,7 @@ // #include "gather_uni_kernel.hpp" + #include "openvino/core/except.hpp" using namespace dnnl::impl::cpu; @@ -10,23 +11,52 @@ using namespace dnnl::impl::cpu; namespace ov { namespace intel_cpu { -const unsigned jitGatherKernelBase::shufMask8bitUni[16] = {0x0C080400, 0x80808080, 0x80808080, 0x80808080, 0x0C080400, 0x80808080, 0x80808080, 0x80808080, - 0x0C080400, 0x80808080, 0x80808080, 0x80808080, 0x0C080400, 0x80808080, 0x80808080, 0x80808080}; -const unsigned jitGatherKernelBase::permMask8bitA2[8] = {0, 4, 1, 5, 2, 6, 3, 7}; -const unsigned jitGatherKernelBase::permMask8bitA5[16] = {0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15}; - -const unsigned jitGatherKernelBase::shufMask16bitUni[16] = {0x05040100, 0x0D0C0908, 0x80808080, 0x80808080, 0x05040100, 0x0D0C0908, 0x80808080, 0x80808080, - 0x05040100, 0x0D0C0908, 0x80808080, 0x80808080, 0x05040100, 0x0D0C0908, 0x80808080, 0x80808080}; -const unsigned jitGatherKernelBase::permMask16bitA2[8] = {0, 1, 4, 5, 2, 3, 6, 7}; -const unsigned jitGatherKernelBase::permMask16bitA5[16] = {0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15}; +const unsigned jitGatherKernelBase::shufMask8bitUni[16] = 
{0x0C080400, + 0x80808080, + 0x80808080, + 0x80808080, + 0x0C080400, + 0x80808080, + 0x80808080, + 0x80808080, + 0x0C080400, + 0x80808080, + 0x80808080, + 0x80808080, + 0x0C080400, + 0x80808080, + 0x80808080, + 0x80808080}; +const unsigned jitGatherKernelBase::permMask8bitA2[8] = {0, 4, 1, 5, 2, 6, 3, 7}; +const unsigned jitGatherKernelBase::permMask8bitA5[16] = {0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15}; + +const unsigned jitGatherKernelBase::shufMask16bitUni[16] = {0x05040100, + 0x0D0C0908, + 0x80808080, + 0x80808080, + 0x05040100, + 0x0D0C0908, + 0x80808080, + 0x80808080, + 0x05040100, + 0x0D0C0908, + 0x80808080, + 0x80808080, + 0x05040100, + 0x0D0C0908, + 0x80808080, + 0x80808080}; +const unsigned jitGatherKernelBase::permMask16bitA2[8] = {0, 1, 4, 5, 2, 3, 6, 7}; +const unsigned jitGatherKernelBase::permMask16bitA5[16] = {0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15}; const unsigned jitGatherKernelBase::incVec[16] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; #define GET_OFF(field) offsetof(gatherJitExecArgs, field) template -jitUniGatherKernel::jitUniGatherKernel(const jGatherConfParams& jcp) : - jitGatherKernelBase(jcp), x64::jit_generator(jit_name()) { +jitUniGatherKernel::jitUniGatherKernel(const jGatherConfParams& jcp) + : jitGatherKernelBase(jcp), + x64::jit_generator(jit_name()) { vlen = x64::cpu_isa_traits::vlen; dataElPerVec = vlen / jcp.dataTypeSize; idxElPerVec = vlen / indicesTypeSize; @@ -74,7 +104,7 @@ void jitUniGatherKernel::generate() { if (!jcp.dynamicShapes) { mov(regAux1, ptr[regParams + GET_OFF(specIndicesSize)]); uni_vpbroadcastd(vmmSpecIdxSizeB, ptr[regAux1]); - uni_vpslld(vmmSpecIdxSizeB, vmmSpecIdxSizeB, idxTypeShift); // multiply by indices type size. + uni_vpslld(vmmSpecIdxSizeB, vmmSpecIdxSizeB, idxTypeShift); // multiply by indices type size. mov(regAux1, ptr[regParams + GET_OFF(specIdxB)]); uni_vmovups(vmmSpecIdxB, ptr[regAux1]); @@ -84,7 +114,7 @@ void jitUniGatherKernel::generate() { uni_vmovups(vmmSrcBeforeAxisSumB, ptr[regAux1]); } - if (jcp.afterAxisSize == 1lu) { // Elementwise case. + if (jcp.afterAxisSize == 1lu) { // Elementwise case. uni_vmovd(reg32SpecIdxSizeB, xmmSpecIdxSizeB); if (jcp.beforeAxisSize != 1lu) { mov(regAux1, ptr[regParams + GET_OFF(axisAndAfterAxisSizeB)]); @@ -98,8 +128,9 @@ void jitUniGatherKernel::generate() { mov(regBetweenBatchAndAxisSize, ptr[regAux1]); mov(regBetweenBatchAndAxisIter, ptr[regParams + GET_OFF(betweenBatchAndAxisIter)]); - if (jcp.specIdxSize < idxElPerVec) { // Short case. - if (jcp.specIdxSize != 1 && jcp.specIdxSize != 2 && jcp.specIdxSize != 4 && jcp.specIdxSize != 8 && jcp.specIdxSize != 16) { + if (jcp.specIdxSize < idxElPerVec) { // Short case. + if (jcp.specIdxSize != 1 && jcp.specIdxSize != 2 && jcp.specIdxSize != 4 && jcp.specIdxSize != 8 && + jcp.specIdxSize != 16) { mov(regAux1, ptr[regParams + GET_OFF(permIdxMask)]); uni_vmovups(vmmPermIdxMask, ptr[regAux1]); } @@ -107,7 +138,7 @@ void jitUniGatherKernel::generate() { mov(regAux1, ptr[regParams + GET_OFF(beforeAxisDiff)]); uni_vmovups(vmmBeforeAxDiffB, ptr[regAux1]); if (jcp.dataTypeSize != 1) - uni_vpslld(vmmBeforeAxDiffB, vmmBeforeAxDiffB, dataTypeShift); // multiply by data type size + uni_vpslld(vmmBeforeAxDiffB, vmmBeforeAxDiffB, dataTypeShift); // multiply by data type size } if (jcp.batchDims > 0lu) { mov(regAux1, ptr[regParams + GET_OFF(srcAfterBatchSizeB)]); @@ -115,14 +146,14 @@ void jitUniGatherKernel::generate() { } process(true, false); - } else { // Long case. + } else { // Long case. 
uni_vmovd(reg32IdxIter, xmmSpecIdxB); fillVlenVector(); process(false, false); } - } else { // Blocked case. - if (jcp.afterAxisSize <= idxElPerVec) { // Short case. + } else { // Blocked case. + if (jcp.afterAxisSize <= idxElPerVec) { // Short case. mov(regAux1, ptr[regParams + GET_OFF(afterAxIdxB)]); uni_vmovups(vmmAfterAxisIdxB, ptr[regAux1]); mov(regAux1, ptr[regParams + GET_OFF(afterAxisPermMask)]); @@ -146,18 +177,19 @@ void jitUniGatherKernel::generate() { } const uint64_t specIdxAndAfterAxisSize = jcp.specIdxSize * jcp.afterAxisSize; if (specIdxAndAfterAxisSize != 1 && specIdxAndAfterAxisSize != 2 && specIdxAndAfterAxisSize != 4 && - specIdxAndAfterAxisSize != 8 && specIdxAndAfterAxisSize != 16) { + specIdxAndAfterAxisSize != 8 && specIdxAndAfterAxisSize != 16) { mov(regAux1, ptr[regParams + GET_OFF(beforeAxisPermMask)]); uni_vmovups(vmmBeforeAxPermMask, ptr[regAux1]); } } process(true, true); - } else { // Long case. - OPENVINO_THROW("Gather kernel does not support static shape with after axis size greater than elements in vector."); + } else { // Long case. + OPENVINO_THROW("Gather kernel does not support static shape with after axis size greater than elements " + "in vector."); } } - } else { // Dynamic shapes. + } else { // Dynamic shapes. mov(regAux1, ptr[regParams + GET_OFF(start)]); uni_vpbroadcastd(vmmSpecIdxB, ptr[regAux1]); mov(regAux1, reinterpret_cast(incVec)); @@ -172,8 +204,8 @@ void jitUniGatherKernel::generate() { uni_vroundps(vmmSrcBeforeAxisSumB, vmmSrcBeforeAxisSumB, 0x1); uni_vfnmadd231ps(vmmSpecIdxB, vmmSrcBeforeAxisSumB, vAux1); uni_vcvtps2dq(vmmSpecIdxB, vmmSpecIdxB); - uni_vpslld(vmmSpecIdxB, vmmSpecIdxB, idxTypeShift); // multiply by indices type size. - uni_vpslld(vmmSpecIdxSizeB, vmmSpecIdxSizeB, idxTypeShift); // multiply by indices type size. + uni_vpslld(vmmSpecIdxB, vmmSpecIdxB, idxTypeShift); // multiply by indices type size. + uni_vpslld(vmmSpecIdxSizeB, vmmSpecIdxSizeB, idxTypeShift); // multiply by indices type size. 
uni_vmovd(reg32SpecIdxSizeB, xmmSpecIdxSizeB); mov(regAux1, ptr[regParams + GET_OFF(betweenBatchAndAxisSize)]); @@ -189,7 +221,8 @@ void jitUniGatherKernel::generate() { mov(regAux1, ptr[regParams + GET_OFF(axisAndAfterAxisSizeB)]); uni_vpbroadcastd(vmmAxisAndAfterAxisSizeB, ptr[regAux1]); - // Formula: srcBeforeAxisSum = ((start / specIndicesSize) % betweenBatchAndAxis) * axisAndAfterAxisSize + srcAfterBatchSize * idxBatchSum + // Formula: srcBeforeAxisSum = ((start / specIndicesSize) % betweenBatchAndAxis) * axisAndAfterAxisSize + + // srcAfterBatchSize * idxBatchSum if (jcp.beforeAxisSize != 1lu) { uni_vpmulld(vmmSrcBeforeAxisSumB, vmmSrcBeforeAxisSumB, vmmAxisAndAfterAxisSizeB); mov(regAux1, ptr[regParams + GET_OFF(srcAfterBatchSizeB)]); @@ -210,28 +243,29 @@ void jitUniGatherKernel::generate() { cmp(regSpecIdxSizeB, vlen); jl(lLessThanVector1, T_NEAR); - uni_vmovd(reg32IdxIter, xmmSpecIdxB); - fillVlenVector(); + uni_vmovd(reg32IdxIter, xmmSpecIdxB); + fillVlenVector(); - process(false, false); - jmp(lE1, T_NEAR); + process(false, false); + jmp(lE1, T_NEAR); L(lLessThanVector1); - mov(regAux1, ptr[regParams + GET_OFF(permIdxMask)]); - uni_vmovups(vmmPermIdxMask, ptr[regAux1]); - if (jcp.beforeAxisSize != 1lu) { - mov(regAux1, ptr[regParams + GET_OFF(beforeAxisDiff)]); - uni_vmovups(vmmBeforeAxDiffB, ptr[regAux1]); - if (jcp.dataTypeSize != 1) - uni_vpslld(vmmBeforeAxDiffB, vmmBeforeAxDiffB, dataTypeShift); // multiply by data type size - } - mov(regAux1, ptr[regParams + GET_OFF(srcAfterBatchSizeB)]); - uni_vpbroadcastd(vmmSrcAfterBatchSizeB, ptr[regAux1]); + mov(regAux1, ptr[regParams + GET_OFF(permIdxMask)]); + uni_vmovups(vmmPermIdxMask, ptr[regAux1]); + if (jcp.beforeAxisSize != 1lu) { + mov(regAux1, ptr[regParams + GET_OFF(beforeAxisDiff)]); + uni_vmovups(vmmBeforeAxDiffB, ptr[regAux1]); + if (jcp.dataTypeSize != 1) + uni_vpslld(vmmBeforeAxDiffB, vmmBeforeAxDiffB, dataTypeShift); // multiply by data type size + } + mov(regAux1, ptr[regParams + GET_OFF(srcAfterBatchSizeB)]); + uni_vpbroadcastd(vmmSrcAfterBatchSizeB, ptr[regAux1]); - process(true, false); + process(true, false); L(lE1); jmp(lEnd, T_NEAR); } - L(lBlock); { + L(lBlock); + { mov(regAux1, ptr[regParams + GET_OFF(start)]); uni_vpbroadcastd(vmmAfterAxisIdxB, ptr[regAux1]); mov(regAux1, reinterpret_cast(incVec)); @@ -246,40 +280,40 @@ void jitUniGatherKernel::generate() { uni_vroundps(vmmSrcBeforeAxisSumB, vmmSrcBeforeAxisSumB, 0x1); uni_vfnmadd231ps(vmmAfterAxisIdxB, vmmSrcBeforeAxisSumB, vAux1); uni_vcvtps2dq(vmmAfterAxisIdxB, vmmAfterAxisIdxB); - uni_vpslld(vmmAfterAxisIdxB, vmmAfterAxisIdxB, idxTypeShift); // multiply by indices type size. + uni_vpslld(vmmAfterAxisIdxB, vmmAfterAxisIdxB, idxTypeShift); // multiply by indices type size. 
Xbyak::Label lLessThanVector2, lTail3, lTail4, lE2; cmp(regAux2, dataElPerVec); jl(lLessThanVector2, T_NEAR); - uni_vmovd(reg32IdxIter, xmmSpecIdxB); - fillVlenVector(); + uni_vmovd(reg32IdxIter, xmmSpecIdxB); + fillVlenVector(); -// process(false, true); - jmp(lE2, T_NEAR); + // process(false, true); + jmp(lE2, T_NEAR); L(lLessThanVector2); - auto& vAux2 = vmmAuxContainer[2]; - // Calculate permute mask - uni_vmovd(xAux0, reg32Aux2); - uni_vpbroadcastd(vAux1, xAux0); - mov(regAux1, reinterpret_cast(&idxElPerVec)); - uni_vpbroadcastd(vAux0, ptr[regAux1]); - uni_vpsubd(vmmAfterAxisPermMask, vAux0, vAux1); - mov(regAux1, reinterpret_cast(incVec)); - uni_vpaddd(vmmAfterAxisPermMask, vmmAfterAxisPermMask, ptr[regAux1]); - for (int i = 0; i < 6; i++) { - if (isa == x64::avx512_core) { - Xbyak::Opmask kMask2 = Xbyak::Opmask(vAux2.getIdx()); - vpcmpgtd(kMask2, vAux0, vmmAfterAxisPermMask); - uni_vpsubd(vmmAfterAxisPermMask | kMask2, vmmAfterAxisPermMask, vAux1); - } else { - vpcmpgtd(vAux2, vAux0, vmmAfterAxisPermMask); - vpandn(vAux2, vAux2, vAux1); - uni_vpsubd(vmmAfterAxisPermMask, vmmAfterAxisPermMask, vAux2); - } + auto& vAux2 = vmmAuxContainer[2]; + // Calculate permute mask + uni_vmovd(xAux0, reg32Aux2); + uni_vpbroadcastd(vAux1, xAux0); + mov(regAux1, reinterpret_cast(&idxElPerVec)); + uni_vpbroadcastd(vAux0, ptr[regAux1]); + uni_vpsubd(vmmAfterAxisPermMask, vAux0, vAux1); + mov(regAux1, reinterpret_cast(incVec)); + uni_vpaddd(vmmAfterAxisPermMask, vmmAfterAxisPermMask, ptr[regAux1]); + for (int i = 0; i < 6; i++) { + if (isa == x64::avx512_core) { + Xbyak::Opmask kMask2 = Xbyak::Opmask(vAux2.getIdx()); + vpcmpgtd(kMask2, vAux0, vmmAfterAxisPermMask); + uni_vpsubd(vmmAfterAxisPermMask | kMask2, vmmAfterAxisPermMask, vAux1); + } else { + vpcmpgtd(vAux2, vAux0, vmmAfterAxisPermMask); + vpandn(vAux2, vAux2, vAux1); + uni_vpsubd(vmmAfterAxisPermMask, vmmAfterAxisPermMask, vAux2); } + } - process(true, true); + process(true, true); L(lE2); } L(lEnd); @@ -323,7 +357,7 @@ void jitUniGatherKernel::normalizeRawIndices(Vmm& vRawIndices, } // Check boundaries. vpcmpgtd(kAuxMask, vmmAxisDim, vRawIndices); - vpcmpd(kDstMask | kAuxMask, vmmZeros, vRawIndices, 2); // 2 - LE + vpcmpd(kDstMask | kAuxMask, vmmZeros, vRawIndices, 2); // 2 - LE // Multiply by type size. 
if (jcp.dataTypeSize > 1) uni_vpslld(vRawIndices, vRawIndices, dataTypeShift); @@ -338,7 +372,7 @@ void jitUniGatherKernel::normWithUpperBound(Vmm& vTarget, Vmm& vMax, template <> void jitUniGatherKernel::normWithUpperBound(Vmm& vTarget, Vmm& vMax, Vmask& kAuxMask) { - vpcmpd(kAuxMask, vMax, vTarget, 2); // 2 -> LE + vpcmpd(kAuxMask, vMax, vTarget, 2); // 2 -> LE uni_vpsubd(vTarget | kAuxMask, vTarget, vMax); } @@ -359,77 +393,77 @@ void jitUniGatherKernel::calcSrcShiftLong(Vmm* vAuxPool, bool shiftFi add(regIdxIter, vlen); cmp(regIdxIter, regSpecIdxSizeB); jge(lIdxStride, T_NEAR); + if (jcp.batchDims > 0lu) { + uni_vpaddd(vDstShifts, vmmIdxBatchSumB, vmmSpecIdxB); + uni_vmovd(reg32Aux1, xmmAuxContainer[vDstShifts.getIdx()]); + } else { + uni_vmovd(reg32Aux1, xmmSpecIdxB); + } + vmovdqu(vDstShifts, ptr[regIndices + regAux1]); + normalizeRawIndices(vDstShifts, kDstMask, kAuxMask0); + if (jcp.beforeAxisSize != 1lu) + uni_vpaddd(vDstShifts, vDstShifts, vmmSrcBeforeAxisSumB); + jmp(lExit, T_NEAR); + L(lIdxStride); + sub(regIdxIter, regSpecIdxSizeB); + vpcmpeqd(kDstMask, vAux0, vAux0); + if (shiftFirst) { + vpcmpgtd(vAux0, vmmSpecIdxSizeB, vmmSpecIdxB); + vpandn(vAux1, vAux0, vmmSpecIdxSizeB); + uni_vpsubd(vAux1, vmmSpecIdxB, vAux1); + if (jcp.batchDims > 0lu) + uni_vpaddd(vAux1, vmmIdxBatchSumB, vAux1); + uni_vpsubd(vmmSpecIdxB, vmmSpecIdxB, vmmSpecIdxSizeB); + } else { if (jcp.batchDims > 0lu) { - uni_vpaddd(vDstShifts, vmmIdxBatchSumB, vmmSpecIdxB); - uni_vmovd(reg32Aux1, xmmAuxContainer[vDstShifts.getIdx()]); + uni_vpaddd(vAux0, vmmIdxBatchSumB, vmmSpecIdxB); + uniVpGatherDd(vDstShifts, ptr[regIndices + vAux0], kDstMask); } else { - uni_vmovd(reg32Aux1, xmmSpecIdxB); + uniVpGatherDd(vDstShifts, ptr[regIndices + vmmSpecIdxB], kDstMask); } - vmovdqu(vDstShifts, ptr[regIndices + regAux1]); normalizeRawIndices(vDstShifts, kDstMask, kAuxMask0); - if (jcp.beforeAxisSize != 1lu) - uni_vpaddd(vDstShifts, vDstShifts, vmmSrcBeforeAxisSumB); - jmp(lExit, T_NEAR); - L(lIdxStride); - sub(regIdxIter, regSpecIdxSizeB); - vpcmpeqd(kDstMask, vAux0, vAux0); - if (shiftFirst) { - vpcmpgtd(vAux0, vmmSpecIdxSizeB, vmmSpecIdxB); - vpandn(vAux1, vAux0, vmmSpecIdxSizeB); - uni_vpsubd(vAux1, vmmSpecIdxB, vAux1); - if (jcp.batchDims > 0lu) - uni_vpaddd(vAux1, vmmIdxBatchSumB, vAux1); - uni_vpsubd(vmmSpecIdxB, vmmSpecIdxB, vmmSpecIdxSizeB); - } else { - if (jcp.batchDims > 0lu) { - uni_vpaddd(vAux0, vmmIdxBatchSumB, vmmSpecIdxB); - uniVpGatherDd(vDstShifts, ptr[regIndices + vAux0], kDstMask); - } else { - uniVpGatherDd(vDstShifts, ptr[regIndices + vmmSpecIdxB], kDstMask); - } - normalizeRawIndices(vDstShifts, kDstMask, kAuxMask0); - uni_vpbroadcastd(vAux0, xmmSpecIdxB); - vpcmpgtd(vAux1, vAux0, vmmSpecIdxB); - vpandn(vAux0, vAux1, vmmSpecIdxSizeB); - uni_vpsubd(vmmSpecIdxB, vmmSpecIdxB, vAux0); + uni_vpbroadcastd(vAux0, xmmSpecIdxB); + vpcmpgtd(vAux1, vAux0, vmmSpecIdxB); + vpandn(vAux0, vAux1, vmmSpecIdxSizeB); + uni_vpsubd(vmmSpecIdxB, vmmSpecIdxB, vAux0); - if (jcp.beforeAxisSize != 1lu) { - uni_vpaddd(vDstShifts, vDstShifts, vmmSrcBeforeAxisSumB); - vpandn(vAux0, vAux1, vmmAxisAndAfterAxisSizeB); - uni_vpaddd(vmmSrcBeforeAxisSumB, vmmSrcBeforeAxisSumB, vAux0); - } + if (jcp.beforeAxisSize != 1lu) { + uni_vpaddd(vDstShifts, vDstShifts, vmmSrcBeforeAxisSumB); + vpandn(vAux0, vAux1, vmmAxisAndAfterAxisSizeB); + uni_vpaddd(vmmSrcBeforeAxisSumB, vmmSrcBeforeAxisSumB, vAux0); } + } - if (jcp.batchDims > 0lu) { - Xbyak::Label l1; - inc(regBetweenBatchAndAxisIter); - cmp(regBetweenBatchAndAxisIter, 
regBetweenBatchAndAxisSize); - jl(l1, T_NEAR); - mov(regBetweenBatchAndAxisIter, 0); - if (shiftFirst) { - uni_vpaddd(vmmIdxBatchSumB, vmmIdxBatchSumB, vmmSpecIdxSizeB); - vpandn(vDstShifts, vAux0, vmmSpecIdxSizeB); - uni_vpaddd(vAux1, vAux1, vDstShifts); - } else { - vpandn(vAux0, vAux1, vmmSpecIdxSizeB); - uni_vpaddd(vmmIdxBatchSumB, vmmIdxBatchSumB, vAux0); - } - L(l1); + if (jcp.batchDims > 0lu) { + Xbyak::Label l1; + inc(regBetweenBatchAndAxisIter); + cmp(regBetweenBatchAndAxisIter, regBetweenBatchAndAxisSize); + jl(l1, T_NEAR); + mov(regBetweenBatchAndAxisIter, 0); + if (shiftFirst) { + uni_vpaddd(vmmIdxBatchSumB, vmmIdxBatchSumB, vmmSpecIdxSizeB); + vpandn(vDstShifts, vAux0, vmmSpecIdxSizeB); + uni_vpaddd(vAux1, vAux1, vDstShifts); + } else { + vpandn(vAux0, vAux1, vmmSpecIdxSizeB); + uni_vpaddd(vmmIdxBatchSumB, vmmIdxBatchSumB, vAux0); } + L(l1); + } - if (shiftFirst) { - uniVpGatherDd(vDstShifts, ptr[regIndices + vAux1], kDstMask); - normalizeRawIndices(vDstShifts, kDstMask, kAuxMask0); + if (shiftFirst) { + uniVpGatherDd(vDstShifts, ptr[regIndices + vAux1], kDstMask); + normalizeRawIndices(vDstShifts, kDstMask, kAuxMask0); - if (jcp.beforeAxisSize != 1lu) { - vpandn(vAux0, vAux0, vmmAxisAndAfterAxisSizeB); - uni_vpaddd(vAux0, vAux0, vmmSrcBeforeAxisSumB); - uni_vpaddd(vmmSrcBeforeAxisSumB, vmmSrcBeforeAxisSumB, vmmAxisAndAfterAxisSizeB); + if (jcp.beforeAxisSize != 1lu) { + vpandn(vAux0, vAux0, vmmAxisAndAfterAxisSizeB); + uni_vpaddd(vAux0, vAux0, vmmSrcBeforeAxisSumB); + uni_vpaddd(vmmSrcBeforeAxisSumB, vmmSrcBeforeAxisSumB, vmmAxisAndAfterAxisSizeB); - uni_vpaddd(vDstShifts, vDstShifts, vAux0); - } + uni_vpaddd(vDstShifts, vDstShifts, vAux0); } + } L(lExit); } @@ -451,81 +485,81 @@ void jitUniGatherKernel::calcSrcShiftLong(Vmm* vAuxPool, bool add(regIdxIter, vlen); cmp(regIdxIter, regSpecIdxSizeB); jge(lIdxStride, T_NEAR); + if (jcp.batchDims > 0lu) { + uni_vpaddd(vDstShifts, vmmIdxBatchSumB, vmmSpecIdxB); + uni_vmovd(reg32Aux1, xmmAuxContainer[vDstShifts.getIdx()]); + } else { + uni_vmovd(reg32Aux1, xmmSpecIdxB); + } + vmovdqu64(vDstShifts, ptr[regIndices + regAux1]); + normalizeRawIndices(vDstShifts, kDstMask, kAuxMask0); + if (jcp.beforeAxisSize != 1lu) + uni_vpaddd(vDstShifts, vDstShifts, vmmSrcBeforeAxisSumB); + jmp(lExit, T_NEAR); + L(lIdxStride); + sub(regIdxIter, regSpecIdxSizeB); + vpcmpeqd(kDstMask, vDstShifts, vDstShifts); + if (shiftFirst) { + vpcmpd(kAuxMask1, vmmSpecIdxSizeB, vmmSpecIdxB, 2); // 2 -> LE if (jcp.batchDims > 0lu) { - uni_vpaddd(vDstShifts, vmmIdxBatchSumB, vmmSpecIdxB); - uni_vmovd(reg32Aux1, xmmAuxContainer[vDstShifts.getIdx()]); + uni_vpaddd(vAux1, vmmIdxBatchSumB, vmmSpecIdxB); + uni_vpsubd(vAux1 | kAuxMask1, vAux1, vmmSpecIdxSizeB); } else { - uni_vmovd(reg32Aux1, xmmSpecIdxB); + uni_vmovups(vAux1, vmmSpecIdxB); + uni_vpsubd(vAux1 | kAuxMask1, vmmSpecIdxB, vmmSpecIdxSizeB); } - vmovdqu64(vDstShifts, ptr[regIndices + regAux1]); - normalizeRawIndices(vDstShifts, kDstMask, kAuxMask0); - if (jcp.beforeAxisSize != 1lu) - uni_vpaddd(vDstShifts, vDstShifts, vmmSrcBeforeAxisSumB); - jmp(lExit, T_NEAR); - L(lIdxStride); - sub(regIdxIter, regSpecIdxSizeB); - vpcmpeqd(kDstMask, vDstShifts, vDstShifts); - if (shiftFirst) { - vpcmpd(kAuxMask1, vmmSpecIdxSizeB, vmmSpecIdxB, 2); // 2 -> LE - if (jcp.batchDims > 0lu) { - uni_vpaddd(vAux1, vmmIdxBatchSumB, vmmSpecIdxB); - uni_vpsubd(vAux1 | kAuxMask1, vAux1, vmmSpecIdxSizeB); - } else { - uni_vmovups(vAux1, vmmSpecIdxB); - uni_vpsubd(vAux1 | kAuxMask1, vmmSpecIdxB, vmmSpecIdxSizeB); - } - uni_vpsubd(vmmSpecIdxB, 
vmmSpecIdxB, vmmSpecIdxSizeB); + uni_vpsubd(vmmSpecIdxB, vmmSpecIdxB, vmmSpecIdxSizeB); + } else { + if (jcp.batchDims > 0lu) { + uni_vpaddd(vAux0, vmmIdxBatchSumB, vmmSpecIdxB); + uniVpGatherDd(vDstShifts, ptr[regIndices + vAux0], kDstMask); } else { - if (jcp.batchDims > 0lu) { - uni_vpaddd(vAux0, vmmIdxBatchSumB, vmmSpecIdxB); - uniVpGatherDd(vDstShifts, ptr[regIndices + vAux0], kDstMask); - } else { - uniVpGatherDd(vDstShifts, ptr[regIndices + vmmSpecIdxB], kDstMask); - } - normalizeRawIndices(vDstShifts, kDstMask, kAuxMask0); + uniVpGatherDd(vDstShifts, ptr[regIndices + vmmSpecIdxB], kDstMask); + } + normalizeRawIndices(vDstShifts, kDstMask, kAuxMask0); - uni_vpbroadcastd(vAux0, xmmSpecIdxB); - vpcmpd(kAuxMask1, vAux0, vmmSpecIdxB, 2); // 2 -> LE - uni_vpsubd(vmmSpecIdxB | kAuxMask1, vmmSpecIdxB, vmmSpecIdxSizeB); + uni_vpbroadcastd(vAux0, xmmSpecIdxB); + vpcmpd(kAuxMask1, vAux0, vmmSpecIdxB, 2); // 2 -> LE + uni_vpsubd(vmmSpecIdxB | kAuxMask1, vmmSpecIdxB, vmmSpecIdxSizeB); - if (jcp.beforeAxisSize != 1lu) { - uni_vpaddd(vDstShifts, vDstShifts, vmmSrcBeforeAxisSumB); - uni_vpaddd(vmmSrcBeforeAxisSumB | kAuxMask1, vmmSrcBeforeAxisSumB, vmmAxisAndAfterAxisSizeB); - } + if (jcp.beforeAxisSize != 1lu) { + uni_vpaddd(vDstShifts, vDstShifts, vmmSrcBeforeAxisSumB); + uni_vpaddd(vmmSrcBeforeAxisSumB | kAuxMask1, vmmSrcBeforeAxisSumB, vmmAxisAndAfterAxisSizeB); } + } - if (jcp.batchDims > 0lu) { - Xbyak::Label l1; - inc(regBetweenBatchAndAxisIter); - cmp(regBetweenBatchAndAxisIter, regBetweenBatchAndAxisSize); - jl(l1, T_NEAR); - mov(regBetweenBatchAndAxisIter, 0); - if (shiftFirst) { - uni_vpaddd(vmmIdxBatchSumB, vmmIdxBatchSumB, vmmSpecIdxSizeB); - uni_vpaddd(vAux1 | kAuxMask1, vAux1, vmmSpecIdxSizeB); - } else { - uni_vpaddd(vmmIdxBatchSumB | kAuxMask1, vmmIdxBatchSumB, vmmSpecIdxSizeB); - } - L(l1); + if (jcp.batchDims > 0lu) { + Xbyak::Label l1; + inc(regBetweenBatchAndAxisIter); + cmp(regBetweenBatchAndAxisIter, regBetweenBatchAndAxisSize); + jl(l1, T_NEAR); + mov(regBetweenBatchAndAxisIter, 0); + if (shiftFirst) { + uni_vpaddd(vmmIdxBatchSumB, vmmIdxBatchSumB, vmmSpecIdxSizeB); + uni_vpaddd(vAux1 | kAuxMask1, vAux1, vmmSpecIdxSizeB); + } else { + uni_vpaddd(vmmIdxBatchSumB | kAuxMask1, vmmIdxBatchSumB, vmmSpecIdxSizeB); } + L(l1); + } - if (shiftFirst) { - uniVpGatherDd(vDstShifts, ptr[regIndices + vAux1], kDstMask); - normalizeRawIndices(vDstShifts, kDstMask, kAuxMask0); + if (shiftFirst) { + uniVpGatherDd(vDstShifts, ptr[regIndices + vAux1], kDstMask); + normalizeRawIndices(vDstShifts, kDstMask, kAuxMask0); - if (jcp.beforeAxisSize != 1lu) { - uni_vpaddd(vDstShifts, vDstShifts, vmmSrcBeforeAxisSumB); - uni_vpaddd(vDstShifts | kAuxMask1, vDstShifts, vmmAxisAndAfterAxisSizeB); - uni_vpaddd(vmmSrcBeforeAxisSumB, vmmSrcBeforeAxisSumB, vmmAxisAndAfterAxisSizeB); - } + if (jcp.beforeAxisSize != 1lu) { + uni_vpaddd(vDstShifts, vDstShifts, vmmSrcBeforeAxisSumB); + uni_vpaddd(vDstShifts | kAuxMask1, vDstShifts, vmmAxisAndAfterAxisSizeB); + uni_vpaddd(vmmSrcBeforeAxisSumB, vmmSrcBeforeAxisSumB, vmmAxisAndAfterAxisSizeB); } + } L(lExit); } template void jitUniGatherKernel::calcSrcShiftLongBlock(Vmm* vAuxPool, bool shiftFirst) { - // Most likely there will no significant performance gain vs memcpy in reference implementation on big blocks after axis, - // therefore no time was invested to this case yet. + // Most likely there will no significant performance gain vs memcpy in reference implementation on big blocks after + // axis, therefore no time was invested to this case yet. 
OPENVINO_THROW("Unsupported case."); } @@ -541,7 +575,8 @@ void jitUniGatherKernel::calcSrcShiftShort(Vmm* vAuxPool, bool shiftFirst) if (jcp.beforeAxisSize != 1lu) uni_vpaddd(vmmSrcBeforeAxisSumB, vmmSrcBeforeAxisSumB, vmmBeforeAxDiffB); // No sense to permute if specIdxSize is one of {1, 2, 4, 8, 16}. 0 is reserved for dynamic case. - if (jcp.specIdxSize != 1 && jcp.specIdxSize != 2 && jcp.specIdxSize != 4 && jcp.specIdxSize != 8 && jcp.specIdxSize != 16) { + if (jcp.specIdxSize != 1 && jcp.specIdxSize != 2 && jcp.specIdxSize != 4 && jcp.specIdxSize != 8 && + jcp.specIdxSize != 16) { vpermd(vmmSpecIdxB, vmmPermIdxMask, vmmSpecIdxB); if (jcp.beforeAxisSize != 1lu) vpermd(vmmBeforeAxDiffB, vmmPermIdxMask, vmmBeforeAxDiffB); @@ -588,7 +623,8 @@ void jitUniGatherKernel::calcSrcShiftShortBlock(Vmm* vAuxPool, bool shiftFi normWithUpperBound(vmmSpecIdxB, vmmSpecIdxSizeB, kAuxMask0); } // No sense to permute if afterAxisSize is one of {1, 2, 4, 8, 16}. 0 is reserved for dynamic case. - if (jcp.afterAxisSize != 1 && jcp.afterAxisSize != 2 && jcp.afterAxisSize != 4 && jcp.afterAxisSize != 8 && jcp.afterAxisSize != 16) { + if (jcp.afterAxisSize != 1 && jcp.afterAxisSize != 2 && jcp.afterAxisSize != 4 && jcp.afterAxisSize != 8 && + jcp.afterAxisSize != 16) { vpermd(vmmAfterAxisIdxB, vmmAfterAxisPermMask, vmmAfterAxisIdxB); if (jcp.specIdxSize != 1) vpermd(vmmSpecIdxDiff, vmmAfterAxisPermMask, vmmSpecIdxDiff); @@ -600,33 +636,33 @@ void jitUniGatherKernel::calcSrcShiftShortBlock(Vmm* vAuxPool, bool shiftFi uni_vpaddd(vmmSrcBeforeAxisSumB, vmmSrcBeforeAxisSumB, vmmBeforeAxDiffB); uni_vmovups(vAux1, vmmSrcBeforeAxisSumB); if (specIdxAndAfterAxisSize != 1 && specIdxAndAfterAxisSize != 2 && specIdxAndAfterAxisSize != 4 && - specIdxAndAfterAxisSize != 8 && specIdxAndAfterAxisSize != 16) + specIdxAndAfterAxisSize != 8 && specIdxAndAfterAxisSize != 16) vpermd(vmmBeforeAxDiffB, vmmBeforeAxPermMask, vmmBeforeAxDiffB); } else { Xbyak::Label lBeforeAxStep, lBeforeAxStepEnd; add(rSpecIdxAndAfterAxIterB, idxElPerVec * jcp.dataTypeSize); cmp(rSpecIdxAndAfterAxIterB, rSpecIdxAndAfterAxSizeB); jl(lBeforeAxStep, T_NEAR); - sub(rSpecIdxAndAfterAxIterB, rSpecIdxAndAfterAxSizeB); - - vpmulld(vAux0, vmmSpecIdxB, vmmAfterAxisSize); - uni_vpaddd(vAux0, vAux0, vmmAfterAxisIdxB); - Xbyak::Xmm& xAux0 = xmmAuxContainer[vAux0.getIdx()]; - uni_vpbroadcastd(vAux1, xAux0); - if (isa == x64::avx512_core) { - Xbyak::Opmask kMask0 = Xbyak::Opmask(kAuxMask0.getIdx()); - vpcmpgtd(kMask0, vAux1, vAux0); - uni_vmovups(vAux1, vmmSrcBeforeAxisSumB); - uni_vpaddd(vAux1 | kMask0, vmmSrcBeforeAxisSumB, vmmAxisAndAfterAxisSizeB); - } else { - vpcmpgtd(vAux1, vAux1, vAux0); - vpand(vAux1, vAux1, vmmAxisAndAfterAxisSizeB); - uni_vpaddd(vAux1, vmmSrcBeforeAxisSumB, vAux1); - } - uni_vpaddd(vmmSrcBeforeAxisSumB, vmmSrcBeforeAxisSumB, vmmAxisAndAfterAxisSizeB); - jmp(lBeforeAxStepEnd); - L(lBeforeAxStep); + sub(rSpecIdxAndAfterAxIterB, rSpecIdxAndAfterAxSizeB); + + vpmulld(vAux0, vmmSpecIdxB, vmmAfterAxisSize); + uni_vpaddd(vAux0, vAux0, vmmAfterAxisIdxB); + Xbyak::Xmm& xAux0 = xmmAuxContainer[vAux0.getIdx()]; + uni_vpbroadcastd(vAux1, xAux0); + if (isa == x64::avx512_core) { + Xbyak::Opmask kMask0 = Xbyak::Opmask(kAuxMask0.getIdx()); + vpcmpgtd(kMask0, vAux1, vAux0); uni_vmovups(vAux1, vmmSrcBeforeAxisSumB); + uni_vpaddd(vAux1 | kMask0, vmmSrcBeforeAxisSumB, vmmAxisAndAfterAxisSizeB); + } else { + vpcmpgtd(vAux1, vAux1, vAux0); + vpand(vAux1, vAux1, vmmAxisAndAfterAxisSizeB); + uni_vpaddd(vAux1, vmmSrcBeforeAxisSumB, vAux1); + } + 
uni_vpaddd(vmmSrcBeforeAxisSumB, vmmSrcBeforeAxisSumB, vmmAxisAndAfterAxisSizeB); + jmp(lBeforeAxStepEnd); + L(lBeforeAxStep); + uni_vmovups(vAux1, vmmSrcBeforeAxisSumB); L(lBeforeAxStepEnd); } } else { @@ -648,10 +684,10 @@ void jitUniGatherKernel::calcSrcShiftShortBlock(Vmm* vAuxPool, bool shiftFi add(rSpecIdxAndAfterAxIterB, idxElPerVec * jcp.dataTypeSize); cmp(rSpecIdxAndAfterAxIterB, rSpecIdxAndAfterAxSizeB); jl(lBeforeAxStepEnd1, T_NEAR); - sub(rSpecIdxAndAfterAxIterB, rSpecIdxAndAfterAxSizeB); + sub(rSpecIdxAndAfterAxIterB, rSpecIdxAndAfterAxSizeB); cmp(rSpecIdxAndAfterAxIterB, 0); jne(lBeforeAxStepEnd1, T_NEAR); - uni_vpaddd(vmmSrcBeforeAxisSumB, vmmSrcBeforeAxisSumB, vmmAxisAndAfterAxisSizeB); + uni_vpaddd(vmmSrcBeforeAxisSumB, vmmSrcBeforeAxisSumB, vmmAxisAndAfterAxisSizeB); L(lBeforeAxStepEnd1); } } @@ -689,15 +725,15 @@ void jitUniGatherKernel::process(bool isShortIdx, bool blocked) { Xbyak::Label lTailProc, lEndProc; cmp(regWorkAmount, dataElPerVec); jl(lTailProc, T_NEAR); - if (jcp.dataTypeSize == 4) - process32b(isShortIdx, blocked); - else if (jcp.dataTypeSize == 2) - process16b(isShortIdx, blocked); - else if (jcp.dataTypeSize == 1) - process8b(isShortIdx, blocked); + if (jcp.dataTypeSize == 4) + process32b(isShortIdx, blocked); + else if (jcp.dataTypeSize == 2) + process16b(isShortIdx, blocked); + else if (jcp.dataTypeSize == 1) + process8b(isShortIdx, blocked); jmp(lEndProc, T_NEAR); L(lTailProc); - tail(isShortIdx, false, blocked); + tail(isShortIdx, false, blocked); L(lEndProc); } @@ -735,11 +771,11 @@ void jitUniGatherKernel::process16b(bool isShortIdx, bool blocked) { if (isa == x64::avx512_core) { vPermMask = vmmAuxContainer[7]; vShufMask = vmmAuxContainer[8]; - vBuff0 = vmmAuxContainer[9]; + vBuff0 = vmmAuxContainer[9]; } else { vPermMask = vmmAuxContainer[1]; vShufMask = vmmAuxContainer[4]; - vBuff0 = vmmAuxContainer[5]; + vBuff0 = vmmAuxContainer[5]; } mov(regAux1, reinterpret_cast(shufMask16bitUni)); @@ -799,13 +835,13 @@ void jitUniGatherKernel::process8b(bool isShortIdx, bool blocked) { if (isa == x64::avx512_core) { vPermMask = vmmAuxContainer[7]; vShufMask = vmmAuxContainer[8]; - vBuff0 = vmmAuxContainer[9]; - vBuff1 = vmmAuxContainer[10]; + vBuff0 = vmmAuxContainer[9]; + vBuff1 = vmmAuxContainer[10]; } else { vPermMask = vmmAuxContainer[1]; vShufMask = vmmAuxContainer[4]; - vBuff0 = vmmAuxContainer[5]; - vBuff1 = vmmAuxContainer[6]; + vBuff0 = vmmAuxContainer[5]; + vBuff1 = vmmAuxContainer[6]; } mov(regAux1, reinterpret_cast(shufMask8bitUni)); uni_vmovups(vShufMask, ptr[regAux1]); @@ -951,24 +987,30 @@ void jitUniGatherKernel::tail(bool isShortIdx, bool shiftFirst, bool blocke } template <> -void jitUniGatherKernel::fillRestWorkMask(Vmask& kDstMask, Vmm& vmmAux, const Xbyak::Reg64& rWorkRest, - const Xbyak::Reg64& rAux0, const Xbyak::Reg64& rAux1) { +void jitUniGatherKernel::fillRestWorkMask(Vmask& kDstMask, + Vmm& vmmAux, + const Xbyak::Reg64& rWorkRest, + const Xbyak::Reg64& rAux0, + const Xbyak::Reg64& rAux1) { Xbyak::Label lKmov; Xbyak::Reg32 rOnes(rAux1.getIdx()); mov(rOnes, 0x0000FFFF); cmp(rWorkRest, idxElPerVec); jge(lKmov); - Xbyak::Reg8 rShift(Xbyak::Operand::CL); - mov(rShift, idxElPerVec); - sub(rShift, rWorkRest); - shr(rOnes, rShift); + Xbyak::Reg8 rShift(Xbyak::Operand::CL); + mov(rShift, idxElPerVec); + sub(rShift, rWorkRest); + shr(rOnes, rShift); L(lKmov); kmovw(kDstMask, rOnes); } template <> -void jitUniGatherKernel::fillRestWorkMask(Vmask& kDstMask, Vmm& vAux, const Xbyak::Reg64& rWorkRest, - const Xbyak::Reg64& rAux0, const 
Xbyak::Reg64& rAux1) { +void jitUniGatherKernel::fillRestWorkMask(Vmask& kDstMask, + Vmm& vAux, + const Xbyak::Reg64& rWorkRest, + const Xbyak::Reg64& rAux0, + const Xbyak::Reg64& rAux1) { Xbyak::Label lEnd; mov(rAux0, rWorkRest); Xbyak::Reg32 rOnes(rAux1.getIdx()); @@ -990,7 +1032,10 @@ void jitUniGatherKernel::fillRestWorkMask(Vmask& kDstMask, Vmm& vAux, } template -void jitUniGatherKernel::storeVectorPart(const Xbyak::Reg64& rDst, const Xbyak::Reg64& rToStoreCounter, Vmm& vmmSrc, Vmm& vAux) { +void jitUniGatherKernel::storeVectorPart(const Xbyak::Reg64& rDst, + const Xbyak::Reg64& rToStoreCounter, + Vmm& vmmSrc, + Vmm& vAux) { Xbyak::Label lEnd; Xbyak::Xmm xAux(vAux.getIdx()); for (size_t j = 0; j < vlen / vlenXmm; j++) { @@ -1025,7 +1070,7 @@ void jitUniGatherKernel::fillVlenVector() { template <> void jitUniGatherKernel::fillVlenVector() { vpcmpeqd(vmmVecLenB, vmmVecLenB, vmmVecLenB); - vpsrld(vmmVecLenB, vmmVecLenB, 31); // Right shift to 1. + vpsrld(vmmVecLenB, vmmVecLenB, 31); // Right shift to 1. uni_vpslld(vmmVecLenB, vmmVecLenB, 5); // Left shift to 32. } @@ -1047,5 +1092,5 @@ bool jitUniGatherKernel::isSupportedConfiguration(uint64_t afterAxisSize) { template struct jitUniGatherKernel; template struct jitUniGatherKernel; -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/kernels/x64/gather_uni_kernel.hpp b/src/plugins/intel_cpu/src/nodes/kernels/x64/gather_uni_kernel.hpp index 765efb17d091e2..de8cda30d06499 100644 --- a/src/plugins/intel_cpu/src/nodes/kernels/x64/gather_uni_kernel.hpp +++ b/src/plugins/intel_cpu/src/nodes/kernels/x64/gather_uni_kernel.hpp @@ -19,12 +19,11 @@ // 1 | X | X | X | X | X | X | //-------------------------------------------------------------- - #pragma once -#include "jit_kernel_base.hpp" #include "cpu/x64/jit_generator.hpp" #include "dnnl_types.h" +#include "jit_kernel_base.hpp" namespace ov { namespace intel_cpu { @@ -71,8 +70,8 @@ struct gatherJitExecArgs { }; struct jitGatherKernelBase { - void (*ker_)(const gatherJitExecArgs *); - void operator()(const gatherJitExecArgs *args) { + void (*ker_)(const gatherJitExecArgs*); + void operator()(const gatherJitExecArgs* args) { assert(ker_); ker_(args); } @@ -120,8 +119,10 @@ struct jitUniGatherKernel : public jitGatherKernelBase, public dnnl::impl::cpu:: bool isSupportedConfiguration(uint64_t afterAxisSize) override; protected: - using Vmm = typename dnnl::impl::utils::conditional::type; - using Vmask = typename dnnl::impl::utils::conditional::type; + using Vmm = + typename dnnl::impl::utils::conditional::type; + using Vmask = + typename dnnl::impl::utils::conditional::type; static const uint32_t vlenXmm = dnnl::impl::cpu::x64::cpu_isa_traits::vlen; static const uint32_t indicesTypeSize = sizeof(uint32_t); static const uint8_t idxTypeShift = 2; @@ -155,7 +156,8 @@ struct jitUniGatherKernel : public jitGatherKernelBase, public dnnl::impl::cpu:: // Masks pool. Do not use k0 with gather instruction! Vmask masksContainer[8] = {Vmask(0), Vmask(1), Vmask(2), Vmask(3), Vmask(4), Vmask(5), Vmask(6), Vmask(7)}; // Auxiliary pool. - Vmm vmmAuxContainer[12] = {Vmm(0), Vmm(1), Vmm(2), Vmm(3), Vmm(4), Vmm(5), Vmm(6), /*AVX5*/ Vmm(16), Vmm(17), Vmm(18), Vmm(19), Vmm(20)}; + Vmm vmmAuxContainer[12] = + {Vmm(0), Vmm(1), Vmm(2), Vmm(3), Vmm(4), Vmm(5), Vmm(6), /*AVX5*/ Vmm(16), Vmm(17), Vmm(18), Vmm(19), Vmm(20)}; // Common. 
Vmm vmmZeros = Vmm(7); Vmm vmmSrcBeforeAxisSumB = Vmm(8); @@ -165,13 +167,13 @@ struct jitUniGatherKernel : public jitGatherKernelBase, public dnnl::impl::cpu:: Vmm vmmAxisAndAfterAxisSizeB = Vmm(12); // Only short. - Vmm vmmSrcAfterBatchSizeB = Vmm(13); - Vmm vmmPermIdxMask = Vmm(14); + Vmm vmmSrcAfterBatchSizeB = Vmm(13); + Vmm vmmPermIdxMask = Vmm(14); Vmm& vmmBeforeAxDiffB = vmmAxisAndAfterAxisSizeB; // Blocked short. Vmm& vmmSpecIdxDiff = vmmAuxContainer[4]; Vmm& vmmAfterAxisSize = vmmAuxContainer[5]; - Vmm vmmAfterAxisIdxB = Vmm(15); + Vmm vmmAfterAxisIdxB = Vmm(15); Vmm& vmmAfterAxisPermMask = vmmPermIdxMask; Vmm& vmmBeforeAxPermMask = vmmAuxContainer[6]; // Only long. @@ -179,13 +181,13 @@ struct jitUniGatherKernel : public jitGatherKernelBase, public dnnl::impl::cpu:: Vmm vmmIdxBatchSumB = Vmm(14); // XMM - Xbyak::Xmm xmmAuxContainer[6] = {Xbyak::Xmm(0), Xbyak::Xmm(1), Xbyak::Xmm(2), Xbyak::Xmm(3), Xbyak::Xmm(4), Xbyak::Xmm(16)}; + Xbyak::Xmm xmmAuxContainer[6] = + {Xbyak::Xmm(0), Xbyak::Xmm(1), Xbyak::Xmm(2), Xbyak::Xmm(3), Xbyak::Xmm(4), Xbyak::Xmm(16)}; Xbyak::Xmm xmmZeros = Xbyak::Xmm(vmmZeros.getIdx()); Xbyak::Xmm xmmSrcBeforeAxisSum = Xbyak::Xmm(vmmSrcBeforeAxisSumB.getIdx()); Xbyak::Xmm xmmSpecIdxSizeB = Xbyak::Xmm(vmmSpecIdxSizeB.getIdx()); Xbyak::Xmm xmmSpecIdxB = Xbyak::Xmm(vmmSpecIdxB.getIdx()); - void calcSrcShiftLong(Vmm* vAuxPool, bool shiftFirst = true); void calcSrcShiftLongBlock(Vmm* vAuxPool, bool shiftFirst = true); void calcSrcShiftShort(Vmm* vAuxPool, bool shiftFirst = true); @@ -199,7 +201,11 @@ struct jitUniGatherKernel : public jitGatherKernelBase, public dnnl::impl::cpu:: // Aux functions. void normalizeRawIndices(Vmm& rawIndices, Vmask& dstMask, Vmask& aux); void normWithUpperBound(Vmm& vTarget, Vmm& vMax, Vmask& kAuxMask); - void fillRestWorkMask(Vmask& kMask, Vmm& vAux, const Xbyak::Reg64& rWorkRest, const Xbyak::Reg64& rAux0, const Xbyak::Reg64& rAux1); + void fillRestWorkMask(Vmask& kMask, + Vmm& vAux, + const Xbyak::Reg64& rWorkRest, + const Xbyak::Reg64& rAux0, + const Xbyak::Reg64& rAux1); void storeVectorPart(const Xbyak::Reg64& rDst, const Xbyak::Reg64& rToStoreCounter, Vmm& vmmSrc, Vmm& vAux); void uniVpGatherDd(Vmm& vDst, const Xbyak::Address& srcAddr, Vmask& vMask); void fillVlenVector(); @@ -208,5 +214,5 @@ struct jitUniGatherKernel : public jitGatherKernelBase, public dnnl::impl::cpu:: const unsigned* permMask16bitUni; }; -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/kernels/x64/grid_sample.cpp b/src/plugins/intel_cpu/src/nodes/kernels/x64/grid_sample.cpp index d91688689b86c0..908de00cbb0534 100644 --- a/src/plugins/intel_cpu/src/nodes/kernels/x64/grid_sample.cpp +++ b/src/plugins/intel_cpu/src/nodes/kernels/x64/grid_sample.cpp @@ -13,8 +13,8 @@ namespace kernel { #define GET_OFF(field) offsetof(GridSamplesKernelExecArgs, field) template -GridSampleKernel::GridSampleKernel(const GridSampleKernelConfParams& jcp) : - GridSampleKernelBase(jit_name(), jcp, isa) { +GridSampleKernel::GridSampleKernel(const GridSampleKernelConfParams& jcp) + : GridSampleKernelBase(jit_name(), jcp, isa) { vlen = x64::cpu_isa_traits::vlen; dataTypeSize = jcp.inDataPrc.size(); gridTypeSize = jcp.gridPrc.size(); @@ -39,15 +39,15 @@ void GridSampleKernel::generate() { this->preamble(); registersPool = RegistersPool::create(isa, {rax, rcx, rsp, rdi, k0}); - regSrc = getReg64(); + regSrc = getReg64(); regGrid = getReg64(); - regDst = getReg64(); + regDst = getReg64(); 
regSrcChannelStepB = getReg64(); regDstChannelStepB = getReg64(); - mov(regSrc, ptr[regParams + GET_OFF(src)]); + mov(regSrc, ptr[regParams + GET_OFF(src)]); mov(regGrid, ptr[regParams + GET_OFF(grid)]); - mov(regDst, ptr[regParams + GET_OFF(dst)]); + mov(regDst, ptr[regParams + GET_OFF(dst)]); mov(regSrcChannelStepB, ptr[regParams + GET_OFF(srcChannelStepB)]); mov(regDstChannelStepB, ptr[regParams + GET_OFF(dstChannelStepB)]); @@ -82,7 +82,7 @@ void GridSampleKernel::initVectors() { if (one_of(jcp.interpolationMode, GridSampleInterpolationMode::BICUBIC, GridSampleInterpolationMode::BILINEAR)) { vOnesF = getVmm(); - mov(r32Aux, 0x3f800000); // 1.f + mov(r32Aux, 0x3f800000); // 1.f vpbroadcastd(vOnesF, r32Aux); } @@ -96,11 +96,11 @@ void GridSampleKernel::initVectors() { uni_vpbroadcastd(vHDenormCoefF, ptr[rAux]); } else { vHalfF = getVmm(); - mov(r32Aux, 0x3f000000); // 0.5f + mov(r32Aux, 0x3f000000); // 0.5f vpbroadcastd(vHalfF, r32Aux); } - static const unsigned gridPermMask[16] = { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 }; + static const unsigned gridPermMask[16] = {0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15}; mov(rAux, reinterpret_cast(gridPermMask)); vGridPermMask = getVmm(); uni_vmovups(vGridPermMask, ptr[rAux]); @@ -141,24 +141,24 @@ void GridSampleKernel::initVectors() { if (jcp.interpolationMode == GridSampleInterpolationMode::BICUBIC) { vConst_0_75 = getVmm(); - mov(r32Aux, 0xbf400000); // -0.75f + mov(r32Aux, 0xbf400000); // -0.75f vpbroadcastd(vConst_0_75, r32Aux); vConst_1_25 = getVmm(); - mov(r32Aux, 0x3fa00000); // 1.25f + mov(r32Aux, 0x3fa00000); // 1.25f vpbroadcastd(vConst_1_25, r32Aux); vConst_1_50 = getVmm(); - mov(r32Aux, 0x3fc00000); // 1.5f + mov(r32Aux, 0x3fc00000); // 1.5f vpbroadcastd(vConst_1_50, r32Aux); vConst_2_00 = getVmm(); - mov(r32Aux, 0x40000000); // 2.0f + mov(r32Aux, 0x40000000); // 2.0f vpbroadcastd(vConst_2_00, r32Aux); vConst_2_25 = getVmm(); - mov(r32Aux, 0x40100000); // 2.25f + mov(r32Aux, 0x40100000); // 2.25f vpbroadcastd(vConst_2_25, r32Aux); } } -template // Works for AVX2, AVX, SSE41 +template // Works for AVX2, AVX, SSE41 void GridSampleKernel::initVectors() { auto rAux = getReg64(); @@ -167,9 +167,10 @@ void GridSampleKernel::initVectors() { uni_vmovups(vSrcWidthF, ptr[rAux]); if (one_of(jcp.interpolationMode, GridSampleInterpolationMode::BILINEAR, GridSampleInterpolationMode::NEAREST) || - (jcp.interpolationMode == GridSampleInterpolationMode::BICUBIC && (jcp.paddingMode == GridSamplePaddingMode::REFLECTION || - (jcp.paddingMode == GridSamplePaddingMode::BORDER && !jcp.alignCorners) || - jcp.paddingMode == GridSamplePaddingMode::ZEROS)) ) { + (jcp.interpolationMode == GridSampleInterpolationMode::BICUBIC && + (jcp.paddingMode == GridSamplePaddingMode::REFLECTION || + (jcp.paddingMode == GridSamplePaddingMode::BORDER && !jcp.alignCorners) || + jcp.paddingMode == GridSamplePaddingMode::ZEROS))) { vSrcHeightF = getVmm(); mov(rAux, ptr[regParams + GET_OFF(srcHeightF)]); uni_vmovups(vSrcHeightF, ptr[rAux]); @@ -184,7 +185,8 @@ void GridSampleKernel::initVectors() { if (jcp.interpolationMode != GridSampleInterpolationMode::BICUBIC) { if (one_of(jcp.paddingMode, GridSamplePaddingMode::BORDER, GridSamplePaddingMode::ZEROS) && - ((isa == x64::avx2 && jcp.interpolationMode == GridSampleInterpolationMode::NEAREST) || one_of(isa, x64::avx, x64::sse41))) { + ((isa == x64::avx2 && jcp.interpolationMode == GridSampleInterpolationMode::NEAREST) || + one_of(isa, x64::avx, x64::sse41))) { vZeros = getVmm(); uni_vpxor(vZeros, vZeros, 
vZeros); } @@ -193,20 +195,21 @@ void GridSampleKernel::initVectors() { mov(rAux, ptr[regParams + GET_OFF(wDenormCoefF)]); vWDenormCoefF = getVmm(); uni_vmovups(vWDenormCoefF, ptr[rAux]); - if (!(jcp.interpolationMode == GridSampleInterpolationMode::BILINEAR && jcp.paddingMode == GridSamplePaddingMode::ZEROS)) { + if (!(jcp.interpolationMode == GridSampleInterpolationMode::BILINEAR && + jcp.paddingMode == GridSamplePaddingMode::ZEROS)) { mov(rAux, ptr[regParams + GET_OFF(hDenormCoefF)]); vHDenormCoefF = getVmm(); uni_vmovups(vHDenormCoefF, ptr[rAux]); } } else { - static const float halfArr[8] = { 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f }; + static const float halfArr[8] = {0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f}; mov(rAux, reinterpret_cast(halfArr)); vHalfF = getVmm(); uni_vmovups(vHalfF, ptr[rAux]); } if (isa == x64::avx2 && jcp.interpolationMode == GridSampleInterpolationMode::NEAREST) { - static const unsigned gridPermMask[8] = { 0, 2, 4, 6, 1, 3, 5, 7 }; + static const unsigned gridPermMask[8] = {0, 2, 4, 6, 1, 3, 5, 7}; mov(rAux, reinterpret_cast(gridPermMask)); vGridPermMask = getVmm(); uni_vmovups(vGridPermMask, ptr[rAux]); @@ -214,15 +217,16 @@ void GridSampleKernel::initVectors() { } if (jcp.interpolationMode == GridSampleInterpolationMode::BICUBIC || - (jcp.interpolationMode == GridSampleInterpolationMode::BILINEAR && jcp.paddingMode != GridSamplePaddingMode::ZEROS)) { - static const float onesArr[8] = { 1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f }; + (jcp.interpolationMode == GridSampleInterpolationMode::BILINEAR && + jcp.paddingMode != GridSamplePaddingMode::ZEROS)) { + static const float onesArr[8] = {1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f}; mov(rAux, reinterpret_cast(onesArr)); vOnesF = getVmm(); uni_vmovups(vOnesF, ptr[rAux]); } } -template // Works for AVX512, AVX2, AVX, SSE41 +template // Works for AVX512, AVX2, AVX, SSE41 void GridSampleKernel::process() { regWorkAmount = getReg64(); @@ -244,12 +248,12 @@ void GridSampleKernel::process() { spatialLoop(); if (jcp.dynamicShapes) { - add(regSrc, ptr[regParams + GET_OFF(srcBatchStepB)]); + add(regSrc, ptr[regParams + GET_OFF(srcBatchStepB)]); } else { add(regSrc, jcp.srcBatchStepB); } add(regGrid, ptr[regParams + GET_OFF(gridBatchStepB)]); - add(regDst, ptr[regParams + GET_OFF(dstBatchStepB)]); + add(regDst, ptr[regParams + GET_OFF(dstBatchStepB)]); if (jcp.dynamicBatch) { dec(regBatch); @@ -259,7 +263,7 @@ void GridSampleKernel::process() { } } -template // Works for AVX512, AVX2, AVX, SSE41 +template // Works for AVX512, AVX2, AVX, SSE41 void GridSampleKernel::spatialLoop() { auto vHCoord = getVmm(); auto vWCoord = getVmm(); @@ -286,7 +290,7 @@ void GridSampleKernel::spatialLoop() { tail(); } -template // Works for AVX512, AVX2, AVX, SSE41 +template // Works for AVX512, AVX2, AVX, SSE41 void GridSampleKernel::interpolation(const Vmm& vWCoord, const Vmm& vHCoord, bool tail) { if (jcp.interpolationMode == GridSampleInterpolationMode::BILINEAR) { bilinearInterpolation(vWCoord, vHCoord, tail); @@ -297,7 +301,7 @@ void GridSampleKernel::interpolation(const Vmm& vWCoord, const Vmm& vHCoord } } -template // Works for AVX512, AVX2, AVX, SSE41 +template // Works for AVX512, AVX2, AVX, SSE41 void GridSampleKernel::tail() { Xbyak::Label lEnd; cmp(regWorkAmount, 0); @@ -311,7 +315,7 @@ void GridSampleKernel::tail() { interpolation(vWCoord, vHCoord, true); if (dataTypeSize > 1) - sal(regWorkAmount, dataTypeShift); // Multiply by source data type size. + sal(regWorkAmount, dataTypeShift); // Multiply by source data type size. 
add(regDst, regWorkAmount); L(lEnd); @@ -319,15 +323,15 @@ void GridSampleKernel::tail() { template <> void GridSampleKernel::getCoordinates(const Vmm& vHCoord, const Vmm& vWCoord) { - vpermd(vWCoord, vGridPermMask, ptr[regGrid]); // Permute to XXXX.XXXX.YYYY.YYYY - vshuff64x2(vHCoord, vWCoord, vHCoord, 0B11101110); // Extract Y component + vpermd(vWCoord, vGridPermMask, ptr[regGrid]); // Permute to XXXX.XXXX.YYYY.YYYY + vshuff64x2(vHCoord, vWCoord, vHCoord, 0B11101110); // Extract Y component add(regGrid, vlen); auto vAux = getVmm(); - vpermd(vAux, vGridPermMask, ptr[regGrid]); // Permute to XXXX.XXXX.YYYY.YYYY - vshuff64x2(vWCoord, vWCoord, vAux, 0B01000100); // Extract X component - vshuff64x2(vHCoord, vHCoord, vAux, 0B11100100); // Extract Y component + vpermd(vAux, vGridPermMask, ptr[regGrid]); // Permute to XXXX.XXXX.YYYY.YYYY + vshuff64x2(vWCoord, vWCoord, vAux, 0B01000100); // Extract X component + vshuff64x2(vHCoord, vHCoord, vAux, 0B11100100); // Extract Y component add(regGrid, vlen); } @@ -349,19 +353,19 @@ void GridSampleKernel::getCoordinates(const Vmm& vHCoord, const Vmm& uni_vmovups(vPermMask, ptr[rAux]); } - vpermd(vWCoord, vPermMask, ptr[regGrid]); // Permute to XXXX.YYYY - vperm2f128(vHCoord, vHCoord, vWCoord, 0B00000011); // Extract Y component + vpermd(vWCoord, vPermMask, ptr[regGrid]); // Permute to XXXX.YYYY + vperm2f128(vHCoord, vHCoord, vWCoord, 0B00000011); // Extract Y component add(regGrid, vlen); - vpermd(vAux, vPermMask, ptr[regGrid]); // Permute to XXXX.YYYY - vperm2f128(vWCoord, vWCoord, vAux, 0B00100000); // Extract X component - vperm2f128(vHCoord, vHCoord, vAux, 0B00110000); // Extract Y component + vpermd(vAux, vPermMask, ptr[regGrid]); // Permute to XXXX.YYYY + vperm2f128(vWCoord, vWCoord, vAux, 0B00100000); // Extract X component + vperm2f128(vHCoord, vHCoord, vAux, 0B00110000); // Extract Y component add(regGrid, vlen); } -template // Works for AVX, SSE41 +template // Works for AVX, SSE41 void GridSampleKernel::getCoordinates(const Vmm& vHCoord, const Vmm& vWCoord) { auto vAux = getVmm(); Xbyak::Xmm xmmWCoord(vWCoord.getIdx()); @@ -417,12 +421,12 @@ void GridSampleKernel::getTailCoordinates(const Vmm& vHCoord, auto rAux = getReg64(); mov(rAux, regWorkAmount); - sal(rAux, 0x1); // Multiply by gridShape[3]. + sal(rAux, 0x1); // Multiply by gridShape[3]. 
cmp(regWorkAmount, dataElPerVec / 2); jl(lRest, T_NEAR); { vpermd(vWCoord, vGridPermMask, ptr[regGrid]); - vshuff64x2(vHCoord, vWCoord, vHCoord, 0B11101110); // Extract Y component + vshuff64x2(vHCoord, vWCoord, vHCoord, 0B11101110); // Extract Y component add(regGrid, vlen); sub(rAux, dataElPerVec); @@ -433,8 +437,8 @@ void GridSampleKernel::getTailCoordinates(const Vmm& vHCoord, uni_vmovups((Vmm)vAux | kTailMask, ptr[regGrid]); vpermd(vAux, vGridPermMask, vAux); Xbyak::Ymm ymmAux(vAux.getIdx()); - vshuff64x2(vWCoord, vWCoord, vAux, 0B01000100); // Extract X component - vshuff64x2(vHCoord, vHCoord, vAux, 0B11100100); // Extract Y component + vshuff64x2(vWCoord, vWCoord, vAux, 0B01000100); // Extract X component + vshuff64x2(vHCoord, vHCoord, vAux, 0B11100100); // Extract Y component jmp(lGridShift, T_NEAR); } @@ -443,12 +447,12 @@ void GridSampleKernel::getTailCoordinates(const Vmm& vHCoord, fillRestWorkMask(kTailMask, rAux); uni_vmovups(vWCoord | kTailMask, ptr[regGrid]); vpermd(vWCoord, vGridPermMask, vWCoord); - vshuff64x2(vHCoord, vWCoord, vHCoord, 0B11101110); // Extract Y component + vshuff64x2(vHCoord, vWCoord, vHCoord, 0B11101110); // Extract Y component } L(lGridShift); if (dataTypeSize > 1) - sal(rAux, dataTypeShift); // Multiply by source data type size. + sal(rAux, dataTypeShift); // Multiply by source data type size. add(regGrid, rAux); L(lEnd); @@ -475,36 +479,36 @@ void GridSampleKernel::getTailCoordinates(const Vmm& vHCoord, const V } mov(rAux, regWorkAmount); - sal(rAux, 0x1); // multiply by gridShape[3] == 2 + sal(rAux, 0x1); // multiply by gridShape[3] == 2 cmp(regWorkAmount, dataElPerVec / 2); jl(lRest, T_NEAR); { - vpermd(vWCoord, vPermMask, ptr[regGrid]); // Permute to XXXX.YYYY - vperm2f128(vHCoord, vHCoord, vWCoord, 0B00000011); // Extract Y component + vpermd(vWCoord, vPermMask, ptr[regGrid]); // Permute to XXXX.YYYY + vperm2f128(vHCoord, vHCoord, vWCoord, 0B00000011); // Extract Y component add(regGrid, vlen); sub(rAux, dataElPerVec); cmp(rAux, 0); jle(lEnd, T_NEAR); - auto vAux = getVmm(); + auto vAux = getVmm(); load(vAux, ptr[regGrid], rAux, dataTypeSize); vpermd(vAux, vPermMask, vAux); - vperm2f128(vWCoord, vWCoord, vAux, 0B00100000); // Extract X component - vperm2f128(vHCoord, vHCoord, vAux, 0B00110000); // Extract Y component + vperm2f128(vWCoord, vWCoord, vAux, 0B00100000); // Extract X component + vperm2f128(vHCoord, vHCoord, vAux, 0B00110000); // Extract Y component jmp(lGridShift, T_NEAR); } L(lRest); { load(vWCoord, ptr[regGrid], rAux, dataTypeSize); - vpermd(vWCoord, vPermMask, vWCoord); // Permute to XXXX.YYYY - vperm2f128(vHCoord, vHCoord, vWCoord, 0B00000011); // Extract Y component + vpermd(vWCoord, vPermMask, vWCoord); // Permute to XXXX.YYYY + vperm2f128(vHCoord, vHCoord, vWCoord, 0B00000011); // Extract Y component } L(lGridShift); if (dataTypeSize > 1) - sal(rAux, dataTypeShift); // Multiply by source data type size. + sal(rAux, dataTypeShift); // Multiply by source data type size. 
add(regGrid, rAux); L(lEnd); @@ -519,7 +523,7 @@ void GridSampleKernel::getTailCoordinates(const Vmm& vHCoord, const Vm auto rGridRest = getReg64(); mov(rGridRest, regWorkAmount); - sal(rGridRest, 0x1); // multiply by gridShape[3] == 2 + sal(rGridRest, 0x1); // multiply by gridShape[3] == 2 for (size_t i = 0; i < dataElPerVec; i++) { cmp(rGridRest, 0); @@ -566,7 +570,7 @@ void GridSampleKernel::getTailCoordinates(const Vmm& vHCoord, const auto rAux = getReg64(); mov(rAux, regWorkAmount); - sal(rAux, 0x1); // Multiply by gridShape[3] == 2 + sal(rAux, 0x1); // Multiply by gridShape[3] == 2 cmp(regWorkAmount, dataElPerVec / 2); jl(lRest, T_NEAR); { @@ -584,31 +588,31 @@ void GridSampleKernel::getTailCoordinates(const Vmm& vHCoord, const auto vAux = getVmm(); load(vAux, ptr[regGrid], rAux, dataTypeSize); pshufd(vAux, vAux, 0B11011000); - shufpd(vWCoord, vAux, 0x0); // Extract X component - shufpd(vHCoord, vAux, 0B00000011); // Extract Y component + shufpd(vWCoord, vAux, 0x0); // Extract X component + shufpd(vHCoord, vAux, 0B00000011); // Extract Y component jmp(lGridShift, T_NEAR); L(lHShuf); - shufpd(vHCoord, vHCoord, 0B00000001); // Extract Y component + shufpd(vHCoord, vHCoord, 0B00000001); // Extract Y component jmp(lEnd, T_NEAR); } L(lRest); { load(vWCoord, ptr[regGrid], rAux, dataTypeSize); - pshufd(vWCoord, vWCoord, 0B11011000); // Extract X component - shufpd(vHCoord, vWCoord, 0B00000010); // Extract Y component + pshufd(vWCoord, vWCoord, 0B11011000); // Extract X component + shufpd(vHCoord, vWCoord, 0B00000010); // Extract Y component shufpd(vHCoord, vHCoord, 0B00000001); } L(lGridShift); if (dataTypeSize > 1) - sal(rAux, dataTypeShift); // Multiply by source data type size. + sal(rAux, dataTypeShift); // Multiply by source data type size. add(regGrid, rAux); L(lEnd); } -template // Works for AVX512, AVX2, AVX, SSE41 +template // Works for AVX512, AVX2, AVX, SSE41 void GridSampleKernel::denormalizeRawCoordinates(const Vmm& vWCoord, const Vmm& vHCoord) { if (jcp.alignCorners) { if (vWDenormCoefF.isInitialized()) { @@ -640,7 +644,7 @@ void GridSampleKernel::denormalizeRawCoordinates(const Vmm& vWCoord, const halfHolder = getVmm(); vHalfTmp = halfHolder; static const float halfValues[x64::cpu_isa_traits::vlen / sizeof(float)] = - { 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f }; + {0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f}; mov(rAux, reinterpret_cast(halfValues)); uni_vmovups(vHalfTmp, ptr[rAux]); } @@ -671,14 +675,14 @@ void GridSampleKernel::denormalizeRawCoordinates(const Vmm& vWCoord, const template <> void GridSampleKernel::zerosPaddingW(const Vmask& kDst, const Vmm& vCoord) { - vcmpps(kDst, vCoord, vSrcWidthF, CMP_LT_PS); // vCoord < vUpperBound - vcmpps(kDst | kDst, vZeros, vCoord, CMP_LE_PS); // vCoord >= vZeros + vcmpps(kDst, vCoord, vSrcWidthF, CMP_LT_PS); // vCoord < vUpperBound + vcmpps(kDst | kDst, vZeros, vCoord, CMP_LE_PS); // vCoord >= vZeros } template <> void GridSampleKernel::zerosPaddingH(const Vmask& kDst, const Vmm& vCoord, const Vmask& kMaskW) { - vcmpps(kDst | kMaskW, vCoord, vSrcHeightF, CMP_LT_PS); // vCoord < vUpperBound - vcmpps(kDst | kDst, vZeros, vCoord, CMP_LE_PS); // vCoord >= vZeros + vcmpps(kDst | kMaskW, vCoord, vSrcHeightF, CMP_LT_PS); // vCoord < vUpperBound + vcmpps(kDst | kDst, vZeros, vCoord, CMP_LE_PS); // vCoord >= vZeros } template <> @@ -692,7 +696,7 @@ void GridSampleKernel::zerosPaddingW(const Vmask& kDst, const Vmm& v auto vAux = 
getVmm(); if (vSrcWidthF.isInitialized()) { - uni_vcmpps(vAux, vWCoord, vSrcWidthF, CMP_LT_PS); // vWCoord < vSrcWidthF + uni_vcmpps(vAux, vWCoord, vSrcWidthF, CMP_LT_PS); // vWCoord < vSrcWidthF } else { auto rAux = getReg64(); mov(rAux, ptr[regParams + GET_OFF(srcWidthF)]); @@ -700,8 +704,8 @@ void GridSampleKernel::zerosPaddingW(const Vmask& kDst, const Vmm& v } uni_vpxor(kDst, kDst, kDst); - uni_vcmpps(kDst, kDst, vWCoord, CMP_LE_PS); // vWCoord >= vZeros - uni_vpand(kDst, kDst, vAux); // vZeros <= vWCoord < vSrcWidthF + uni_vcmpps(kDst, kDst, vWCoord, CMP_LE_PS); // vWCoord >= vZeros + uni_vpand(kDst, kDst, vAux); // vZeros <= vWCoord < vSrcWidthF } template <> @@ -709,18 +713,18 @@ void GridSampleKernel::zerosPaddingH(const Vmask& kDst, const Vmm& v auto vAux = getVmm(); if (vSrcHeightF.isInitialized()) { - uni_vcmpps(vAux, vHCoord, vSrcHeightF, CMP_LT_PS); // vHCoord < vSrcHeightF + uni_vcmpps(vAux, vHCoord, vSrcHeightF, CMP_LT_PS); // vHCoord < vSrcHeightF } else { auto rAux = getReg64(); mov(rAux, ptr[regParams + GET_OFF(srcHeightF)]); - uni_vcmpps(vAux, vHCoord, ptr[rAux], CMP_LT_PS); // vHCoord < vSrcHeightF + uni_vcmpps(vAux, vHCoord, ptr[rAux], CMP_LT_PS); // vHCoord < vSrcHeightF } uni_vmovups(kDst, kMaskW); - uni_vpand(kDst, kDst, vAux); // vHCoord < vSrcHeightF && vZeros <= vWCoord < vSrcWidthF + uni_vpand(kDst, kDst, vAux); // vHCoord < vSrcHeightF && vZeros <= vWCoord < vSrcWidthF uni_vpxor(vAux, vAux, vAux); - uni_vcmpps(vAux, vAux, vHCoord, CMP_LE_PS); // vHCoord >= vZeros - uni_vpand(kDst, kDst, vAux); // vZeros <= vHCoord < vSrcHeightF && vZeros <= vWCoord < vSrcWidthF + uni_vcmpps(vAux, vAux, vHCoord, CMP_LE_PS); // vHCoord >= vZeros + uni_vpand(kDst, kDst, vAux); // vZeros <= vHCoord < vSrcHeightF && vZeros <= vWCoord < vSrcWidthF } template <> @@ -729,7 +733,7 @@ void GridSampleKernel::zerosPadding(const Vmask& kDst, const Vmm& vH zerosPaddingH(kDst, vHCoord, kDst); } -template // Works for AVX2, AVX +template // Works for AVX2, AVX void GridSampleKernel::zerosPaddingW(const Vmask& kDst, const Vmm& vCoord) { auto vAux = getVmm(); Vmm vZerosTmp; @@ -743,18 +747,18 @@ void GridSampleKernel::zerosPaddingW(const Vmask& kDst, const Vmm& vCoord) } if (vSrcWidthF.isInitialized()) { - uni_vcmpps(vAux, vCoord, vSrcWidthF, CMP_LT_PS); // vWCoord < vSrcWidthF + uni_vcmpps(vAux, vCoord, vSrcWidthF, CMP_LT_PS); // vWCoord < vSrcWidthF } else { auto rAux = getReg64(); mov(rAux, ptr[regParams + GET_OFF(srcWidthF)]); uni_vcmpps(vAux, vCoord, ptr[rAux], CMP_LT_PS); // vWCoord < vSrcWidthF } - uni_vcmpps(kDst, vZerosTmp, vCoord, CMP_LE_PS); // vWCoord >= vZeros - uni_vandps(kDst, kDst, vAux); // vZeros <= vWCoord < vSrcWidthF + uni_vcmpps(kDst, vZerosTmp, vCoord, CMP_LE_PS); // vWCoord >= vZeros + uni_vandps(kDst, kDst, vAux); // vZeros <= vWCoord < vSrcWidthF } -template // Works for AVX2, AVX +template // Works for AVX2, AVX void GridSampleKernel::zerosPaddingH(const Vmask& kDst, const Vmm& vCoord, const Vmask& kMaskW) { auto vAux = getVmm(); Vmm vZerosTmp; @@ -768,19 +772,19 @@ void GridSampleKernel::zerosPaddingH(const Vmask& kDst, const Vmm& vCoord, } if (vSrcHeightF.isInitialized()) { - uni_vcmpps(vAux, vCoord, vSrcHeightF, CMP_LT_PS); // vHCoord < vSrcHeightF + uni_vcmpps(vAux, vCoord, vSrcHeightF, CMP_LT_PS); // vHCoord < vSrcHeightF } else { auto rAux = getReg64(); mov(rAux, ptr[regParams + GET_OFF(srcHeightF)]); - uni_vcmpps(vAux, vCoord, ptr[rAux], CMP_LT_PS); // vHCoord < vSrcHeightF + uni_vcmpps(vAux, vCoord, ptr[rAux], CMP_LT_PS); // vHCoord < vSrcHeightF } 
uni_vandps(kDst, kMaskW, vAux); - uni_vcmpps(vAux, vZerosTmp, vCoord, CMP_LE_PS); // vHCoord >= vZeros + uni_vcmpps(vAux, vZerosTmp, vCoord, CMP_LE_PS); // vHCoord >= vZeros uni_vandps(kDst, kDst, vAux); } -template // Works for AVX2, AVX +template // Works for AVX2, AVX void GridSampleKernel::zerosPadding(const Vmask& kDst, const Vmm& vHCoord, const Vmm& vWCoord) { bool releaseZeroVec = false; if (!vZeros.isInitialized()) { @@ -799,11 +803,14 @@ void GridSampleKernel::zerosPadding(const Vmask& kDst, const Vmm& vHCoord, template <> void GridSampleKernel::borderPadding(const Vmm& vCoordDst, const Vmm& vCoordOrigin, const coord dim) { - vrangeps(vCoordDst, vCoordOrigin, dim == coord::w ? vSrcWidthSub1F : vSrcHeightSub1F, 0x0); // vWCoord >= vSrcWidthF - vrangeps(vCoordDst, vCoordDst, vZeros, 0x1); // vWCoord < vZeros + vrangeps(vCoordDst, + vCoordOrigin, + dim == coord::w ? vSrcWidthSub1F : vSrcHeightSub1F, + 0x0); // vWCoord >= vSrcWidthF + vrangeps(vCoordDst, vCoordDst, vZeros, 0x1); // vWCoord < vZeros } -template // Works for AVX2, AVX, SSE41 +template // Works for AVX2, AVX, SSE41 void GridSampleKernel::borderPadding(const Vmm& vCoordDst, const Vmm& vCoordOrigin, const coord dim) { auto rAux = getReg64(); auto vAux = getVmm(); @@ -836,7 +843,7 @@ void GridSampleKernel::borderPadding(const Vmm& vCoordDst, const Vmm& vCoor uni_vaddps(vCoordDst, vCoordDst, vAux); if (vZeros.isInitialized()) { - uni_vcmpps(vAux, vCoordDst, vZeros, 0x6); // vCoord >= vZeros + uni_vcmpps(vAux, vCoordDst, vZeros, 0x6); // vCoord >= vZeros } else { if (isa == x64::sse41) { if (!vAux1.isInitialized()) { @@ -844,27 +851,29 @@ void GridSampleKernel::borderPadding(const Vmm& vCoordDst, const Vmm& vCoor vSub1F = vAux1; } uni_vpxor(vSub1F, vSub1F, vSub1F); - uni_vcmpps(vAux, vCoordDst, vSub1F, 0x6); // vCoord >= vZeros + uni_vcmpps(vAux, vCoordDst, vSub1F, 0x6); // vCoord >= vZeros } else { uni_vpxor(vAux, vAux, vAux); - uni_vcmpps(vAux, vCoordDst, vAux, 0x6); // vCoord >= vZeros + uni_vcmpps(vAux, vCoordDst, vAux, 0x6); // vCoord >= vZeros } } uni_vandps(vCoordDst, vCoordDst, vAux); } template <> -void GridSampleKernel::reflectionPadding(const Vmm& vCoordDst, const Vmm& vCoordOrigin, const coord dim) { +void GridSampleKernel::reflectionPadding(const Vmm& vCoordDst, + const Vmm& vCoordOrigin, + const coord dim) { auto vAux = getVmm(); auto kAux = getMask(); const auto& vSrcDimMul2Sub1F = dim == coord::w ? vSrcWidthMul2Sub1F : vSrcHeightMul2Sub1F; if (jcp.alignCorners) { // abs(x) % D21 - uni_vandps(vCoordDst, vCoordOrigin, vAbsMask); // abs(x) + uni_vandps(vCoordDst, vCoordOrigin, vAbsMask); // abs(x) uni_vdivps(vAux, vCoordDst, vSrcDimMul2Sub1F); - uni_vroundps(vAux, vAux, 0x3); // Truncation - uni_vfnmadd231ps(vCoordDst, vAux, vSrcDimMul2Sub1F); // abs(x) % D21 + uni_vroundps(vAux, vAux, 0x3); // Truncation + uni_vfnmadd231ps(vCoordDst, vAux, vSrcDimMul2Sub1F); // abs(x) % D21 // Check that the result does not exceed the divisor. 
vcmpps(kAux, vSrcDimMul2Sub1F, vCoordDst, CMP_LE_PS); @@ -876,12 +885,12 @@ void GridSampleKernel::reflectionPadding(const Vmm& vCoordDst, if (vCoordDst.getIdx() != vCoordOrigin.getIdx()) uni_vmovups(vCoordDst, vCoordOrigin); uni_vdivps(vAux, vCoordDst, vSrcDimMul2F); - uni_vroundps(vAux, vAux, 0x3); // Truncation - uni_vfnmadd231ps(vCoordDst, vAux, vSrcDimMul2F); // x % D2 - uni_vaddps(vCoordDst, vCoordDst, vSrcDimMul2F); // x % D2 + D2 + uni_vroundps(vAux, vAux, 0x3); // Truncation + uni_vfnmadd231ps(vCoordDst, vAux, vSrcDimMul2F); // x % D2 + uni_vaddps(vCoordDst, vCoordDst, vSrcDimMul2F); // x % D2 + D2 uni_vdivps(vAux, vCoordDst, vSrcDimMul2F); - uni_vroundps(vAux, vAux, 0x3); // Truncation - uni_vfnmadd231ps(vCoordDst, vAux, vSrcDimMul2F); // (x % D2 + D2) % D2 + uni_vroundps(vAux, vAux, 0x3); // Truncation + uni_vfnmadd231ps(vCoordDst, vAux, vSrcDimMul2F); // (x % D2 + D2) % D2 // Check that the result does not exceed the divisor. vcmpps(kAux, vSrcDimMul2F, vCoordDst, CMP_LE_PS); @@ -890,13 +899,13 @@ void GridSampleKernel::reflectionPadding(const Vmm& vCoordDst, } uni_vsubps(vAux, vSrcDimMul2Sub1F, vCoordDst); - vcmpps(kAux, dim == coord::w ? vSrcWidthF : vSrcHeightF, vCoordDst, CMP_LE_PS); // vCoordDst >= vSrcDimF + vcmpps(kAux, dim == coord::w ? vSrcWidthF : vSrcHeightF, vCoordDst, CMP_LE_PS); // vCoordDst >= vSrcDimF uni_vmovups(vCoordDst | kAux, vAux); } -template // Works for AVX2, AVX, SSE41 +template // Works for AVX2, AVX, SSE41 void GridSampleKernel::reflectionPadding(const Vmm& vCoordDst, const Vmm& vCoordOrigin, const coord dim) { - auto rAux = getReg64(); + auto rAux = getReg64(); auto vAux0 = getVmm(); auto vAux1 = getVmm(); @@ -904,14 +913,15 @@ void GridSampleKernel::reflectionPadding(const Vmm& vCoordDst, const Vmm& v // D21 = (Dim - 1) * 2 if (jcp.alignCorners) { // x' = abs(x) % D21 - D21 - static const unsigned absMask[8] = { 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff }; - if (isa ==x64::sse41) { - static const unsigned *absPtr = absMask + (reinterpret_cast(absMask) % 16) / sizeof(unsigned); + static const unsigned absMask[8] = + {0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff}; + if (isa == x64::sse41) { + static const unsigned* absPtr = absMask + (reinterpret_cast(absMask) % 16) / sizeof(unsigned); mov(rAux, reinterpret_cast(absPtr)); } else { mov(rAux, reinterpret_cast(absMask)); } - uni_vandps(vCoordDst, vCoordOrigin, ptr[rAux]); // abs(x) + uni_vandps(vCoordDst, vCoordOrigin, ptr[rAux]); // abs(x) Vmm vMul2Sub1; if (dim == coord::w) { @@ -932,8 +942,8 @@ void GridSampleKernel::reflectionPadding(const Vmm& vCoordDst, const Vmm& v } } uni_vdivps(vAux0, vCoordDst, vMul2Sub1); - uni_vroundps(vAux0, vAux0, 0x3); // Truncation - uni_vfnmadd231ps(vCoordDst, vAux0, vMul2Sub1); // abs(x) % D21 + uni_vroundps(vAux0, vAux0, 0x3); // Truncation + uni_vfnmadd231ps(vCoordDst, vAux0, vMul2Sub1); // abs(x) % D21 // Check that the result does not exceed the divisor. 
uni_vcmpps(vAux0, vCoordDst, vMul2Sub1, CMP_LT_PS); @@ -942,7 +952,7 @@ void GridSampleKernel::reflectionPadding(const Vmm& vCoordDst, const Vmm& v uni_vcmpps(vAux0, vAux0, vCoordDst, CMP_LE_PS); uni_vandps(vCoordDst, vCoordDst, vAux0); - uni_vsubps(vAux0, vCoordDst, vMul2Sub1); // abs(x) % D21 - D21 + uni_vsubps(vAux0, vCoordDst, vMul2Sub1); // abs(x) % D21 - D21 } else { // x' = (x % D2 + D2) % D2 - D21 if (vCoordDst.getIdx() != vCoordOrigin.getIdx()) @@ -966,12 +976,12 @@ void GridSampleKernel::reflectionPadding(const Vmm& vCoordDst, const Vmm& v } } uni_vdivps(vAux0, vCoordOrigin, vMul2); - uni_vroundps(vAux0, vAux0, 0x3); // Truncation - uni_vfnmadd231ps(vCoordDst, vAux0, vMul2); // x % D2 - uni_vaddps(vCoordDst, vCoordDst, vMul2); // x % D2 + D2 + uni_vroundps(vAux0, vAux0, 0x3); // Truncation + uni_vfnmadd231ps(vCoordDst, vAux0, vMul2); // x % D2 + uni_vaddps(vCoordDst, vCoordDst, vMul2); // x % D2 + D2 uni_vdivps(vAux0, vCoordDst, vMul2); - uni_vroundps(vAux0, vAux0, 0x3); // Truncation - uni_vfnmadd231ps(vCoordDst, vAux0, vMul2); // (x % D2 + D2) % D2 + uni_vroundps(vAux0, vAux0, 0x3); // Truncation + uni_vfnmadd231ps(vCoordDst, vAux0, vMul2); // (x % D2 + D2) % D2 // Check that the result does not exceed the divisor. uni_vcmpps(vAux0, vCoordDst, vMul2, CMP_LT_PS); @@ -1002,20 +1012,20 @@ void GridSampleKernel::reflectionPadding(const Vmm& vCoordDst, const Vmm& v uni_vcmpps(vAux1, vCoordDst, vSrcWidthF, CMP_LT_PS); // vCoordDst < vUpperBound } else { mov(rAux, ptr[regParams + GET_OFF(srcWidthF)]); - uni_vcmpps(vAux1, vCoordDst, ptr[rAux], CMP_LT_PS); // vCoordDst < vUpperBound + uni_vcmpps(vAux1, vCoordDst, ptr[rAux], CMP_LT_PS); // vCoordDst < vUpperBound } } else { if (vSrcHeightF.isInitialized()) { - uni_vcmpps(vAux1, vCoordDst, vSrcHeightF, CMP_LT_PS); // vCoordDst < vUpperBound + uni_vcmpps(vAux1, vCoordDst, vSrcHeightF, CMP_LT_PS); // vCoordDst < vUpperBound } else { mov(rAux, ptr[regParams + GET_OFF(srcHeightF)]); - uni_vcmpps(vAux1, vCoordDst, ptr[rAux], CMP_LT_PS); // vCoordDst < vUpperBound + uni_vcmpps(vAux1, vCoordDst, ptr[rAux], CMP_LT_PS); // vCoordDst < vUpperBound } } uni_vandps(vCoordDst, vCoordDst, vAux1); uni_vandnps(vAux1, vAux1, vAux0); - uni_vsubps(vCoordDst, vCoordDst, vAux1); // set -x' for vCoordDst >= Dim + uni_vsubps(vCoordDst, vCoordDst, vAux1); // set -x' for vCoordDst >= Dim } template <> @@ -1045,12 +1055,13 @@ void GridSampleKernel::bicubicCoefficients(const Vmm& vCoef, c template <> void GridSampleKernel::bicubicCoefficients(const Vmm& vCoef, const Vmm& vDDim, const uint8_t idx) { - static const size_t elPerVec = x64::cpu_isa_traits::vlen / sizeof(float);; - static const float const_0_75[elPerVec] = { -0.75f, -0.75f, -0.75f, -0.75f, -0.75f, -0.75f, -0.75f, -0.75f }; - static const float const_1_25[elPerVec] = { 1.25f, 1.25f, 1.25f, 1.25f, 1.25f, 1.25f, 1.25f, 1.25f }; - static const float const_1_50[elPerVec] = { 1.5f, 1.5f, 1.5f, 1.5f, 1.5f, 1.5f, 1.5f, 1.5f }; - static const float const_2_00[elPerVec] = { 2.f, 2.f, 2.f, 2.f, 2.f, 2.f, 2.f, 2.f }; - static const float const_2_25[elPerVec] = { 2.25f, 2.25f, 2.25f, 2.25f, 2.25f, 2.25f, 2.25f, 2.25f }; + static const size_t elPerVec = x64::cpu_isa_traits::vlen / sizeof(float); + ; + static const float const_0_75[elPerVec] = {-0.75f, -0.75f, -0.75f, -0.75f, -0.75f, -0.75f, -0.75f, -0.75f}; + static const float const_1_25[elPerVec] = {1.25f, 1.25f, 1.25f, 1.25f, 1.25f, 1.25f, 1.25f, 1.25f}; + static const float const_1_50[elPerVec] = {1.5f, 1.5f, 1.5f, 1.5f, 1.5f, 1.5f, 1.5f, 1.5f}; + static const 
float const_2_00[elPerVec] = {2.f, 2.f, 2.f, 2.f, 2.f, 2.f, 2.f, 2.f}; + static const float const_2_25[elPerVec] = {2.25f, 2.25f, 2.25f, 2.25f, 2.25f, 2.25f, 2.25f, 2.25f}; auto rAux = getReg64(); @@ -1088,11 +1099,11 @@ void GridSampleKernel::bicubicCoefficients(const Vmm& vCoef, const Vm template <> void GridSampleKernel::bicubicCoefficients(const Vmm& vCoef, const Vmm& vDDim, const uint8_t idx) { static const size_t elPerVec = x64::cpu_isa_traits::vlen / sizeof(float); - static const float const_0_75[elPerVec] = { -0.75f, -0.75f, -0.75f, -0.75f, -0.75f, -0.75f, -0.75f, -0.75f }; - static const float const_1_25[elPerVec] = { 1.25f, 1.25f, 1.25f, 1.25f, 1.25f, 1.25f, 1.25f, 1.25f }; - static const float const_1_50[elPerVec] = { 1.5f, 1.5f, 1.5f, 1.5f, 1.5f, 1.5f, 1.5f, 1.5f }; - static const float const_2_00[elPerVec] = { 2.f, 2.f, 2.f, 2.f, 2.f, 2.f, 2.f, 2.f }; - static const float const_2_25[elPerVec] = { 2.25f, 2.25f, 2.25f, 2.25f, 2.25f, 2.25f, 2.25f, 2.25f }; + static const float const_0_75[elPerVec] = {-0.75f, -0.75f, -0.75f, -0.75f, -0.75f, -0.75f, -0.75f, -0.75f}; + static const float const_1_25[elPerVec] = {1.25f, 1.25f, 1.25f, 1.25f, 1.25f, 1.25f, 1.25f, 1.25f}; + static const float const_1_50[elPerVec] = {1.5f, 1.5f, 1.5f, 1.5f, 1.5f, 1.5f, 1.5f, 1.5f}; + static const float const_2_00[elPerVec] = {2.f, 2.f, 2.f, 2.f, 2.f, 2.f, 2.f, 2.f}; + static const float const_2_25[elPerVec] = {2.25f, 2.25f, 2.25f, 2.25f, 2.25f, 2.25f, 2.25f, 2.25f}; auto rAux = getReg64(); auto vAux = getVmm(); @@ -1136,11 +1147,11 @@ template <> void GridSampleKernel::bicubicCoefficients(const Vmm& vCoef, const Vmm& vDDim, const uint8_t idx) { static const size_t elToAllocate = 2 * x64::cpu_isa_traits::vlen / sizeof(float); // Allocation with a margin for address alignment. - static const float c_0_75[elToAllocate] = { -0.75f, -0.75f, -0.75f, -0.75f, -0.75f, -0.75f, -0.75f, -0.75f }; - static const float c_1_25[elToAllocate] = { 1.25f, 1.25f, 1.25f, 1.25f, 1.25f, 1.25f, 1.25f, 1.25f }; - static const float c_1_50[elToAllocate] = { 1.5f, 1.5f, 1.5f, 1.5f, 1.5f, 1.5f, 1.5f, 1.5f }; - static const float c_2_00[elToAllocate] = { 2.f, 2.f, 2.f, 2.f, 2.f, 2.f, 2.f, 2.f }; - static const float c_2_25[elToAllocate] = { 2.25f, 2.25f, 2.25f, 2.25f, 2.25f, 2.25f, 2.25f, 2.25f }; + static const float c_0_75[elToAllocate] = {-0.75f, -0.75f, -0.75f, -0.75f, -0.75f, -0.75f, -0.75f, -0.75f}; + static const float c_1_25[elToAllocate] = {1.25f, 1.25f, 1.25f, 1.25f, 1.25f, 1.25f, 1.25f, 1.25f}; + static const float c_1_50[elToAllocate] = {1.5f, 1.5f, 1.5f, 1.5f, 1.5f, 1.5f, 1.5f, 1.5f}; + static const float c_2_00[elToAllocate] = {2.f, 2.f, 2.f, 2.f, 2.f, 2.f, 2.f, 2.f}; + static const float c_2_25[elToAllocate] = {2.25f, 2.25f, 2.25f, 2.25f, 2.25f, 2.25f, 2.25f, 2.25f}; // Address alignment for XMM. 
static const float* const_0_75 = c_0_75 + (reinterpret_cast(c_0_75) % 16) / sizeof(float); static const float* const_1_25 = c_1_25 + (reinterpret_cast(c_1_25) % 16) / sizeof(float); @@ -1193,15 +1204,15 @@ void GridSampleKernel::bicubicCoefficients(const Vmm& vCoef, const V } } -template // Works for AVX512, AVX2, AVX, SSE41 +template // Works for AVX512, AVX2, AVX, SSE41 void GridSampleKernel::nearestInterpolation(const Vmm& vWCoord, const Vmm& vHCoord, bool tail) { const auto& vSrcShift = vWCoord; - const auto& vAux = vHCoord; - auto kGatherMask = getMask(); - auto kAuxMask = getMask(); + const auto& vAux = vHCoord; + auto kGatherMask = getMask(); + auto kAuxMask = getMask(); - uni_vroundps(vWCoord, vWCoord, 0x0); // Round near - uni_vroundps(vHCoord, vHCoord, 0x0); // Round near + uni_vroundps(vWCoord, vWCoord, 0x0); // Round near + uni_vroundps(vHCoord, vHCoord, 0x0); // Round near bool useMask = false, zeroFill = false; if (jcp.paddingMode == GridSamplePaddingMode::ZEROS) { @@ -1272,15 +1283,15 @@ template <> void GridSampleKernel::bilinearInterpolation(const Vmm& vWCoord, const Vmm& vHCoord, bool tail) { const auto& vDX = vWCoord; const auto& vDY = vHCoord; - auto shift00 = getVmm(); - auto shift01 = getVmm(); - auto shift10 = getVmm(); - auto shift11 = getVmm(); - auto vAux = getVmm(); + auto shift00 = getVmm(); + auto shift01 = getVmm(); + auto shift10 = getVmm(); + auto shift11 = getVmm(); + auto vAux = getVmm(); RegistersPool::Reg kMask00, kMask01, kMask10, kMask11; - uni_vroundps(shift00, vWCoord, 0x1); // Round floor - uni_vroundps(shift01, vHCoord, 0x1); // Round floor + uni_vroundps(shift00, vWCoord, 0x1); // Round floor + uni_vroundps(shift01, vHCoord, 0x1); // Round floor uni_vsubps(vDX, vWCoord, shift00); uni_vsubps(vDY, vHCoord, shift01); uni_vaddps(shift10, shift00, vOnesF); @@ -1294,10 +1305,10 @@ void GridSampleKernel::bilinearInterpolation(const Vmm& vWCoor kMask10 = getMask(); kMask11 = getMask(); - zerosPadding(kMask00, shift01, shift00); // (y; x) - zerosPadding(kMask01, shift01, shift10); // (y; x + 1) - zerosPadding(kMask11, shift11, shift10); // (y + 1; x + 1) - zerosPadding(kMask10, shift11, shift00); // (y + 1; x) + zerosPadding(kMask00, shift01, shift00); // (y; x) + zerosPadding(kMask01, shift01, shift10); // (y; x + 1) + zerosPadding(kMask11, shift11, shift10); // (y + 1; x + 1) + zerosPadding(kMask10, shift11, shift00); // (y + 1; x) hwShiftPs2dq(shift00, shift01, shift00, vSrcWidthF); uni_vpaddd(shift01, shift00, vDataTypeSizeB); @@ -1330,8 +1341,8 @@ void GridSampleKernel::bilinearInterpolation(const Vmm& vWCoor // PER CHANNEL LOOP Xbyak::Label lChannelLoopBegin, lChannelLoopEnd; RegistersPool::Reg rChannel; - auto rSrcTmp = getReg64(); - auto rDstTmp = getReg64(); + auto rSrcTmp = getReg64(); + auto rDstTmp = getReg64(); mov(rSrcTmp, regSrc); mov(rDstTmp, regDst); @@ -1349,11 +1360,11 @@ void GridSampleKernel::bilinearInterpolation(const Vmm& vWCoor if (jcp.paddingMode == GridSamplePaddingMode::ZEROS) { kmovw(kAuxMask, kMask00); } - gatherdd(vQ0, rSrcTmp, shift00, kAuxMask, useMask, zeroFill); // v00 -> vQ0 + gatherdd(vQ0, rSrcTmp, shift00, kAuxMask, useMask, zeroFill); // v00 -> vQ0 if (jcp.inDataPrc == ov::element::i32) { uni_vcvtdq2ps(vQ0, vQ0); } - uni_vfmsub213ps(vQ0, vDX, vQ0); // q0 = -(v00 - dx * v00) + uni_vfmsub213ps(vQ0, vDX, vQ0); // q0 = -(v00 - dx * v00) // (y; x + 1) if (jcp.paddingMode == GridSamplePaddingMode::ZEROS) { @@ -1363,7 +1374,7 @@ void GridSampleKernel::bilinearInterpolation(const Vmm& vWCoor if (jcp.inDataPrc == 
ov::element::i32) { uni_vcvtdq2ps(vAux, vAux); } - uni_vfmsub231ps(vQ0, vAux, vDX); // q0 = -q0 + dx * v01 + uni_vfmsub231ps(vQ0, vAux, vDX); // q0 = -q0 + dx * v01 // (y + 1; x + 1) if (jcp.paddingMode == GridSamplePaddingMode::ZEROS) { @@ -1383,14 +1394,14 @@ void GridSampleKernel::bilinearInterpolation(const Vmm& vWCoor uni_vcvtdq2ps(vQ1, vQ1); } - uni_vfmsub213ps(vQ1, vDX, vQ1); // q1 = -(v10 - dx * v10) - uni_vfmsub231ps(vQ1, vAux, vDX); // q1 = -q1 + dx * v11 + uni_vfmsub213ps(vQ1, vDX, vQ1); // q1 = -(v10 - dx * v10) + uni_vfmsub231ps(vQ1, vAux, vDX); // q1 = -q1 + dx * v11 // Res = q0 + dy * (q1 - q0) uni_vsubps(vQ1, vQ1, vQ0); uni_vfmadd132ps(vQ1, vQ0, vDY); if (jcp.inDataPrc == ov::element::i32) { - uni_vroundps(vQ1, vQ1, 0x3); // Truncation + uni_vroundps(vQ1, vQ1, 0x3); // Truncation uni_vcvtps2dq(vQ1, vQ1); } @@ -1410,20 +1421,20 @@ void GridSampleKernel::bilinearInterpolation(const Vmm& vWCoor } } -template // Works for AVX2, AVX, SSE41 +template // Works for AVX2, AVX, SSE41 void GridSampleKernel::bilinearInterpolation(const Vmm& vWCoord, const Vmm& vHCoord, bool tail) { auto vWRound = getVmm(); auto vHRound = getVmm(); - auto& vDX = vWCoord; - auto& vDY = vHCoord; - auto vAux = getVmm(); + auto& vDX = vWCoord; + auto& vDY = vHCoord; + auto vAux = getVmm(); Vmm shift00, shift01, shift10, shift11; RegistersPool::Reg shift10Holder, shift11Holder; // For ZEROS padding only. RegistersPool::Reg vMask00, vMask01, vMask10, vMask11; - uni_vroundps(vWRound, vWCoord, 0x1); // Round floor - uni_vroundps(vHRound, vHCoord, 0x1); // Round floor + uni_vroundps(vWRound, vWCoord, 0x1); // Round floor + uni_vroundps(vHRound, vHCoord, 0x1); // Round floor uni_vsubps(vDX, vDX, vWRound); uni_vsubps(vDY, vDY, vHRound); @@ -1444,9 +1455,9 @@ void GridSampleKernel::bilinearInterpolation(const Vmm& vWCoord, const Vmm& useMask = zeroFill = true; { auto rAux = getReg64(); - static const float onesArr[8] = { 1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f }; - if (isa ==x64::sse41) { - static const float *onesPtr = onesArr + (reinterpret_cast(onesArr) % 16) / sizeof(float); + static const float onesArr[8] = {1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f}; + if (isa == x64::sse41) { + static const float* onesPtr = onesArr + (reinterpret_cast(onesArr) % 16) / sizeof(float); mov(rAux, reinterpret_cast(onesPtr)); } else { mov(rAux, reinterpret_cast(onesArr)); @@ -1463,10 +1474,10 @@ void GridSampleKernel::bilinearInterpolation(const Vmm& vWCoord, const Vmm& uni_vaddps(vMask00, vWRound, vAux); uni_vaddps(vAux, vAux, vHRound); - zerosPadding(vMask01, vHRound, vMask00); // (y; x + 1) - zerosPadding(vMask10, vAux, vWRound); // (y + 1; x) - zerosPadding(vMask11, vAux, vMask00); // (y + 1; x + 1) - zerosPadding(vMask00, vHRound, vWRound); // (y; x) + zerosPadding(vMask01, vHRound, vMask00); // (y; x + 1) + zerosPadding(vMask10, vAux, vWRound); // (y + 1; x) + zerosPadding(vMask11, vAux, vMask00); // (y + 1; x + 1) + zerosPadding(vMask00, vHRound, vWRound); // (y; x) hwShiftPs2dq(shift00, vHRound, vWRound, vSrcWidthF); } else if (jcp.paddingMode == GridSamplePaddingMode::BORDER) { @@ -1490,17 +1501,17 @@ void GridSampleKernel::bilinearInterpolation(const Vmm& vWCoord, const Vmm& } auto vGatherMask = getVmm(); - auto vQ0 = getVmm(); - auto vQ1 = getVmm(); + auto vQ0 = getVmm(); + auto vQ1 = getVmm(); // PER CHANNEL LOOP Xbyak::Label lChannelLoopBegin, lChannelLoopEnd; RegistersPool::Reg rChannel; - auto rSrcTmp = getReg64(); - auto rDstTmp = getReg64(); + auto rSrcTmp = getReg64(); + auto rDstTmp = getReg64(); auto rTypeSize = 
getReg64(); - mov(rSrcTmp, regSrc); - mov(rDstTmp, regDst); + mov(rSrcTmp, regSrc); + mov(rDstTmp, regDst); mov(rTypeSize, ptr[regParams + GET_OFF(dataTypeSize)]); for (uint64_t ch = 0; ch < jcp.cannelNum; ch++) { @@ -1517,12 +1528,17 @@ void GridSampleKernel::bilinearInterpolation(const Vmm& vWCoord, const Vmm& if (jcp.paddingMode == GridSamplePaddingMode::ZEROS && isa == x64::avx2) { uni_vmovups(vGatherMask, vMask00); } - gatherdd(vQ0, rSrcTmp, shift00, (isa == x64::avx2 || !vMask00.isInitialized()) ? vGatherMask : vMask00, useMask, zeroFill); // v00 -> vQ0 + gatherdd(vQ0, + rSrcTmp, + shift00, + (isa == x64::avx2 || !vMask00.isInitialized()) ? vGatherMask : vMask00, + useMask, + zeroFill); // v00 -> vQ0 if (jcp.inDataPrc == ov::element::i32) { uni_vcvtdq2ps(vQ0, vQ0); } if (isa == x64::avx2) { - uni_vfmsub213ps(vQ0, vDX, vQ0); // q0 = -(v00 - dx * v00) + uni_vfmsub213ps(vQ0, vDX, vQ0); // q0 = -(v00 - dx * v00) } else { uni_vmulps(vGatherMask, vQ0, vDX); uni_vsubps(vQ0, vQ0, vGatherMask); @@ -1534,13 +1550,17 @@ void GridSampleKernel::bilinearInterpolation(const Vmm& vWCoord, const Vmm& if (isa == x64::avx2) uni_vmovups(vGatherMask, vMask01); } - gatherdd(vAux, rSrcTmp, jcp.paddingMode != GridSamplePaddingMode::ZEROS ? shift01 : shift10, - (isa == x64::avx2 || !vMask01.isInitialized()) ? vGatherMask : vMask01, useMask, zeroFill); + gatherdd(vAux, + rSrcTmp, + jcp.paddingMode != GridSamplePaddingMode::ZEROS ? shift01 : shift10, + (isa == x64::avx2 || !vMask01.isInitialized()) ? vGatherMask : vMask01, + useMask, + zeroFill); if (jcp.inDataPrc == ov::element::i32) { uni_vcvtdq2ps(vAux, vAux); } if (isa == x64::avx2) { - uni_vfmsub231ps(vQ0, vAux, vDX); // q0 = -q0 + dx * v01 + uni_vfmsub231ps(vQ0, vAux, vDX); // q0 = -q0 + dx * v01 } else { uni_vmulps(vAux, vAux, vDX); uni_vaddps(vQ0, vQ0, vAux); @@ -1556,8 +1576,12 @@ void GridSampleKernel::bilinearInterpolation(const Vmm& vWCoord, const Vmm& if (isa == x64::avx2) uni_vmovups(vGatherMask, vMask11); } - gatherdd(vAux, rSrcTmp, jcp.paddingMode != GridSamplePaddingMode::ZEROS ? shift11 : shift10, - (isa == x64::avx2 || !vMask11.isInitialized()) ? vGatherMask : vMask11, useMask, zeroFill); + gatherdd(vAux, + rSrcTmp, + jcp.paddingMode != GridSamplePaddingMode::ZEROS ? shift11 : shift10, + (isa == x64::avx2 || !vMask11.isInitialized()) ? vGatherMask : vMask11, + useMask, + zeroFill); if (jcp.inDataPrc == ov::element::i32) { uni_vcvtdq2ps(vAux, vAux); } @@ -1568,7 +1592,12 @@ void GridSampleKernel::bilinearInterpolation(const Vmm& vWCoord, const Vmm& if (isa == x64::avx2) uni_vmovups(vGatherMask, vMask10); } - gatherdd(vQ1, rSrcTmp, shift10, (isa == x64::avx2 || !vMask10.isInitialized()) ? vGatherMask : vMask10, useMask, zeroFill); + gatherdd(vQ1, + rSrcTmp, + shift10, + (isa == x64::avx2 || !vMask10.isInitialized()) ? 
vGatherMask : vMask10, + useMask, + zeroFill); if (jcp.inDataPrc == ov::element::i32) { uni_vcvtdq2ps(vQ1, vQ1); } @@ -1585,13 +1614,13 @@ void GridSampleKernel::bilinearInterpolation(const Vmm& vWCoord, const Vmm& uni_vmovups(vQ1, vGatherMask); } } - uni_vfmsub231ps(vQ1, vAux, vDX); // q1 = -q1 + dx * v11 + uni_vfmsub231ps(vQ1, vAux, vDX); // q1 = -q1 + dx * v11 // Res = q0 + dy * (q1 - q0) uni_vsubps(vQ1, vQ1, vQ0); uni_vfmadd132ps(vQ1, vQ0, vDY); if (jcp.inDataPrc == ov::element::i32) { - uni_vroundps(vQ1, vQ1, 0x3); // Truncation + uni_vroundps(vQ1, vQ1, 0x3); // Truncation uni_vcvtps2dq(vQ1, vQ1); } @@ -1614,27 +1643,27 @@ void GridSampleKernel::bilinearInterpolation(const Vmm& vWCoord, const Vmm& template <> void GridSampleKernel::bicubicInterpolation(const Vmm& vWCoord, const Vmm& vHCoord, bool tail) { - auto vHTop = getVmm(); - auto vWLeft = getVmm(); - auto vDX = getVmm(); - auto vDY = getVmm(); - auto vXDotProd = getVmm(); + auto vHTop = getVmm(); + auto vWLeft = getVmm(); + auto vDX = getVmm(); + auto vDY = getVmm(); + auto vXDotProd = getVmm(); auto& vYDotProd = vDX; auto vSrcShift0 = getVmm(); - auto vSrcShift = getVmm(); - auto vAux = getVmm(); - auto kAuxMask = getMask(); + auto vSrcShift = getVmm(); + auto vAux = getVmm(); + auto kAuxMask = getMask(); RegistersPool::Reg kMaskH; std::vector> wMasks; - uni_vroundps(vHTop, vHCoord, 0x1); // Round floor - uni_vroundps(vWLeft, vWCoord, 0x1); // Round floor + uni_vroundps(vHTop, vHCoord, 0x1); // Round floor + uni_vroundps(vWLeft, vWCoord, 0x1); // Round floor uni_vsubps(vDY, vHCoord, vHTop); uni_vsubps(vDX, vWCoord, vWLeft); uni_vsubps(vHTop, vHTop, vOnesF); uni_vsubps(vWLeft, vWLeft, vOnesF); - RegistersPool::Reg vCX[4] = {getVmm(), getVmm(), getVmm(), getVmm() }; + RegistersPool::Reg vCX[4] = {getVmm(), getVmm(), getVmm(), getVmm()}; for (int i = 0; i < 4; i++) { bicubicCoefficients(vCX[i], vDX, i); } @@ -1659,8 +1688,8 @@ void GridSampleKernel::bicubicInterpolation(const Vmm& vWCoord // PER CHANNEL LOOP Xbyak::Label lChannelLoopBegin, lChannelLoopEnd; RegistersPool::Reg rChannel; - auto rSrcTmp = getReg64(); - auto rDstTmp = getReg64(); + auto rSrcTmp = getReg64(); + auto rDstTmp = getReg64(); mov(rSrcTmp, regSrc); mov(rDstTmp, regDst); @@ -1742,7 +1771,7 @@ void GridSampleKernel::bicubicInterpolation(const Vmm& vWCoord } if (jcp.inDataPrc == ov::element::i32) { - uni_vroundps(vYDotProd, vYDotProd, 0x3); // Truncation + uni_vroundps(vYDotProd, vYDotProd, 0x3); // Truncation uni_vcvtps2dq(vYDotProd, vYDotProd); } @@ -1762,15 +1791,15 @@ void GridSampleKernel::bicubicInterpolation(const Vmm& vWCoord } } -template // Works for AVX2, AVX, SSE41 +template // Works for AVX2, AVX, SSE41 void GridSampleKernel::bicubicInterpolation(const Vmm& vWCoord, const Vmm& vHCoord, bool tail) { - auto vHTop = getVmm(); + auto vHTop = getVmm(); auto vWLeft = getVmm(); - auto vDX = getVmm(); - auto vDY = getVmm(); + auto vDX = getVmm(); + auto vDY = getVmm(); - uni_vroundps(vHTop, vHCoord, 0x1); // Round floor - uni_vroundps(vWLeft, vWCoord, 0x1); // Round floor + uni_vroundps(vHTop, vHCoord, 0x1); // Round floor + uni_vroundps(vWLeft, vWCoord, 0x1); // Round floor uni_vsubps(vDY, vHCoord, vHTop); uni_vsubps(vDX, vWCoord, vWLeft); uni_vsubps(vHTop, vHTop, vOnesF); @@ -1791,7 +1820,7 @@ void GridSampleKernel::bicubicInterpolation(const Vmm& vWCoord, const Vmm& } auto vW0 = getVmm(), vW1 = getVmm(); - Vmm vW[4] = { vW0, vW1, vHCoord, vWCoord }; + Vmm vW[4] = {vW0, vW1, vHCoord, vWCoord}; for (int w = 0; w < 4; w++) { borderPadding(vW[w], vWLeft, 
coord::w); if (w < 3) { @@ -1806,7 +1835,7 @@ void GridSampleKernel::bicubicInterpolation(const Vmm& vWCoord, const Vmm& mov(rAux, ptr[regParams + GET_OFF(srcHeightSub1F)]); uni_vmovups(vSrcHeightSub1F, ptr[rAux]); } - auto vH = getVmm(); + auto vH = getVmm(); size_t bufShift = 0lu; for (int h = 0; h < 4; h++) { @@ -1839,7 +1868,7 @@ void GridSampleKernel::bicubicInterpolation(const Vmm& vWCoord, const Vmm& } auto vW0 = getVmm(), vW1 = getVmm(); - Vmm vW[4] = { vW0, vW1, vHCoord, vWCoord }; + Vmm vW[4] = {vW0, vW1, vHCoord, vWCoord}; for (int w = 0; w < 4; w++) { reflectionPadding(vW[w], vWLeft, coord::w); if (w < 3) { @@ -1860,7 +1889,7 @@ void GridSampleKernel::bicubicInterpolation(const Vmm& vWCoord, const Vmm& mov(rAux, ptr[regParams + GET_OFF(srcHeightMul2Sub1F)]); uni_vmovups(vSrcHeightMul2Sub1F, ptr[rAux]); } - auto vH = getVmm(); + auto vH = getVmm(); size_t bufShift = 0lu; for (int h = 0; h < 4; h++) { @@ -1883,7 +1912,7 @@ void GridSampleKernel::bicubicInterpolation(const Vmm& vWCoord, const Vmm& } else if (jcp.paddingMode == GridSamplePaddingMode::ZEROS) { useMask = zeroFill = true; - RegistersPool::Reg vWMask[4] = { getVmm(), getVmm(), getVmm(), getVmm() }; + RegistersPool::Reg vWMask[4] = {getVmm(), getVmm(), getVmm(), getVmm()}; for (int w = 0; w < 4; w++) { if (w == 0) { zerosPaddingW(vWMask[w], vWLeft); @@ -1933,21 +1962,21 @@ void GridSampleKernel::bicubicInterpolation(const Vmm& vWCoord, const Vmm& vDataTypeSizeB.release(); } - RegistersPool::Reg vCX[4] = { getVmm(), getVmm(), getVmm(), getVmm() }; + RegistersPool::Reg vCX[4] = {getVmm(), getVmm(), getVmm(), getVmm()}; for (int w = 0; w < 4; w++) { bicubicCoefficients(vCX[w], vDX, w); } auto vCY0 = getVmm(), vCY1 = getVmm(); - Vmm vCY[4] = { vCY0, vCY1, vHCoord, vWCoord }; + Vmm vCY[4] = {vCY0, vCY1, vHCoord, vWCoord}; for (int h = 0; h < 4; h++) { bicubicCoefficients(vCY[h], vDY, h); } const auto& vXDotProd = vDX; const auto& vYDotProd = vDY; - auto vSrcShift = getVmm(); + auto vSrcShift = getVmm(); auto kGatherMask = getVmm(); - auto vAux = getVmm(); + auto vAux = getVmm(); // PER CHANNEL LOOP Xbyak::Label lChannelLoopBegin, lChannelLoopEnd; @@ -2003,7 +2032,7 @@ void GridSampleKernel::bicubicInterpolation(const Vmm& vWCoord, const Vmm& } if (jcp.inDataPrc == ov::element::i32) { - uni_vroundps(vYDotProd, vYDotProd, 0x3); // Truncation + uni_vroundps(vYDotProd, vYDotProd, 0x3); // Truncation uni_vcvtps2dq(vYDotProd, vYDotProd); } @@ -2028,7 +2057,7 @@ void GridSampleKernel::dataTypeShiftPs2Dq(const Vmm& vDst, const Vmm& vSrc) if (dataTypeSize == 1) return; - if (isa == x64::avx) { // vpslld works just with XMM for AVX, so use vmulps for YMM + if (isa == x64::avx) { // vpslld works just with XMM for AVX, so use vmulps for YMM auto rAux = getReg64(); static const float val = dataTypeSize; static const float dataTypeSizeArr[8] = {val, val, val, val, val, val, val, val}; @@ -2038,7 +2067,7 @@ void GridSampleKernel::dataTypeShiftPs2Dq(const Vmm& vDst, const Vmm& vSrc) } else { uni_vcvtps2dq(vDst, vSrc); if (dataTypeSize > 1) - uni_vpslld(vDst, vDst, dataTypeShift); // multiply by source data type size. + uni_vpslld(vDst, vDst, dataTypeShift); // multiply by source data type size. 
} } @@ -2066,7 +2095,7 @@ void GridSampleKernel::hwShiftPs2dq(const Vmm& vDst, const Vmm& vHCoord, co } } - if (isa == x64::avx) { // vpslld works just with XMM for AVX, so use vmulps for YMM + if (isa == x64::avx) { // vpslld works just with XMM for AVX, so use vmulps for YMM if (dataTypeSize > 1) { auto rAux = getReg64(); const float val = dataTypeSize; @@ -2078,7 +2107,7 @@ void GridSampleKernel::hwShiftPs2dq(const Vmm& vDst, const Vmm& vHCoord, co } else { uni_vcvtps2dq(vDst, vDst); if (dataTypeSize > 1) - uni_vpslld(vDst, vDst, dataTypeShift); // multiply by source data type size. + uni_vpslld(vDst, vDst, dataTypeShift); // multiply by source data type size. } } @@ -2086,6 +2115,6 @@ template class GridSampleKernel; template class GridSampleKernel; template class GridSampleKernel; -} // namespace kernel -} // namespace intel_cpu -} // namespace ov +} // namespace kernel +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/kernels/x64/grid_sample.hpp b/src/plugins/intel_cpu/src/nodes/kernels/x64/grid_sample.hpp index cb13d62c3509d1..f276580a837bd2 100644 --- a/src/plugins/intel_cpu/src/nodes/kernels/x64/grid_sample.hpp +++ b/src/plugins/intel_cpu/src/nodes/kernels/x64/grid_sample.hpp @@ -4,9 +4,10 @@ #pragma once -#include "jit_kernel_base.hpp" #include +#include "jit_kernel_base.hpp" + namespace ov { namespace intel_cpu { @@ -20,16 +21,16 @@ class GridSampleKernelBase; #if defined(OPENVINO_ARCH_X86_64) struct GridSampleKernelConfParams { - bool dynamicShapes = false; - bool dynamicBatch = false; + bool dynamicShapes = false; + bool dynamicBatch = false; bool dynamicChannel = false; - bool alignCorners = false; + bool alignCorners = false; GridSampleInterpolationMode interpolationMode = GridSampleInterpolationMode::BILINEAR; GridSamplePaddingMode paddingMode = GridSamplePaddingMode::ZEROS; ov::element::Type inDataPrc; ov::element::Type gridPrc; - uint64_t batchNum = 1lu; - uint64_t cannelNum = 1lu; + uint64_t batchNum = 1lu; + uint64_t cannelNum = 1lu; uint64_t srcBatchStepB = 0lu; }; @@ -37,13 +38,13 @@ struct GridSamplesKernelExecArgs { const void* src; const void* grid; void* dst; - uint64_t batchNum = 1lu; + uint64_t batchNum = 1lu; uint64_t channelsNum = 1lu; const float* srcWidthF; const float* srcHeightF; - uint64_t srcBatchStepB = 0lu; - uint64_t gridBatchStepB = 0lu; - uint64_t dstBatchStepB = 0lu; + uint64_t srcBatchStepB = 0lu; + uint64_t gridBatchStepB = 0lu; + uint64_t dstBatchStepB = 0lu; uint64_t srcChannelStepB = 0lu; uint64_t dstChannelStepB = 0lu; const void* wDenormCoefF; @@ -60,19 +61,21 @@ struct GridSamplesKernelExecArgs { uint64_t workAmount = 0lu; }; -enum coord { - w, h -}; +enum coord { w, h }; -class GridSampleKernelBase: public JitKernelBase { +class GridSampleKernelBase : public JitKernelBase { public: - void (*ker_)(const GridSamplesKernelExecArgs *); - void operator()(const GridSamplesKernelExecArgs *args) { + void (*ker_)(const GridSamplesKernelExecArgs*); + void operator()(const GridSamplesKernelExecArgs* args) { assert(ker_); ker_(args); } - explicit GridSampleKernelBase(const char* name, const GridSampleKernelConfParams& jcp, dnnl::impl::cpu::x64::cpu_isa_t isa) - : JitKernelBase(name, isa), ker_(nullptr), jcp(jcp) {} + explicit GridSampleKernelBase(const char* name, + const GridSampleKernelConfParams& jcp, + dnnl::impl::cpu::x64::cpu_isa_t isa) + : JitKernelBase(name, isa), + ker_(nullptr), + jcp(jcp) {} virtual void create_ker() = 0; uint64_t getVecLen() { @@ -87,7 +90,7 @@ class GridSampleKernelBase: public 
JitKernelBase { protected: GridSampleKernelConfParams jcp; - uint64_t vlen = 16lu; + uint64_t vlen = 16lu; uint64_t dataTypeSize = 1lu; uint64_t gridTypeSize = 1lu; uint64_t dataElPerVec = 1lu; @@ -104,12 +107,16 @@ class GridSampleKernel : public GridSampleKernelBase { void create_ker() override; void generate() override; - using Vmm = typename dnnl::impl::utils::conditional3::type; - using Vmask = typename dnnl::impl::utils::conditional3::type; + using Vmm = typename dnnl::impl::utils::conditional3::type; + using Vmask = typename dnnl::impl::utils::conditional3::type; private: uint8_t dataTypeShift = 0; @@ -138,23 +145,23 @@ class GridSampleKernel : public GridSampleKernelBase { RegistersPool::Reg vWDenormCoefF; RegistersPool::Reg vHDenormCoefF; RegistersPool::Reg vGridPermMask; - RegistersPool::Reg vDataTypeSizeB; // for ZEROS padding - RegistersPool::Reg vSrcWidthB; // for ZEROS padding + RegistersPool::Reg vDataTypeSizeB; // for ZEROS padding + RegistersPool::Reg vSrcWidthB; // for ZEROS padding - RegistersPool::Reg vSrcHeightSub1F; // for BORDER padding - RegistersPool::Reg vSrcWidthSub1F; // for BORDER padding + RegistersPool::Reg vSrcHeightSub1F; // for BORDER padding + RegistersPool::Reg vSrcWidthSub1F; // for BORDER padding - RegistersPool::Reg vSrcHeightMul2F; // for REFLECTION padding - RegistersPool::Reg vSrcWidthMul2F; // for REFLECTION padding - RegistersPool::Reg vSrcHeightMul2Sub1F; // for REFLECTION padding - RegistersPool::Reg vSrcWidthMul2Sub1F; // for REFLECTION padding - RegistersPool::Reg vAbsMask; // for REFLECTION padding + RegistersPool::Reg vSrcHeightMul2F; // for REFLECTION padding + RegistersPool::Reg vSrcWidthMul2F; // for REFLECTION padding + RegistersPool::Reg vSrcHeightMul2Sub1F; // for REFLECTION padding + RegistersPool::Reg vSrcWidthMul2Sub1F; // for REFLECTION padding + RegistersPool::Reg vAbsMask; // for REFLECTION padding - RegistersPool::Reg vConst_0_75; // for BICUBIC interpolation - RegistersPool::Reg vConst_1_25; // for BICUBIC interpolation - RegistersPool::Reg vConst_1_50; // for BICUBIC interpolation - RegistersPool::Reg vConst_2_00; // for BICUBIC interpolation - RegistersPool::Reg vConst_2_25; // for BICUBIC interpolation + RegistersPool::Reg vConst_0_75; // for BICUBIC interpolation + RegistersPool::Reg vConst_1_25; // for BICUBIC interpolation + RegistersPool::Reg vConst_1_50; // for BICUBIC interpolation + RegistersPool::Reg vConst_2_00; // for BICUBIC interpolation + RegistersPool::Reg vConst_2_25; // for BICUBIC interpolation void initVectors(); void process(); @@ -179,8 +186,8 @@ class GridSampleKernel : public GridSampleKernelBase { void hwShiftPs2dq(const Vmm& vDst, const Vmm& vHCoord, const Vmm& vWCoord, const Vmm& vWidth); }; -#endif // OPENVINO_ARCH_X86_64 +#endif // OPENVINO_ARCH_X86_64 -} // namespace kernel -} // namespace intel_cpu -} // namespace ov +} // namespace kernel +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/kernels/x64/jit_kernel.cpp b/src/plugins/intel_cpu/src/nodes/kernels/x64/jit_kernel.cpp index cd8b32d9ad2a38..2eb981007f2217 100644 --- a/src/plugins/intel_cpu/src/nodes/kernels/x64/jit_kernel.cpp +++ b/src/plugins/intel_cpu/src/nodes/kernels/x64/jit_kernel.cpp @@ -3,9 +3,10 @@ // #include "jit_kernel.hpp" -#include -#include + #include +#include +#include #include using namespace dnnl::impl; @@ -17,16 +18,16 @@ namespace intel_cpu { namespace { -template +template using registers = std::array, 16>; bool isRegAllocable(int id) { - return id != abi_param1.getIdx() // 
function argument - && id != Operand::Code::RSP; // stack pointer + return id != abi_param1.getIdx() // function argument + && id != Operand::Code::RSP; // stack pointer } -template -const RegType & reserveReg(jit_kernel::reg_indices & freeRegs, const registers & regs) { +template +const RegType& reserveReg(jit_kernel::reg_indices& freeRegs, const registers& regs) { if (freeRegs.empty()) throw std::runtime_error("No free registers"); const auto idx = freeRegs.back(); @@ -34,8 +35,8 @@ const RegType & reserveReg(jit_kernel::reg_indices & freeRegs, const registers -void freeReg(jit_kernel::reg_indices & freeRegs, const registers & regs, const RegType & reg) { +template +void freeReg(jit_kernel::reg_indices& freeRegs, const registers& regs, const RegType& reg) { const auto idx = reg.getIdx(); // Debug: // auto it = std::find(freeRegs.begin(), freeRegs.end(), idx); @@ -46,105 +47,189 @@ void freeReg(jit_kernel::reg_indices & freeRegs, const registers & regs OPENVINO_THROW("Some register was freed twice"); } -const registers & x64regs() { +const registers& x64regs() { using namespace Xbyak::util; - static const registers _x64regs {{ - rax, rcx, rdx, rbx, - rsp, rbp, rsi, rdi, - r8, r9, r10, r11, - r12, r13, r14, r15, + static const registers _x64regs{{ + rax, + rcx, + rdx, + rbx, + rsp, + rbp, + rsi, + rdi, + r8, + r9, + r10, + r11, + r12, + r13, + r14, + r15, }}; return _x64regs; } -const registers & x32regs() { +const registers& x32regs() { using namespace Xbyak::util; - static const registers _x32regs {{ - eax, ecx, edx, ebx, - esp, ebp, esi, edi, - r8d, r9d, r10d, r11d, - r12d, r13d, r14d, r15d, + static const registers _x32regs{{ + eax, + ecx, + edx, + ebx, + esp, + ebp, + esi, + edi, + r8d, + r9d, + r10d, + r11d, + r12d, + r13d, + r14d, + r15d, }}; return _x32regs; } -const registers & x16regs() { +const registers& x16regs() { using namespace Xbyak::util; - static const registers _x16regs {{ - ax, cx, dx, bx, - sp, bp, si, di, - r8w, r9w, r10w, r11w, - r12w, r13w, r14w, r15w, + static const registers _x16regs{{ + ax, + cx, + dx, + bx, + sp, + bp, + si, + di, + r8w, + r9w, + r10w, + r11w, + r12w, + r13w, + r14w, + r15w, }}; return _x16regs; } -const registers & x8regs() { +const registers& x8regs() { using namespace Xbyak::util; - static const registers _x8regs {{ - al, cl, dl, bl, - spl, bpl, sil, dil, - r8b, r9b, r10b, r11b, - r12b, r13b, r14b, r15b, + static const registers _x8regs{{ + al, + cl, + dl, + bl, + spl, + bpl, + sil, + dil, + r8b, + r9b, + r10b, + r11b, + r12b, + r13b, + r14b, + r15b, }}; return _x8regs; } -const registers & xmmregs() { - static const registers _xmmregs {{ - Xbyak::util::xmm0, Xbyak::util::xmm1, Xbyak::util::xmm2, Xbyak::util::xmm3, - Xbyak::util::xmm4, Xbyak::util::xmm5, Xbyak::util::xmm6, Xbyak::util::xmm7, - Xbyak::util::xmm8, Xbyak::util::xmm9, Xbyak::util::xmm10, Xbyak::util::xmm11, - Xbyak::util::xmm12, Xbyak::util::xmm13, Xbyak::util::xmm14, Xbyak::util::xmm15, +const registers& xmmregs() { + static const registers _xmmregs{{ + Xbyak::util::xmm0, + Xbyak::util::xmm1, + Xbyak::util::xmm2, + Xbyak::util::xmm3, + Xbyak::util::xmm4, + Xbyak::util::xmm5, + Xbyak::util::xmm6, + Xbyak::util::xmm7, + Xbyak::util::xmm8, + Xbyak::util::xmm9, + Xbyak::util::xmm10, + Xbyak::util::xmm11, + Xbyak::util::xmm12, + Xbyak::util::xmm13, + Xbyak::util::xmm14, + Xbyak::util::xmm15, }}; return _xmmregs; } -const registers & ymmregs() { - static const registers _ymmregs {{ - Xbyak::util::ymm0, Xbyak::util::ymm1, Xbyak::util::ymm2, Xbyak::util::ymm3, - Xbyak::util::ymm4, 
Xbyak::util::ymm5, Xbyak::util::ymm6, Xbyak::util::ymm7, - Xbyak::util::ymm8, Xbyak::util::ymm9, Xbyak::util::ymm10, Xbyak::util::ymm11, - Xbyak::util::ymm12, Xbyak::util::ymm13, Xbyak::util::ymm14, Xbyak::util::ymm15, +const registers& ymmregs() { + static const registers _ymmregs{{ + Xbyak::util::ymm0, + Xbyak::util::ymm1, + Xbyak::util::ymm2, + Xbyak::util::ymm3, + Xbyak::util::ymm4, + Xbyak::util::ymm5, + Xbyak::util::ymm6, + Xbyak::util::ymm7, + Xbyak::util::ymm8, + Xbyak::util::ymm9, + Xbyak::util::ymm10, + Xbyak::util::ymm11, + Xbyak::util::ymm12, + Xbyak::util::ymm13, + Xbyak::util::ymm14, + Xbyak::util::ymm15, }}; return _ymmregs; } -const registers & zmmregs() { - static const registers _zmmregs {{ - Xbyak::util::zmm0, Xbyak::util::zmm1, Xbyak::util::zmm2, Xbyak::util::zmm3, - Xbyak::util::zmm4, Xbyak::util::zmm5, Xbyak::util::zmm6, Xbyak::util::zmm7, - Xbyak::util::zmm8, Xbyak::util::zmm9, Xbyak::util::zmm10, Xbyak::util::zmm11, - Xbyak::util::zmm12, Xbyak::util::zmm13, Xbyak::util::zmm14, Xbyak::util::zmm15, +const registers& zmmregs() { + static const registers _zmmregs{{ + Xbyak::util::zmm0, + Xbyak::util::zmm1, + Xbyak::util::zmm2, + Xbyak::util::zmm3, + Xbyak::util::zmm4, + Xbyak::util::zmm5, + Xbyak::util::zmm6, + Xbyak::util::zmm7, + Xbyak::util::zmm8, + Xbyak::util::zmm9, + Xbyak::util::zmm10, + Xbyak::util::zmm11, + Xbyak::util::zmm12, + Xbyak::util::zmm13, + Xbyak::util::zmm14, + Xbyak::util::zmm15, }}; return _zmmregs; } -} // namespace +} // namespace namespace internal { -template<> +template <> ov::element::Type type2precision() { return ov::element::f32; } -template<> +template <> ov::element::Type type2precision() { return ov::element::i32; } -template<> +template <> ov::element::Type type2precision() { return ov::element::bf16; } -template<> +template <> ov::element::Type type2precision() { return ov::element::u8; } -template<> +template <> ov::element::Type type2precision() { return ov::element::i8; } @@ -157,27 +242,24 @@ cpu_isa_t get_current_isa() { return cpu_isa_t::sse41; } -stack_frame::stack_frame(ov::intel_cpu::jit_kernel & kernel, size_t size, uint32_t alignment) - : _kernel(kernel) - , _size(size) - , _alignment(alignment) { +stack_frame::stack_frame(ov::intel_cpu::jit_kernel& kernel, size_t size, uint32_t alignment) + : _kernel(kernel), + _size(size), + _alignment(alignment) { if (_size || _alignment) { if (_size && _alignment == 1) { _kernel.sub(_kernel.rsp, _size); } else { auto tmp = _kernel.var(); tmp = _kernel.rsp; - _kernel.sub(_kernel.rsp, sizeof(size_t) + size); // allocate - _kernel.and_(_kernel.rsp, ~(alignment - 1)); // align - _kernel.mov(_kernel.ptr[_kernel.rsp + size], tmp); // remember previous rsp + _kernel.sub(_kernel.rsp, sizeof(size_t) + size); // allocate + _kernel.and_(_kernel.rsp, ~(alignment - 1)); // align + _kernel.mov(_kernel.ptr[_kernel.rsp + size], tmp); // remember previous rsp } } } -stack_frame::stack_frame(stack_frame && rhs) - : _kernel(rhs._kernel) - , _size(rhs._size) - , _alignment(rhs._alignment) { +stack_frame::stack_frame(stack_frame&& rhs) : _kernel(rhs._kernel), _size(rhs._size), _alignment(rhs._alignment) { rhs._size = 0; rhs._alignment = 0; } @@ -192,25 +274,29 @@ stack_frame::~stack_frame() { } } -const Xbyak::Reg64 & stack_frame::pointer() const { +const Xbyak::Reg64& stack_frame::pointer() const { return _kernel.rsp; } void stack_frame::clear() const { const size_t end = _size & ~(size_t)7u; - _kernel.foreach(0, end, [&](const Reg64 & idx) { - _kernel.mov(_kernel.qword[pointer() + idx], 0); - }, 
sizeof(size_t)); + _kernel.foreach ( + 0, + end, + [&](const Reg64& idx) { + _kernel.mov(_kernel.qword[pointer() + idx], 0); + }, + sizeof(size_t)); if (end < _size) { - _kernel.foreach(end, _size, [&](const Reg64 & idx) { + _kernel.foreach (end, _size, [&](const Reg64& idx) { _kernel.mov(_kernel.byte[pointer() + idx], 0); }); } } -const void * consts_table::store(const void *data, size_t size) { +const void* consts_table::store(const void* data, size_t size) { if (size > chunk_size) throw std::runtime_error("Data size is too large"); const size_t capacity = _chunks.size() * chunk_size; @@ -218,17 +304,16 @@ const void * consts_table::store(const void *data, size_t size) { _size = _chunks.size() * chunk_size; _chunks.emplace_back(); } - auto & dst = _chunks.back(); + auto& dst = _chunks.back(); const size_t offset = _size % chunk_size; memcpy(&dst[offset], data, size); _size += size; return &dst[offset]; } -} // namespace internal +} // namespace internal -jit_kernel::jit_kernel(const char* name) - : jit_generator(name) { +jit_kernel::jit_kernel(const char* name) : jit_generator(name) { _free_rmmregs.reserve(16); _free_rmmregs.reserve(16); @@ -239,73 +324,73 @@ jit_kernel::jit_kernel(const char* name) } } -template<> -const Reg64 & jit_kernel::reserve() { +template <> +const Reg64& jit_kernel::reserve() { return reserveReg(_free_x64regs, x64regs()); } -template<> -const Reg32 & jit_kernel::reserve() { +template <> +const Reg32& jit_kernel::reserve() { return reserveReg(_free_x64regs, x32regs()); } -template<> -const Reg16 & jit_kernel::reserve() { +template <> +const Reg16& jit_kernel::reserve() { return reserveReg(_free_x64regs, x16regs()); } -template<> -const Reg8 & jit_kernel::reserve() { +template <> +const Reg8& jit_kernel::reserve() { return reserveReg(_free_x64regs, x8regs()); } -template<> -void jit_kernel::free(const Reg64 & reg) { +template <> +void jit_kernel::free(const Reg64& reg) { freeReg(_free_x64regs, x64regs(), reg); } -template<> -void jit_kernel::free(const Reg32 & reg) { +template <> +void jit_kernel::free(const Reg32& reg) { freeReg(_free_x64regs, x32regs(), reg); } -template<> -void jit_kernel::free(const Reg16 & reg) { +template <> +void jit_kernel::free(const Reg16& reg) { freeReg(_free_x64regs, x16regs(), reg); } -template<> -void jit_kernel::free(const Reg8 & reg) { +template <> +void jit_kernel::free(const Reg8& reg) { freeReg(_free_x64regs, x8regs(), reg); } -template<> -const Xmm & jit_kernel::reserve() { +template <> +const Xmm& jit_kernel::reserve() { return reserveReg(_free_rmmregs, xmmregs()); } -template<> -void jit_kernel::free(const Xmm & reg) { +template <> +void jit_kernel::free(const Xmm& reg) { freeReg(_free_rmmregs, xmmregs(), reg); } -template<> -const Ymm & jit_kernel::reserve() { +template <> +const Ymm& jit_kernel::reserve() { return reserveReg(_free_rmmregs, ymmregs()); } -template<> -void jit_kernel::free(const Ymm & reg) { +template <> +void jit_kernel::free(const Ymm& reg) { freeReg(_free_rmmregs, ymmregs(), reg); } -template<> -const Zmm & jit_kernel::reserve() { +template <> +const Zmm& jit_kernel::reserve() { return reserveReg(_free_rmmregs, zmmregs()); } -template<> -void jit_kernel::free(const Zmm & reg) { +template <> +void jit_kernel::free(const Zmm& reg) { freeReg(_free_rmmregs, zmmregs(), reg); } @@ -317,26 +402,33 @@ void jit_kernel::postamble() { } } -const AddressFrame & jit_kernel::address_frame(size_t size) const { - switch (size) { - case 1: return byte; - case 2: return word; - case 4: return dword; - case 8: return qword; 
- case 16: return xword; - case 32: return yword; - case 64: return zword; - default: - break; - } - return ptr; +const AddressFrame& jit_kernel::address_frame(size_t size) const { + switch (size) { + case 1: + return byte; + case 2: + return word; + case 4: + return dword; + case 8: + return qword; + case 16: + return xword; + case 32: + return yword; + case 64: + return zword; + default: + break; + } + return ptr; } -const jit_kernel::reg_indices & jit_kernel::free_x64regs() const { +const jit_kernel::reg_indices& jit_kernel::free_x64regs() const { return _free_x64regs; } -const jit_kernel::reg_indices & jit_kernel::free_rmmregs() const { +const jit_kernel::reg_indices& jit_kernel::free_rmmregs() const { return _free_rmmregs; } @@ -386,5 +478,5 @@ void jit_kernel::uni_vblendps(const Xbyak::Zmm& z1, const Xbyak::Zmm& z2, uint16 vblendmps(z1 | k1, z1, z2); } -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/kernels/x64/jit_kernel.hpp b/src/plugins/intel_cpu/src/nodes/kernels/x64/jit_kernel.hpp index 8934bf5dff052b..0073ca91d0b76f 100644 --- a/src/plugins/intel_cpu/src/nodes/kernels/x64/jit_kernel.hpp +++ b/src/plugins/intel_cpu/src/nodes/kernels/x64/jit_kernel.hpp @@ -3,14 +3,15 @@ // #pragma once -#include "cpu/x64/jit_generator.hpp" -#include "emitters/plugin/x64/jit_load_store_emitters.hpp" +#include #include -#include #include -#include -#include #include +#include +#include + +#include "cpu/x64/jit_generator.hpp" +#include "emitters/plugin/x64/jit_load_store_emitters.hpp" namespace ov { namespace intel_cpu { @@ -19,113 +20,103 @@ struct jit_kernel; namespace internal { -template +template struct reg_traits_by_size; -template +template struct reg_traits; -template +template struct reg_traits; -template +template struct isa_traits; -template<> +template <> struct reg_traits_by_size<1> { using type = Xbyak::Reg8; - constexpr static size_t size = 1; // in bytes - constexpr static dnnl::impl::cpu::x64::cpu_isa_t isa - = dnnl::impl::cpu::x64::cpu_isa_t::isa_undef; + constexpr static size_t size = 1; // in bytes + constexpr static dnnl::impl::cpu::x64::cpu_isa_t isa = dnnl::impl::cpu::x64::cpu_isa_t::isa_undef; }; -template<> +template <> struct reg_traits_by_size<2> { using type = Xbyak::Reg16; - constexpr static size_t size = 2; // in bytes - constexpr static dnnl::impl::cpu::x64::cpu_isa_t isa - = dnnl::impl::cpu::x64::cpu_isa_t::isa_undef; + constexpr static size_t size = 2; // in bytes + constexpr static dnnl::impl::cpu::x64::cpu_isa_t isa = dnnl::impl::cpu::x64::cpu_isa_t::isa_undef; }; -template<> +template <> struct reg_traits_by_size<4> { using type = Xbyak::Reg32; - constexpr static size_t size = 4; // in bytes - constexpr static dnnl::impl::cpu::x64::cpu_isa_t isa - = dnnl::impl::cpu::x64::cpu_isa_t::isa_undef; + constexpr static size_t size = 4; // in bytes + constexpr static dnnl::impl::cpu::x64::cpu_isa_t isa = dnnl::impl::cpu::x64::cpu_isa_t::isa_undef; }; -template<> +template <> struct reg_traits_by_size<8> { using type = Xbyak::Reg64; - constexpr static size_t size = 8; // in bytes - constexpr static dnnl::impl::cpu::x64::cpu_isa_t isa - = dnnl::impl::cpu::x64::cpu_isa_t::isa_undef; + constexpr static size_t size = 8; // in bytes + constexpr static dnnl::impl::cpu::x64::cpu_isa_t isa = dnnl::impl::cpu::x64::cpu_isa_t::isa_undef; }; -template<> +template <> struct reg_traits_by_size<16> { using type = Xbyak::Xmm; - constexpr static size_t size = 16; // in bytes - constexpr static 
dnnl::impl::cpu::x64::cpu_isa_t isa - = dnnl::impl::cpu::x64::cpu_isa_t::sse41; + constexpr static size_t size = 16; // in bytes + constexpr static dnnl::impl::cpu::x64::cpu_isa_t isa = dnnl::impl::cpu::x64::cpu_isa_t::sse41; }; -template<> +template <> struct reg_traits_by_size<32> { using type = Xbyak::Ymm; - constexpr static size_t size = 32; // in bytes - constexpr static dnnl::impl::cpu::x64::cpu_isa_t isa - = dnnl::impl::cpu::x64::cpu_isa_t::avx2; + constexpr static size_t size = 32; // in bytes + constexpr static dnnl::impl::cpu::x64::cpu_isa_t isa = dnnl::impl::cpu::x64::cpu_isa_t::avx2; }; -template<> +template <> struct reg_traits_by_size<64> { using type = Xbyak::Zmm; - constexpr static size_t size = 64; // in bytes - constexpr static dnnl::impl::cpu::x64::cpu_isa_t isa - = dnnl::impl::cpu::x64::cpu_isa_t::avx512_core; + constexpr static size_t size = 64; // in bytes + constexpr static dnnl::impl::cpu::x64::cpu_isa_t isa = dnnl::impl::cpu::x64::cpu_isa_t::avx512_core; }; -template +template struct reg_traits : public reg_traits_by_size {}; -template +template struct vec_min_size { - constexpr static size_t size = N <= 16 ? 16 : - N <= 32 ? 32 : - 64; + constexpr static size_t size = N <= 16 ? 16 : N <= 32 ? 32 : 64; }; -template +template struct reg_traits : public reg_traits_by_size::size> {}; -template<> +template <> struct reg_traits { using type = Xbyak::Fpu; - constexpr static size_t size = 10; // in bytes - constexpr static dnnl::impl::cpu::x64::cpu_isa_t isa - = dnnl::impl::cpu::x64::cpu_isa_t::isa_undef; + constexpr static size_t size = 10; // in bytes + constexpr static dnnl::impl::cpu::x64::cpu_isa_t isa = dnnl::impl::cpu::x64::cpu_isa_t::isa_undef; }; -template<> +template <> struct reg_traits : public reg_traits {}; -template<> +template <> struct isa_traits { struct reg { using type = Xbyak::Xmm; - constexpr static size_t size = 4 * 4; // in bytes - constexpr static size_t length = 4; // in dwords + constexpr static size_t size = 4 * 4; // in bytes + constexpr static size_t length = 4; // in dwords }; }; -template<> +template <> struct isa_traits { struct reg { using type = Xbyak::Ymm; - constexpr static size_t size = 8 * 4; // in bytes - constexpr static size_t length = 8; // in dwords + constexpr static size_t size = 8 * 4; // in bytes + constexpr static size_t length = 8; // in dwords }; }; -template<> +template <> struct isa_traits { struct reg { using type = Xbyak::Zmm; @@ -134,39 +125,39 @@ struct isa_traits { }; }; -template +template class variable; -template +template class if_expression; -template +template class then_expression; -template +template using shared_reg = std::shared_ptr; -template -shared_reg make_shared(Reg & reg, jit_kernel & kernel); +template +shared_reg make_shared(Reg& reg, jit_kernel& kernel); -template +template class boolean_expression { public: using reg_type = const typename reg_traits::type; enum class type { - eq, // == - neq, // != - ls, // < - gt, // > - le, // <= - ge // >= + eq, // == + neq, // != + ls, // < + gt, // > + le, // <= + ge // >= }; - boolean_expression(jit_kernel & kernel, type t, const shared_reg & lhs, const shared_reg & rhs); - boolean_expression(jit_kernel & kernel, type t, const shared_reg & lhs, T rhs); + boolean_expression(jit_kernel& kernel, type t, const shared_reg& lhs, const shared_reg& rhs); + boolean_expression(jit_kernel& kernel, type t, const shared_reg& lhs, T rhs); private: - void cmp(const Xbyak::Label & exit) const; + void cmp(const Xbyak::Label& exit) const; - jit_kernel & _kernel; + 
jit_kernel& _kernel; type _type; shared_reg _lhs; shared_reg _rhs; @@ -176,33 +167,33 @@ class boolean_expression { friend class then_expression; }; -template +template class then_expression { public: - then_expression(if_expression & expr); + then_expression(if_expression& expr); - template - void _else(F && fn); + template + void _else(F&& fn); private: - if_expression & _if_expr; + if_expression& _if_expr; }; -template +template class if_expression { public: - if_expression(const boolean_expression & expr) - : _expr(expr) {} + if_expression(const boolean_expression& expr) : _expr(expr) {} ~if_expression() { try { if (!_is_exit_valid) _expr._kernel.assignL(_exit, _else); - } catch(...) {} + } catch (...) { + } } - template - then_expression _then(F && fn) { + template + then_expression _then(F&& fn) { using namespace Xbyak; _expr.cmp(_else); @@ -214,7 +205,7 @@ class if_expression { } private: - const boolean_expression & _expr; + const boolean_expression& _expr; Xbyak::Label _exit; Xbyak::Label _else; bool _is_exit_valid = false; @@ -222,287 +213,291 @@ class if_expression { friend class then_expression; }; -typedef struct register_tag {} register_tag; -typedef struct memory_tag {} memory_tag; +typedef struct register_tag { +} register_tag; +typedef struct memory_tag { +} memory_tag; -template +template class variable_base; -template +template class variable_base { public: using reg_type = const typename reg_traits::type; - variable_base & operator = (const variable_base &) = delete; + variable_base& operator=(const variable_base&) = delete; - variable_base(const variable_base &); - variable_base(variable_base &&); + variable_base(const variable_base&); + variable_base(variable_base&&); - reg_type & reg() const { - return *_reg; + reg_type& reg() const { + return *_reg; } - const shared_reg & shreg() const { + const shared_reg& shreg() const { return _reg; } - operator reg_type &() const { + operator reg_type&() const { return reg(); } - operator Xbyak::RegExp () const { + operator Xbyak::RegExp() const { return reg(); } protected: - variable_base(jit_kernel & krnl, const shared_reg & reg); + variable_base(jit_kernel& krnl, const shared_reg& reg); ~variable_base() = default; - jit_kernel & _kernel; + jit_kernel& _kernel; shared_reg _reg; }; -template +template class variable_base { public: using reg_type = const typename reg_traits::type; - variable_base & operator = (const variable_base &) = delete; + variable_base& operator=(const variable_base&) = delete; - variable_base(const variable_base &); - variable_base(variable_base &&); + variable_base(const variable_base&); + variable_base(variable_base&&); - reg_type & reg() const { - return *_addr; + reg_type& reg() const { + return *_addr; } protected: - variable_base(jit_kernel & krnl, const shared_reg & addr); + variable_base(jit_kernel& krnl, const shared_reg& addr); ~variable_base() = default; - jit_kernel & _kernel; + jit_kernel& _kernel; shared_reg _addr; }; -template -class variable : public variable_base::value, T>::type, register_tag> { +template +class variable + : public variable_base::value, T>::type, register_tag> { public: using type = T; using base = variable_base; using reg_type = const typename base::reg_type; using arithmetic_type = typename std::conditional::value, size_t, T>::type; - variable(variable &&) = default; - variable(jit_kernel & krnl); - variable(jit_kernel & krnl, const shared_reg & reg); + variable(variable&&) = default; + variable(jit_kernel& krnl); + variable(jit_kernel& krnl, const shared_reg& reg); - 
typename std::conditional::value - && !std::is_pointer::type>::value, - variable::type, memory_tag>, void>::type - operator *() const { + typename std::conditional::value && + !std::is_pointer::type>::value, + variable::type, memory_tag>, + void>::type + operator*() const { return variable::type, memory_tag>(base::_kernel, base::shreg()); } - const variable & operator = (reg_type & rhs) const { + const variable& operator=(reg_type& rhs) const { base::_kernel.mov(base::reg(), rhs); return *this; } - template - const variable & operator = (U *rhs) const { + template + const variable& operator=(U* rhs) const { // interpret pointers as size_t base::_kernel.mov(base::reg(), reinterpret_cast(rhs)); return *this; } - const variable & operator = (arithmetic_type rhs) const { + const variable& operator=(arithmetic_type rhs) const { base::_kernel.mov(base::reg(), static_cast(rhs)); return *this; } - const variable & operator += (reg_type & rhs) const { + const variable& operator+=(reg_type& rhs) const { base::_kernel.add(base::reg(), rhs); return *this; } - variable operator + (reg_type & rhs) const { + variable operator+(reg_type& rhs) const { variable res(base::_kernel); res = base::reg(); res += rhs; return res; } - const variable & operator += (arithmetic_type rhs) const { + const variable& operator+=(arithmetic_type rhs) const { base::_kernel.add(base::reg(), rhs); return *this; } - variable operator + (arithmetic_type rhs) const { + variable operator+(arithmetic_type rhs) const { variable res(base::_kernel); res = base::reg(); res += rhs; return res; } - const variable & operator -= (reg_type & rhs) const { + const variable& operator-=(reg_type& rhs) const { base::_kernel.sub(base::reg(), rhs); return *this; } - variable operator - (reg_type & rhs) const { + variable operator-(reg_type& rhs) const { variable res(base::_kernel); res = base::reg(); res -= rhs; return res; } - const variable & operator -= (arithmetic_type rhs) const { + const variable& operator-=(arithmetic_type rhs) const { base::_kernel.sub(base::reg(), rhs); return *this; } - variable operator - (arithmetic_type rhs) const { + variable operator-(arithmetic_type rhs) const { variable res(base::_kernel); res = base::reg(); res -= rhs; return res; } - const variable & operator *= (reg_type & rhs) const { + const variable& operator*=(reg_type& rhs) const { base::_kernel.imul(base::reg(), rhs); return *this; } - variable operator * (reg_type & rhs) const { + variable operator*(reg_type& rhs) const { variable res(base::_kernel); res = base::reg(); res *= rhs; return res; } - const variable & operator *= (arithmetic_type rhs) const { + const variable& operator*=(arithmetic_type rhs) const { base::_kernel.imul(base::reg(), base::reg(), static_cast(rhs)); return *this; } - variable operator * (arithmetic_type rhs) const { + variable operator*(arithmetic_type rhs) const { variable res(base::_kernel); res = base::reg(); res *= rhs; return res; } - const variable & operator &= (reg_type & rhs) const { + const variable& operator&=(reg_type& rhs) const { base::_kernel.and_(base::reg(), rhs); return *this; } - variable operator & (reg_type & rhs) const { + variable operator&(reg_type& rhs) const { variable res(base::_kernel); res = base::reg(); res &= rhs; return res; } - const variable & operator &= (T rhs) const { + const variable& operator&=(T rhs) const { base::_kernel.and_(base::reg(), rhs); return *this; } - variable operator & (T rhs) const { + variable operator&(T rhs) const { variable res(base::_kernel); res = base::reg(); res &= 
rhs; return res; } - const variable & operator |= (reg_type & rhs) const { + const variable& operator|=(reg_type& rhs) const { base::_kernel.or_(base::reg(), rhs); return *this; } - variable operator | (reg_type & rhs) const { + variable operator|(reg_type& rhs) const { variable res(base::_kernel); res = base::reg(); res |= rhs; return res; } - const variable & operator |= (T rhs) const { + const variable& operator|=(T rhs) const { base::_kernel.or_(base::reg(), rhs); return *this; } - variable operator | (T rhs) const { + variable operator|(T rhs) const { variable res(base::_kernel); res = base::reg(); res |= rhs; return res; } - const variable & operator >>= (size_t rhs) const { + const variable& operator>>=(size_t rhs) const { base::_kernel.shr(base::reg(), rhs); return *this; } - variable operator >> (size_t rhs) const { + variable operator>>(size_t rhs) const { variable res(base::_kernel); res = base::reg(); res >>= rhs; return res; } - const variable & operator <<= (size_t rhs) const { + const variable& operator<<=(size_t rhs) const { base::_kernel.shl(base::reg(), rhs); return *this; } - variable operator << (size_t rhs) const { + variable operator<<(size_t rhs) const { variable res(base::_kernel); res = base::reg(); res <<= rhs; return res; } - boolean_expression operator == (const variable & rhs) const { + boolean_expression operator==(const variable& rhs) const { return boolean_expression(base::_kernel, boolean_expression::type::eq, base::shreg(), rhs.shreg()); } - boolean_expression operator == (T rhs) const { + boolean_expression operator==(T rhs) const { return boolean_expression(base::_kernel, boolean_expression::type::eq, base::shreg(), rhs); } - boolean_expression operator != (const variable & rhs) const { + boolean_expression operator!=(const variable& rhs) const { return boolean_expression(base::_kernel, boolean_expression::type::neq, base::shreg(), rhs.shreg()); } - boolean_expression operator != (T rhs) const { + boolean_expression operator!=(T rhs) const { return boolean_expression(base::_kernel, boolean_expression::type::neq, base::shreg(), rhs); } - boolean_expression operator < (const variable & rhs) const { + boolean_expression operator<(const variable& rhs) const { return boolean_expression(base::_kernel, boolean_expression::type::ls, base::shreg(), rhs.shreg()); } - boolean_expression operator < (T rhs) const { + boolean_expression operator<(T rhs) const { return boolean_expression(base::_kernel, boolean_expression::type::ls, base::shreg(), rhs); } - boolean_expression operator > (const variable & rhs) const { + boolean_expression operator>(const variable& rhs) const { return boolean_expression(base::_kernel, boolean_expression::type::gt, base::shreg(), rhs.shreg()); } - boolean_expression operator > (T rhs) const { + boolean_expression operator>(T rhs) const { return boolean_expression(base::_kernel, boolean_expression::type::gt, base::shreg(), rhs); } - boolean_expression operator <= (const variable & rhs) const { + boolean_expression operator<=(const variable& rhs) const { return boolean_expression(base::_kernel, boolean_expression::type::le, base::shreg(), rhs.shreg()); } - boolean_expression operator <= (T rhs) const { + boolean_expression operator<=(T rhs) const { return boolean_expression(base::_kernel, boolean_expression::type::le, base::shreg(), rhs); } - boolean_expression operator >= (const variable & rhs) const { + boolean_expression operator>=(const variable& rhs) const { return boolean_expression(base::_kernel, boolean_expression::type::ge, 
base::shreg(), rhs.shreg()); } - boolean_expression operator >= (T rhs) const { + boolean_expression operator>=(T rhs) const { return boolean_expression(base::_kernel, boolean_expression::type::ge, base::shreg(), rhs); } // TODO: add necessary operations }; -template +template class variable : public variable_base { public: using type = T; using base = variable_base; using reg_type = const typename base::reg_type; - variable(variable &&) = default; - variable(jit_kernel & krnl, const shared_reg & reg); + variable(variable&&) = default; + variable(jit_kernel& krnl, const shared_reg& reg); - const variable & operator = (const variable & rhs) const; + const variable& operator=(const variable& rhs) const; }; -template +template class variable : public variable_base { public: using type = T[N]; @@ -510,34 +505,34 @@ class variable : public variable_base { using reg_type = const typename base::reg_type; constexpr static size_t length = N; - variable(variable &&) = default; - variable(jit_kernel & krnl); - variable(jit_kernel & krnl, const shared_reg & reg); + variable(variable&&) = default; + variable(jit_kernel& krnl); + variable(jit_kernel& krnl, const shared_reg& reg); - const variable & operator = (reg_type & rhs) const { + const variable& operator=(reg_type& rhs) const { base::_kernel.uni_vmovups(base::reg(), rhs); return *this; } - const variable & operator = (const type & rhs) const { - const type & cref = base::_kernel.constant(rhs); + const variable& operator=(const type& rhs) const { + const type& cref = base::_kernel.constant(rhs); variable creg(base::_kernel); creg = &cref; base::_kernel.uni_vmovdqu(base::reg(), base::_kernel.ptr[creg]); return *this; } - const variable & blend(reg_type & rhs, uint16_t mask) const { + const variable& blend(reg_type& rhs, uint16_t mask) const { base::_kernel.uni_vblendps(base::reg(), rhs, mask); return *this; } - const variable & permute(const std::array & order) const { + const variable& permute(const std::array& order) const { base::_kernel.uni_vpermps(base::reg(), order.data(), base::reg()); return *this; } - const variable & permute(const uint8_t * order) const { + const variable& permute(const uint8_t* order) const { base::_kernel.uni_vpermps(base::reg(), order, base::reg()); return *this; } @@ -546,139 +541,132 @@ class variable : public variable_base { }; class stack_frame { - stack_frame(const stack_frame &) = delete; - stack_frame & operator = (const stack_frame &) = delete; + stack_frame(const stack_frame&) = delete; + stack_frame& operator=(const stack_frame&) = delete; public: - stack_frame(jit_kernel & kernel, size_t size, uint32_t alignment = 1); - stack_frame(stack_frame && rhs); + stack_frame(jit_kernel& kernel, size_t size, uint32_t alignment = 1); + stack_frame(stack_frame&& rhs); ~stack_frame(); - const Xbyak::Reg64 & pointer() const; + const Xbyak::Reg64& pointer() const; void clear() const; private: - jit_kernel & _kernel; + jit_kernel& _kernel; size_t _size; uint32_t _alignment; }; -template +template ov::element::Type type2precision(); dnnl::impl::cpu::x64::cpu_isa_t get_current_isa(); class consts_table { - consts_table(const consts_table &) = delete; - consts_table & operator = (const consts_table &) = delete; + consts_table(const consts_table&) = delete; + consts_table& operator=(const consts_table&) = delete; public: consts_table() = default; - const void * store(const void *data, size_t size); + const void* store(const void* data, size_t size); private: static constexpr const size_t chunk_size = 512; using chunk = 
std::array; std::list _chunks; - size_t _size {}; + size_t _size{}; }; -} // namespace internal +} // namespace internal struct jit_kernel : public dnnl::impl::cpu::x64::jit_generator { using reg_indices = std::vector; - template + template using reg_traits = internal::reg_traits; - template + template using reg_traits_by_size = internal::reg_traits_by_size; - template + template using isa_traits = internal::isa_traits; using stack_frame = internal::stack_frame; using register_tag = internal::register_tag; using memory_tag = internal::memory_tag; - template + template using variable = internal::variable; - template + template using if_expression = internal::if_expression; - template + template using boolean_expression = internal::boolean_expression; - template + template Xbyak::Address argPtr(U T::*member) const { auto memPtr = &(reinterpret_cast(0)->*member); - const size_t offs = reinterpret_cast(memPtr) - reinterpret_cast(0); + const size_t offs = reinterpret_cast(memPtr) - reinterpret_cast(0); return address_frame(sizeof(U))[param1 + offs]; } - template + template variable arg(U T::*member) { using traits = internal::reg_traits; using reg_type = typename traits::type; - const auto & res = reserve(); + const auto& res = reserve(); if (sizeof(T) < traits::size) movzx(res, argPtr(member)); else mov(res, argPtr(member)); - return { *this, internal::make_shared(res, *this) }; + return {*this, internal::make_shared(res, *this)}; } - template + template variable arg(U T::*member) { using traits = internal::reg_traits; using reg_type = typename traits::type; - const auto & res = reserve(); + const auto& res = reserve(); if (sizeof(T) < traits::size) movzx(res, argPtr(member)); else mov(res, argPtr(member)); - return { *this, internal::make_shared(res, *this) }; - } - - jit_kernel(const char *name); - - template - const RegType & reserve(); - - template - void free(const RegType & reg); - - template - void copy(const Xbyak::Reg64& dst, - const Xbyak::Reg64& src, - const Xbyak::Reg64& size); - template - void copy(const Xbyak::Address& dst, - const Xbyak::Reg64& src, - const Xbyak::Reg64& size); - - template - void load(const variable & dst, const variable & src, size_t length = N); - template - void load(const variable & dst, const variable & src, const variable & length); - template - void store(const variable & dst, const variable & src, size_t length = N); - template - void store(const variable & dst, const variable & src, const variable & length); - - template - void foreach(const B & begin, - const E & end, - std::function&)> && fn, - const S & step = 1); - - template + return {*this, internal::make_shared(res, *this)}; + } + + jit_kernel(const char* name); + + template + const RegType& reserve(); + + template + void free(const RegType& reg); + + template + void copy(const Xbyak::Reg64& dst, const Xbyak::Reg64& src, const Xbyak::Reg64& size); + template + void copy(const Xbyak::Address& dst, const Xbyak::Reg64& src, const Xbyak::Reg64& size); + + template + void load(const variable& dst, const variable& src, size_t length = N); + template + void load(const variable& dst, const variable& src, const variable& length); + template + void store(const variable& dst, const variable& src, size_t length = N); + template + void store(const variable& dst, const variable& src, const variable& length); + + template + void foreach (const B& begin, const E& end, std::function&)> && fn, const S& step = 1); + + template variable var(); - template - variable var(const T & val); + template + variable 
var(const T& val); - template - const T & constant(const T & c); - template - const T * constant(const T * c, size_t size); + template + const T& constant(const T& c); + template + const T* constant(const T* c, size_t size); stack_frame stack(size_t size, uint32_t alignment = 1); - template - if_expression _if(const boolean_expression & expr) const; + template + if_expression _if(const boolean_expression& expr) const; void uni_vpermps(const Xbyak::Xmm& x1, const uint8_t mask[4], const Xbyak::Operand& op); void uni_vpermps(const Xbyak::Ymm& y1, const uint8_t mask[8], const Xbyak::Operand& op); @@ -689,9 +677,9 @@ struct jit_kernel : public dnnl::impl::cpu::x64::jit_generator { void postamble(); - const Xbyak::AddressFrame & address_frame(size_t size) const; - const reg_indices & free_x64regs() const; - const reg_indices & free_rmmregs() const; + const Xbyak::AddressFrame& address_frame(size_t size) const; + const reg_indices& free_x64regs() const; + const reg_indices& free_rmmregs() const; private: reg_indices _free_x64regs; @@ -703,44 +691,40 @@ struct jit_kernel : public dnnl::impl::cpu::x64::jit_generator { template <> const Xbyak::Reg64& jit_kernel::reserve(); -template -void jit_kernel::copy(const Xbyak::Reg64& dst, - const Xbyak::Reg64& src, - const Xbyak::Reg64& size) { - const auto & addr_frame = address_frame(sizeof(T)); +template +void jit_kernel::copy(const Xbyak::Reg64& dst, const Xbyak::Reg64& src, const Xbyak::Reg64& size) { + const auto& addr_frame = address_frame(sizeof(T)); auto p = reserve::type>(); - foreach(0, size, [&](const Xbyak::Reg64& idx) { + foreach (0, size, [&](const Xbyak::Reg64& idx) { mov(p, addr_frame[src + idx * sizeof(T)]); mov(addr_frame[dst + idx * sizeof(T)], p); - }); + }) + ; free(p); } -template -void jit_kernel::copy(const Xbyak::Address& dst, - const Xbyak::Reg64& src, - const Xbyak::Reg64& size) { - const auto & addr_frame = address_frame(sizeof(T)); +template +void jit_kernel::copy(const Xbyak::Address& dst, const Xbyak::Reg64& src, const Xbyak::Reg64& size) { + const auto& addr_frame = address_frame(sizeof(T)); auto p = reserve::type>(); auto d = reserve(); lea(d, dst); - foreach(0, size, [&](const Xbyak::Reg64& idx) { + foreach (0, size, [&](const Xbyak::Reg64& idx) { mov(p, addr_frame[src + idx * sizeof(T)]); mov(addr_frame[d + idx * sizeof(T)], p); - }); + }) + ; free(d); free(p); } -template -void jit_kernel::load(const variable & dst, const variable & src, size_t length) { +template +void jit_kernel::load(const variable& dst, const variable& src, size_t length) { static_assert(std::is_same::reg_type, const Xbyak::Reg64>::value, - "Source register must be Reg64"); + "Source register must be Reg64"); - using src_type = typename std::remove_cv< - typename std::remove_pointer::type>::type; - using dst_type = typename std::remove_cv< - typename std::remove_pointer::type>::type; + using src_type = typename std::remove_cv::type>::type; + using dst_type = typename std::remove_cv::type>::type; const std::vector pool_vec_idxs(_free_rmmregs.begin(), _free_rmmregs.end()); const std::vector pool_gpr_idxs(_free_x64regs.begin(), _free_x64regs.end()); @@ -752,17 +736,15 @@ void jit_kernel::load(const variable & dst, const variable & src, if (!_emitters[key]) { _emitters[key].reset(new jit_load_emitter(this, internal::get_current_isa(), src_prc, dst_prc, length)); } - _emitters[key]->emit_code( - { static_cast(static_cast(src).getIdx()) }, - { static_cast(static_cast(dst).getIdx()) }, - pool_vec_idxs, - pool_gpr_idxs); + 
_emitters[key]->emit_code({static_cast(static_cast(src).getIdx())}, + {static_cast(static_cast(dst).getIdx())}, + pool_vec_idxs, + pool_gpr_idxs); } -template -void jit_kernel::load(const variable & dst, const variable & src, const variable & length) { - using src_type = typename std::remove_cv< - typename std::remove_pointer::type>::type; +template +void jit_kernel::load(const variable& dst, const variable& src, const variable& length) { + using src_type = typename std::remove_cv::type>::type; auto s = stack(N * sizeof(src_type)); s.clear(); @@ -775,15 +757,13 @@ void jit_kernel::load(const variable & dst, const variable & src, load(dst, tmp); } -template -void jit_kernel::store(const variable & dst, const variable & src, size_t length) { +template +void jit_kernel::store(const variable& dst, const variable& src, size_t length) { static_assert(std::is_same::reg_type, const Xbyak::Reg64>::value, - "Destination register must be Reg64"); + "Destination register must be Reg64"); - using src_type = typename std::remove_cv< - typename std::remove_pointer::type>::type; - using dst_type = typename std::remove_cv< - typename std::remove_pointer::type>::type; + using src_type = typename std::remove_cv::type>::type; + using dst_type = typename std::remove_cv::type>::type; const std::vector pool_vec_idxs(_free_rmmregs.begin(), _free_rmmregs.end()); const std::vector pool_gpr_idxs(_free_x64regs.begin(), _free_x64regs.end()); @@ -795,17 +775,15 @@ void jit_kernel::store(const variable & dst, const variable & src if (!_emitters[key]) { _emitters[key].reset(new jit_store_emitter(this, internal::get_current_isa(), src_prc, dst_prc, length)); } - _emitters[key]->emit_code( - { static_cast(static_cast(src).getIdx()) }, - { static_cast(static_cast(dst).getIdx()) }, - pool_vec_idxs, - pool_gpr_idxs); + _emitters[key]->emit_code({static_cast(static_cast(src).getIdx())}, + {static_cast(static_cast(dst).getIdx())}, + pool_vec_idxs, + pool_gpr_idxs); } -template -void jit_kernel::store(const variable & dst, const variable & src, const variable & length) { - using dst_type = typename std::remove_cv< - typename std::remove_pointer::type>::type; +template +void jit_kernel::store(const variable& dst, const variable& src, const variable& length) { + using dst_type = typename std::remove_cv::type>::type; auto s = stack(N * sizeof(dst_type)); @@ -817,11 +795,11 @@ void jit_kernel::store(const variable & dst, const variable & src copy(dst, tmp, length); } -template -void jit_kernel::foreach(const B & begin, - const E & end, - std::function&)> && fn, - const S & step) { +template +void jit_kernel::foreach (const B& begin, + const E& end, + std::function&)> && fn, + const S& step) { using namespace Xbyak; Label loop, exit; @@ -841,36 +819,36 @@ void jit_kernel::foreach(const B & begin, L(exit); } -template +template jit_kernel::variable jit_kernel::var() { using reg_type = typename reg_traits::type; - const auto & reg = reserve(); + const auto& reg = reserve(); return variable(*this, internal::make_shared(reg, *this)); } -template -jit_kernel::variable jit_kernel::var(const T & val) { +template +jit_kernel::variable jit_kernel::var(const T& val) { using reg_type = typename reg_traits::type; - const auto & reg = reserve(); + const auto& reg = reserve(); variable res(*this, internal::make_shared(reg, *this)); res = val; return res; } -template -const T & jit_kernel::constant(const T & c) { +template +const T& jit_kernel::constant(const T& c) { auto res = _consts.store(&c, sizeof c); return *reinterpret_cast(res); } -template 
-const T * jit_kernel::constant(const T * c, size_t size) { +template +const T* jit_kernel::constant(const T* c, size_t size) { auto res = _consts.store(c, size * sizeof(T)); return reinterpret_cast(res); } -template -jit_kernel::if_expression jit_kernel::_if(const boolean_expression & expr) const { +template +jit_kernel::if_expression jit_kernel::_if(const boolean_expression& expr) const { return if_expression(expr); } @@ -879,12 +857,13 @@ namespace internal { // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ // shared_reg -template -shared_reg make_shared(Reg & reg, jit_kernel & kernel) { - std::shared_ptr ptr(&reg, [&kernel](Reg *preg) { +template +shared_reg make_shared(Reg& reg, jit_kernel& kernel) { + std::shared_ptr ptr(&reg, [&kernel](Reg* preg) { try { kernel.free(*preg); - } catch(...) {} + } catch (...) { + } }); return ptr; } @@ -892,68 +871,68 @@ shared_reg make_shared(Reg & reg, jit_kernel & kernel) { // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ // boolean_expression -template -boolean_expression::boolean_expression(jit_kernel & kernel, type t, const shared_reg & lhs, const shared_reg & rhs) - : _kernel(kernel) - , _type(t) - , _lhs(lhs) - , _rhs(rhs) - , _rvalue {} { -} - -template -boolean_expression::boolean_expression(jit_kernel & kernel, type t, const shared_reg & lhs, T rhs) - : _kernel(kernel) - , _type(t) - , _lhs(lhs) - , _rvalue(rhs) { -} - -template -void boolean_expression::cmp(const Xbyak::Label & exit) const { +template +boolean_expression::boolean_expression(jit_kernel& kernel, + type t, + const shared_reg& lhs, + const shared_reg& rhs) + : _kernel(kernel), + _type(t), + _lhs(lhs), + _rhs(rhs), + _rvalue{} {} + +template +boolean_expression::boolean_expression(jit_kernel& kernel, type t, const shared_reg& lhs, T rhs) + : _kernel(kernel), + _type(t), + _lhs(lhs), + _rvalue(rhs) {} + +template +void boolean_expression::cmp(const Xbyak::Label& exit) const { if (_rhs) _kernel.cmp(*_lhs, *_rhs); else _kernel.cmp(*_lhs, _rvalue); switch (_type) { - case type::eq: { - _kernel.jne(exit, Xbyak::CodeGenerator::T_NEAR); - break; - } - case type::neq: { - _kernel.je(exit, Xbyak::CodeGenerator::T_NEAR); - break; - } - case type::ls: { - _kernel.jge(exit, Xbyak::CodeGenerator::T_NEAR); - break; - } - case type::gt: { - _kernel.jle(exit, Xbyak::CodeGenerator::T_NEAR); - break; - } - case type::le: { - _kernel.jg(exit, Xbyak::CodeGenerator::T_NEAR); - break; - } - case type::ge: { - _kernel.jl(exit, Xbyak::CodeGenerator::T_NEAR); - break; - } + case type::eq: { + _kernel.jne(exit, Xbyak::CodeGenerator::T_NEAR); + break; + } + case type::neq: { + _kernel.je(exit, Xbyak::CodeGenerator::T_NEAR); + break; + } + case type::ls: { + _kernel.jge(exit, Xbyak::CodeGenerator::T_NEAR); + break; + } + case type::gt: { + _kernel.jle(exit, Xbyak::CodeGenerator::T_NEAR); + break; + } + case type::le: { + _kernel.jg(exit, Xbyak::CodeGenerator::T_NEAR); + break; + } + case type::ge: { + _kernel.jl(exit, Xbyak::CodeGenerator::T_NEAR); + break; + } } } // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ // then_expression -template -then_expression::then_expression(if_expression & expr) - : _if_expr(expr) {} +template +then_expression::then_expression(if_expression& expr) : _if_expr(expr) {} -template -template -void then_expression::_else(F && fn) { +template +template +void then_expression::_else(F&& fn) { fn(); _if_expr._expr._kernel.L(_if_expr._exit); _if_expr._is_exit_valid = true; @@ -962,75 +941,57 @@ void then_expression::_else(F && fn) { //
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ // variable -template -variable_base::variable_base(jit_kernel & krnl, const shared_reg & reg) - : _kernel(krnl) - , _reg(reg) { -} +template +variable_base::variable_base(jit_kernel& krnl, const shared_reg& reg) + : _kernel(krnl), + _reg(reg) {} -template -variable_base::variable_base(const variable_base & rhs) - : _kernel(rhs._kernel) - , _reg(rhs._reg) { -} +template +variable_base::variable_base(const variable_base& rhs) : _kernel(rhs._kernel), + _reg(rhs._reg) {} -template -variable_base::variable_base(variable_base && rhs) - : _kernel(rhs._kernel) - , _reg(std::move(rhs._reg)) { -} +template +variable_base::variable_base(variable_base&& rhs) : _kernel(rhs._kernel), + _reg(std::move(rhs._reg)) {} -template -variable_base::variable_base(jit_kernel & krnl, const shared_reg & addr) - : _kernel(krnl) - , _addr(addr) { -} +template +variable_base::variable_base(jit_kernel& krnl, const shared_reg& addr) + : _kernel(krnl), + _addr(addr) {} -template -variable_base::variable_base(const variable_base & rhs) - : _kernel(rhs._kernel) - , _addr(rhs._addr) { -} +template +variable_base::variable_base(const variable_base& rhs) : _kernel(rhs._kernel), + _addr(rhs._addr) {} -template -variable_base::variable_base(variable_base && rhs) - : _kernel(rhs._kernel) - , _addr(std::move(rhs._addr)) { -} +template +variable_base::variable_base(variable_base&& rhs) : _kernel(rhs._kernel), + _addr(std::move(rhs._addr)) {} -template -variable::variable(jit_kernel & krnl) - : base(krnl, make_shared(krnl.reserve::type>(), krnl)) { -} +template +variable::variable(jit_kernel& krnl) + : base(krnl, make_shared(krnl.reserve::type>(), krnl)) {} -template -variable::variable(jit_kernel & krnl, const shared_reg & reg) - : base(krnl, reg) { -} +template +variable::variable(jit_kernel& krnl, const shared_reg& reg) : base(krnl, reg) {} -template -variable::variable(jit_kernel & krnl, const shared_reg & reg) - : base(krnl, reg) { -} +template +variable::variable(jit_kernel& krnl, const shared_reg& reg) : base(krnl, reg) {} -template -const variable & variable::operator = (const variable & rhs) const { - const auto & addr_frame = base::_kernel.address_frame(sizeof(T)); +template +const variable& variable::operator=(const variable& rhs) const { + const auto& addr_frame = base::_kernel.address_frame(sizeof(T)); base::_kernel.mov(addr_frame[base::reg()], rhs); return *this; } -template -variable::variable(jit_kernel & krnl) - : base(krnl, make_shared(krnl.reserve::type>(), krnl)) { -} +template +variable::variable(jit_kernel& krnl) + : base(krnl, make_shared(krnl.reserve::type>(), krnl)) {} -template -variable::variable(jit_kernel & krnl, const shared_reg & reg) - : base(krnl, reg) { -} +template +variable::variable(jit_kernel& krnl, const shared_reg& reg) : base(krnl, reg) {} -} // namespace internal +} // namespace internal -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/kernels/x64/jit_kernel_base.cpp b/src/plugins/intel_cpu/src/nodes/kernels/x64/jit_kernel_base.cpp index 8fd3a966e13887..ffc0286431b279 100644 --- a/src/plugins/intel_cpu/src/nodes/kernels/x64/jit_kernel_base.cpp +++ b/src/plugins/intel_cpu/src/nodes/kernels/x64/jit_kernel_base.cpp @@ -10,14 +10,11 @@ namespace ov { namespace intel_cpu { namespace kernel { -JitKernelBase::JitKernelBase(const char* name, x64::cpu_isa_t isa) - : x64::jit_generator(name, isa), m_isa(isa) { +JitKernelBase::JitKernelBase(const char* name, 
x64::cpu_isa_t isa) : x64::jit_generator(name, isa), m_isa(isa) { vlen = x64::isa_max_vlen(isa); } -void JitKernelBase::uni_vfmsub132ps(const Xbyak::Xmm& v_dst, - const Xbyak::Xmm& v_src, - const Xbyak::Operand& op) { +void JitKernelBase::uni_vfmsub132ps(const Xbyak::Xmm& v_dst, const Xbyak::Xmm& v_src, const Xbyak::Operand& op) { if (isValidIsa(x64::avx2)) { vfmsub132ps(v_dst, v_src, op); } else if (isValidIsa(x64::avx)) { @@ -31,9 +28,7 @@ void JitKernelBase::uni_vfmsub132ps(const Xbyak::Xmm& v_dst, } } -void JitKernelBase::uni_vfnmadd132ps(const Xbyak::Xmm& v_dst, - const Xbyak::Xmm& v_src, - const Xbyak::Operand& op) { +void JitKernelBase::uni_vfnmadd132ps(const Xbyak::Xmm& v_dst, const Xbyak::Xmm& v_src, const Xbyak::Operand& op) { if (isValidIsa(x64::avx2)) { vfnmadd132ps(v_dst, v_src, op); } else if (isValidIsa(x64::avx)) { @@ -48,9 +43,7 @@ void JitKernelBase::uni_vfnmadd132ps(const Xbyak::Xmm& v_dst, } } -void JitKernelBase::uni_vfmsub231ps(const Xbyak::Xmm& v_dst, - const Xbyak::Xmm& v_src, - const Xbyak::Operand& op) { +void JitKernelBase::uni_vfmsub231ps(const Xbyak::Xmm& v_dst, const Xbyak::Xmm& v_src, const Xbyak::Operand& op) { if (isValidIsa(x64::avx2)) { vfmsub231ps(v_dst, v_src, op); } else if (isValidIsa(x64::avx)) { @@ -65,9 +58,7 @@ void JitKernelBase::uni_vfmsub231ps(const Xbyak::Xmm& v_dst, } } -void JitKernelBase::uni_vpaddd(const Xbyak::Ymm& v_dst, - const Xbyak::Ymm& v_src, - const Xbyak::Operand& op) { +void JitKernelBase::uni_vpaddd(const Xbyak::Ymm& v_dst, const Xbyak::Ymm& v_src, const Xbyak::Operand& op) { if (isValidIsa(x64::avx2)) { vpaddd(v_dst, v_src, op); } else if (isValidIsa(x64::avx)) { @@ -99,9 +90,7 @@ void JitKernelBase::uni_vpaddd(const Xbyak::Ymm& v_dst, } } -void JitKernelBase::uni_vpaddq(const Xbyak::Xmm& v_dst, - const Xbyak::Xmm& v_src, - const Xbyak::Operand& op) { +void JitKernelBase::uni_vpaddq(const Xbyak::Xmm& v_dst, const Xbyak::Xmm& v_src, const Xbyak::Operand& op) { if (isValidIsa(x64::avx2)) { vpaddq(v_dst, v_src, op); } else { @@ -112,9 +101,7 @@ void JitKernelBase::uni_vpaddq(const Xbyak::Xmm& v_dst, } } -void JitKernelBase::uni_vpsubd(const Xbyak::Ymm& v_dst, - const Xbyak::Ymm& v_src, - const Xbyak::Operand& op) { +void JitKernelBase::uni_vpsubd(const Xbyak::Ymm& v_dst, const Xbyak::Ymm& v_src, const Xbyak::Operand& op) { if (isValidIsa(x64::avx2)) { vpsubd(v_dst, v_src, op); } else if (isValidIsa(x64::avx)) { @@ -146,9 +133,7 @@ void JitKernelBase::uni_vpsubd(const Xbyak::Ymm& v_dst, } } -void JitKernelBase::uni_vsubpd(const Xbyak::Xmm& v_dst, - const Xbyak::Xmm& v_src, - const Xbyak::Operand& op) { +void JitKernelBase::uni_vsubpd(const Xbyak::Xmm& v_dst, const Xbyak::Xmm& v_src, const Xbyak::Operand& op) { if (isValidIsa(x64::avx)) { vsubpd(v_dst, v_src, op); } else { @@ -159,9 +144,7 @@ void JitKernelBase::uni_vsubpd(const Xbyak::Xmm& v_dst, } } -void JitKernelBase::uni_vmulpd(const Xbyak::Xmm& v_dst, - const Xbyak::Xmm& v_src, - const Xbyak::Operand& op) { +void JitKernelBase::uni_vmulpd(const Xbyak::Xmm& v_dst, const Xbyak::Xmm& v_src, const Xbyak::Operand& op) { if (isValidIsa(x64::avx)) { vmulpd(v_dst, v_src, op); } else { @@ -172,9 +155,7 @@ void JitKernelBase::uni_vmulpd(const Xbyak::Xmm& v_dst, } } -void JitKernelBase::uni_vpmuludq(const Xbyak::Xmm& v_dst, - const Xbyak::Xmm& v_src, - const Xbyak::Operand& op) { +void JitKernelBase::uni_vpmuludq(const Xbyak::Xmm& v_dst, const Xbyak::Xmm& v_src, const Xbyak::Operand& op) { if (isValidIsa(x64::avx2)) { vpmuludq(v_dst, v_src, op); } else { @@ -185,9 +166,7 @@ void 
JitKernelBase::uni_vpmuludq(const Xbyak::Xmm& v_dst, } } -void JitKernelBase::uni_vdivps(const Xbyak::Xmm& v_dst, - const Xbyak::Operand& op1, - const Xbyak::Operand& op2) { +void JitKernelBase::uni_vdivps(const Xbyak::Xmm& v_dst, const Xbyak::Operand& op1, const Xbyak::Operand& op2) { if (isValidIsa(x64::avx)) { vdivps(v_dst, op1, op2); } else { @@ -198,9 +177,7 @@ void JitKernelBase::uni_vdivps(const Xbyak::Xmm& v_dst, } } -void JitKernelBase::uni_vdivpd(const Xbyak::Xmm& v_dst, - const Xbyak::Xmm& v_src, - const Xbyak::Operand& op) { +void JitKernelBase::uni_vdivpd(const Xbyak::Xmm& v_dst, const Xbyak::Xmm& v_src, const Xbyak::Operand& op) { if (isValidIsa(x64::avx)) { vdivpd(v_dst, v_src, op); } else { @@ -211,9 +188,7 @@ void JitKernelBase::uni_vdivpd(const Xbyak::Xmm& v_dst, } } -void JitKernelBase::uni_vandps(const Xbyak::Xmm& v_dst, - const Xbyak::Xmm& vSrs, - const Xbyak::Operand &op) { +void JitKernelBase::uni_vandps(const Xbyak::Xmm& v_dst, const Xbyak::Xmm& vSrs, const Xbyak::Operand& op) { if (isValidIsa(x64::avx)) { vandps(v_dst, vSrs, op); } else { @@ -224,9 +199,7 @@ void JitKernelBase::uni_vandps(const Xbyak::Xmm& v_dst, } } -void JitKernelBase::uni_vandnps(const Xbyak::Xmm& v_dst, - const Xbyak::Xmm& vSrs, - const Xbyak::Operand &op) { +void JitKernelBase::uni_vandnps(const Xbyak::Xmm& v_dst, const Xbyak::Xmm& vSrs, const Xbyak::Operand& op) { if (isValidIsa(x64::avx)) { vandnps(v_dst, vSrs, op); } else { @@ -237,9 +210,9 @@ void JitKernelBase::uni_vandnps(const Xbyak::Xmm& v_dst, } } -void JitKernelBase::gatherdd(const Xbyak::Xmm& v_dst, - const Xbyak::Reg64& rSrcPtr, - const Xbyak::Xmm& vSrcShift, +void JitKernelBase::gatherdd(const Xbyak::Xmm& v_dst, + const Xbyak::Reg64& rSrcPtr, + const Xbyak::Xmm& vSrcShift, const Xbyak::Opmask& kReadMask, const bool useMask, const bool zeroFill) { @@ -254,17 +227,18 @@ void JitKernelBase::gatherdd(const Xbyak::Xmm& v_dst, vpgatherdd(v_dst | kReadMask, ptr[rSrcPtr + vSrcShift]); } -void JitKernelBase::gatherdd(const Xbyak::Xmm& v_dst, +void JitKernelBase::gatherdd(const Xbyak::Xmm& v_dst, const Xbyak::Reg64& rSrcPtr, - const Xbyak::Xmm& vSrcShift, - const Xbyak::Xmm& vReadMask, + const Xbyak::Xmm& vSrcShift, + const Xbyak::Xmm& vReadMask, const bool useMask, const bool zeroFill) { - if (v_dst.getIdx() == vSrcShift.getIdx() || v_dst.getIdx() == vReadMask.getIdx() || vSrcShift.getIdx() == vReadMask.getIdx()) { + if (v_dst.getIdx() == vSrcShift.getIdx() || v_dst.getIdx() == vReadMask.getIdx() || + vSrcShift.getIdx() == vReadMask.getIdx()) { OPENVINO_THROW("Any pair of the index, mask, or destination registers cannot be the same."); } if (zeroFill) - pxor(v_dst, v_dst); // Don't use vpxor. It zeros the rest of the YMM register. + pxor(v_dst, v_dst); // Don't use vpxor. It zeros the rest of the YMM register. 
if (isValidIsa(x64::avx2)) { if (!useMask) @@ -280,7 +254,7 @@ void JitKernelBase::gatherdd(const Xbyak::Xmm& v_dst, Xbyak::Label lLoopNext; if (useMask) { uni_vpextrd(r32Aux, vReadMask, i); - cmp(r32Aux, 0); // TODO: check significant bit + cmp(r32Aux, 0); // TODO: check significant bit je(lLoopNext, T_NEAR); } uni_vpextrd(r32Aux, vSrcShift, i); @@ -292,13 +266,14 @@ void JitKernelBase::gatherdd(const Xbyak::Xmm& v_dst, } } -void JitKernelBase::gatherdd(const Xbyak::Ymm& v_dst, +void JitKernelBase::gatherdd(const Xbyak::Ymm& v_dst, const Xbyak::Reg64& rSrcPtr, - const Xbyak::Ymm& vSrcShift, - const Xbyak::Ymm& vReadMask, + const Xbyak::Ymm& vSrcShift, + const Xbyak::Ymm& vReadMask, const bool useMask, const bool zeroFill) { - if (v_dst.getIdx() == vSrcShift.getIdx() || v_dst.getIdx() == vReadMask.getIdx() || vSrcShift.getIdx() == vReadMask.getIdx()) { + if (v_dst.getIdx() == vSrcShift.getIdx() || v_dst.getIdx() == vReadMask.getIdx() || + vSrcShift.getIdx() == vReadMask.getIdx()) { OPENVINO_THROW("Any pair of the index, mask, or destination registers cannot be the same."); } if (isValidIsa(x64::avx2)) { @@ -309,8 +284,7 @@ void JitKernelBase::gatherdd(const Xbyak::Ymm& v_dst, vpgatherdd(v_dst, ptr[rSrcPtr + vSrcShift], vReadMask); } else { - Xbyak::Xmm xmmDst = Xbyak::Xmm(v_dst.getIdx()), - xmmSrcShft = Xbyak::Xmm(vSrcShift.getIdx()), + Xbyak::Xmm xmmDst = Xbyak::Xmm(v_dst.getIdx()), xmmSrcShft = Xbyak::Xmm(vSrcShift.getIdx()), xmmReadMask = Xbyak::Xmm(vReadMask.getIdx()); for (uint8_t i = 0; i < 2; i++) { gatherdd(xmmDst, rSrcPtr, xmmSrcShft, xmmReadMask, useMask, zeroFill); @@ -323,7 +297,7 @@ void JitKernelBase::gatherdd(const Xbyak::Ymm& v_dst, } } -void JitKernelBase::uni_vpbroadcastq(const Xbyak::Xmm &x, const Xbyak::Operand &op) { +void JitKernelBase::uni_vpbroadcastq(const Xbyak::Xmm& x, const Xbyak::Operand& op) { if (isValidIsa(x64::avx2)) { vpbroadcastq(x, op); } else { @@ -332,7 +306,7 @@ void JitKernelBase::uni_vpbroadcastq(const Xbyak::Xmm &x, const Xbyak::Operand & } } -void JitKernelBase::uni_vpbroadcastd(const Xbyak::Xmm &x, const Xbyak::Operand &op) { +void JitKernelBase::uni_vpbroadcastd(const Xbyak::Xmm& x, const Xbyak::Operand& op) { if (isValidIsa(x64::avx2)) { vpbroadcastd(x, op); } else if (isValidIsa(x64::avx)) { @@ -348,7 +322,7 @@ void JitKernelBase::uni_vpbroadcastd(const Xbyak::Xmm &x, const Xbyak::Operand & } } -void JitKernelBase::uni_vpbroadcastd(const Xbyak::Ymm &x, const Xbyak::Operand &op) { +void JitKernelBase::uni_vpbroadcastd(const Xbyak::Ymm& x, const Xbyak::Operand& op) { if (isValidIsa(x64::avx2)) { vpbroadcastd(x, op); } else { @@ -375,8 +349,7 @@ void JitKernelBase::uni_vroundpd(const Xbyak::Xmm& v_dst, const Xbyak::Operand& } } -void JitKernelBase::uni_vcvtdq2pd(const Xbyak::Xmm& v_dst, - const Xbyak::Operand& op) { +void JitKernelBase::uni_vcvtdq2pd(const Xbyak::Xmm& v_dst, const Xbyak::Operand& op) { if (isValidIsa(x64::avx)) { vcvtdq2pd(v_dst, op); } else { @@ -384,8 +357,7 @@ void JitKernelBase::uni_vcvtdq2pd(const Xbyak::Xmm& v_dst, } } -void JitKernelBase::uni_vcvtpd2dq(const Xbyak::Xmm& v_dst, - const Xbyak::Operand& op) { +void JitKernelBase::uni_vcvtpd2dq(const Xbyak::Xmm& v_dst, const Xbyak::Operand& op) { if (isValidIsa(x64::avx)) { vcvtpd2dq(v_dst, op); } else { @@ -393,8 +365,7 @@ void JitKernelBase::uni_vcvtpd2dq(const Xbyak::Xmm& v_dst, } } -void JitKernelBase::uni_vpmovzxdq(const Xbyak::Xmm& v_dst, - const Xbyak::Operand& op) { +void JitKernelBase::uni_vpmovzxdq(const Xbyak::Xmm& v_dst, const Xbyak::Operand& op) { if 
(isValidIsa(x64::avx2)) { vpmovzxdq(v_dst, op); } else { @@ -416,8 +387,7 @@ void JitKernelBase::uni_vshufpd(const Xbyak::Xmm& v_dst, } } -void JitKernelBase::fillRestWorkMask(const Xbyak::Opmask& dstMask, - const Xbyak::Reg64& rWorkRest) { +void JitKernelBase::fillRestWorkMask(const Xbyak::Opmask& dstMask, const Xbyak::Reg64& rWorkRest) { auto rOnes = getReg64(); mov(rOnes, 0xFFFFFFFFFFFFFFFF); @@ -493,11 +463,11 @@ void JitKernelBase::fillRestWorkMask(const Xbyak::Ymm& ymmDstMask, L(lEnd); } -void JitKernelBase::load(const Xbyak::Xmm& v_dst, +void JitKernelBase::load(const Xbyak::Xmm& v_dst, const Xbyak::Address& srcAddr, - const Xbyak::Reg64& rLoadNum, - const size_t typeSize, - const bool zeroFilling) { + const Xbyak::Reg64& rLoadNum, + const size_t typeSize, + const bool zeroFilling) { if (!one_of(typeSize, 1u, 2u, 4u, 8u)) { OPENVINO_THROW("Could not load data with type size ", typeSize); } @@ -523,11 +493,11 @@ void JitKernelBase::load(const Xbyak::Xmm& v_dst, L(lEnd); } -void JitKernelBase::load(const Xbyak::Ymm& v_dst, +void JitKernelBase::load(const Xbyak::Ymm& v_dst, const Xbyak::Address& srcAddr, - const Xbyak::Reg64& rLoadNum, - const size_t typeSize, - const bool zeroFilling) { + const Xbyak::Reg64& rLoadNum, + const size_t typeSize, + const bool zeroFilling) { if (!one_of(typeSize, 1u, 2u, 4u, 8u)) { OPENVINO_THROW("Could not load data with type size ", typeSize); } @@ -564,9 +534,9 @@ void JitKernelBase::load(const Xbyak::Ymm& v_dst, } void JitKernelBase::store(const Xbyak::Address& dstAddr, - const Xbyak::Xmm& v_src, - const Xbyak::Reg64& rToStoreNum, - const size_t typeSize) { + const Xbyak::Xmm& v_src, + const Xbyak::Reg64& rToStoreNum, + const size_t typeSize) { if (!one_of(typeSize, 1u, 2u, 4u, 8u)) { OPENVINO_THROW("Could not store data with type size ", typeSize); } @@ -592,9 +562,9 @@ void JitKernelBase::store(const Xbyak::Address& dstAddr, } void JitKernelBase::store(const Xbyak::Address& dstAddr, - const Xbyak::Ymm& v_src, - const Xbyak::Reg64& rToStoreNum, - const size_t typeSize) { + const Xbyak::Ymm& v_src, + const Xbyak::Reg64& rToStoreNum, + const size_t typeSize) { if (!one_of(typeSize, 1u, 2u, 4u, 8u)) { OPENVINO_THROW("Could not store data with type size ", typeSize); } @@ -631,11 +601,11 @@ void JitKernelBase::store(const Xbyak::Address& dstAddr, void JitKernelBase::memMovDD(const Xbyak::Reg64& rDst, const Xbyak::Reg64& rSrc, - const Xbyak::Xmm& vReadMask, - const Xbyak::Xmm& vSrcShift, + const Xbyak::Xmm& vReadMask, + const Xbyak::Xmm& vSrcShift, const Xbyak::Reg64& rToStoreNum, - const bool useMask, - const bool zeroFill) { + const bool useMask, + const bool zeroFill) { Xbyak::Label lEnd; auto rAux = getReg64(); Xbyak::Reg32 r32Aux = Xbyak::Reg32(rAux.getIdx()); @@ -671,11 +641,11 @@ void JitKernelBase::memMovDD(const Xbyak::Reg64& rDst, void JitKernelBase::memMovDD(const Xbyak::Reg64& rDst, const Xbyak::Reg64& rSrc, - const Xbyak::Ymm& vReadMask, - const Xbyak::Ymm& vSrcShift, + const Xbyak::Ymm& vReadMask, + const Xbyak::Ymm& vSrcShift, const Xbyak::Reg64& rToStoreNum, - const bool useMask, - const bool zeroFill) { + const bool useMask, + const bool zeroFill) { Xbyak::Label lEnd; if (isValidIsa(x64::avx2)) { auto vAux = RegistersPool::Reg(registersPool); @@ -684,8 +654,7 @@ void JitKernelBase::memMovDD(const Xbyak::Reg64& rDst, } else if (isValidIsa(x64::avx)) { const uint8_t typeSize = sizeof(int); const uint8_t elPerXmm = x64::cpu_isa_traits::vlen / typeSize; - Xbyak::Xmm xmmReadMask = Xbyak::Xmm(vReadMask.getIdx()), - xmmSrcShft = 
Xbyak::Xmm(vSrcShift.getIdx()); + Xbyak::Xmm xmmReadMask = Xbyak::Xmm(vReadMask.getIdx()), xmmSrcShft = Xbyak::Xmm(vSrcShift.getIdx()); for (uint8_t i = 0; i < 2; i++) { memMovDD(rDst, rSrc, xmmReadMask, xmmSrcShft, rToStoreNum, useMask, zeroFill); @@ -707,6 +676,6 @@ void JitKernelBase::memMovDD(const Xbyak::Reg64& rDst, L(lEnd); } -} // namespace kernel -} // namespace intel_cpu -} // namespace ov +} // namespace kernel +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/kernels/x64/jit_kernel_base.hpp b/src/plugins/intel_cpu/src/nodes/kernels/x64/jit_kernel_base.hpp index 260d7196331a7f..eee4ff4d8c0708 100644 --- a/src/plugins/intel_cpu/src/nodes/kernels/x64/jit_kernel_base.hpp +++ b/src/plugins/intel_cpu/src/nodes/kernels/x64/jit_kernel_base.hpp @@ -7,9 +7,9 @@ #include "openvino/core/visibility.hpp" #if defined(OPENVINO_ARCH_X86_64) -#include "cpu/x64/jit_generator.hpp" -#include "registers_pool.hpp" -#endif // OPENVINO_ARCH_X86_64 +# include "cpu/x64/jit_generator.hpp" +# include "registers_pool.hpp" +#endif // OPENVINO_ARCH_X86_64 namespace ov { namespace intel_cpu { @@ -19,18 +19,22 @@ class JitKernelBase; #if defined(OPENVINO_ARCH_X86_64) -#define getReg64() RegistersPool::Reg(registersPool) -#define getReg32() RegistersPool::Reg(registersPool) -#define getVmm() RegistersPool::Reg(registersPool) -#define getMask() RegistersPool::Reg(registersPool) +# define getReg64() RegistersPool::Reg(registersPool) +# define getReg32() RegistersPool::Reg(registersPool) +# define getVmm() RegistersPool::Reg(registersPool) +# define getMask() RegistersPool::Reg(registersPool) -class JitKernelBase: public dnnl::impl::cpu::x64::jit_generator { +class JitKernelBase : public dnnl::impl::cpu::x64::jit_generator { public: JitKernelBase(const char* name, dnnl::impl::cpu::x64::cpu_isa_t max_cpu_isa); - dnnl::impl::cpu::x64::cpu_isa_t getIsa() { return m_isa; } + dnnl::impl::cpu::x64::cpu_isa_t getIsa() { + return m_isa; + } - size_t getVectorLen() { return vlen; } + size_t getVectorLen() { + return vlen; + } void uni_vfmsub132ps(const Xbyak::Xmm& vDst, const Xbyak::Xmm& vSrc, const Xbyak::Operand& op); @@ -62,9 +66,9 @@ class JitKernelBase: public dnnl::impl::cpu::x64::jit_generator { void uni_vdivpd(const Xbyak::Xmm& v_dst, const Xbyak::Xmm& v_src, const Xbyak::Operand& op2); - void uni_vandps(const Xbyak::Xmm& vDst, const Xbyak::Xmm& vSrs, const Xbyak::Operand &op); + void uni_vandps(const Xbyak::Xmm& vDst, const Xbyak::Xmm& vSrs, const Xbyak::Operand& op); - void uni_vandnps(const Xbyak::Xmm& vDst, const Xbyak::Xmm& vSrs, const Xbyak::Operand &op); + void uni_vandnps(const Xbyak::Xmm& vDst, const Xbyak::Xmm& vSrs, const Xbyak::Operand& op); void uni_kmovd(const Xbyak::Opmask& kDst, const Xbyak::Opmask& kSrc) { kmovd(kDst, kSrc); @@ -82,11 +86,11 @@ class JitKernelBase: public dnnl::impl::cpu::x64::jit_generator { uni_vandps(kDst, kSrc1, kSrc2); } - void uni_vpbroadcastd(const Xbyak::Xmm &x, const Xbyak::Operand &op); + void uni_vpbroadcastd(const Xbyak::Xmm& x, const Xbyak::Operand& op); - void uni_vpbroadcastd(const Xbyak::Ymm &x, const Xbyak::Operand &op); + void uni_vpbroadcastd(const Xbyak::Ymm& x, const Xbyak::Operand& op); - void uni_vpbroadcastq(const Xbyak::Xmm &x, const Xbyak::Operand &op); + void uni_vpbroadcastq(const Xbyak::Xmm& x, const Xbyak::Operand& op); void uni_vroundpd(const Xbyak::Xmm& v_dst, const Xbyak::Operand& op, const uint8_t imm); @@ -98,76 +102,71 @@ class JitKernelBase: public dnnl::impl::cpu::x64::jit_generator { void 
uni_vshufpd(const Xbyak::Xmm& v_dst, const Xbyak::Xmm& v_srs, const Xbyak::Operand& op, uint8_t imm); - void gatherdd(const Xbyak::Xmm& vDst, - const Xbyak::Reg64& rSrcPtr, - const Xbyak::Xmm& vSrcShift, + void gatherdd(const Xbyak::Xmm& vDst, + const Xbyak::Reg64& rSrcPtr, + const Xbyak::Xmm& vSrcShift, const Xbyak::Opmask& kReadMask, - const bool useMask = true, - const bool zeroFill = false); + const bool useMask = true, + const bool zeroFill = false); - void gatherdd(const Xbyak::Xmm& vDst, + void gatherdd(const Xbyak::Xmm& vDst, const Xbyak::Reg64& rSrcPtr, - const Xbyak::Xmm& vSrcShift, - const Xbyak::Xmm& vReadMask, - const bool useMask = true, + const Xbyak::Xmm& vSrcShift, + const Xbyak::Xmm& vReadMask, + const bool useMask = true, const bool zeroFill = false); - void gatherdd(const Xbyak::Ymm& vDst, + void gatherdd(const Xbyak::Ymm& vDst, const Xbyak::Reg64& rSrcPtr, - const Xbyak::Ymm& vSrcShift, - const Xbyak::Ymm& vReadMask, - const bool useMask = true, + const Xbyak::Ymm& vSrcShift, + const Xbyak::Ymm& vReadMask, + const bool useMask = true, const bool zeroFill = false); - void fillRestWorkMask(const Xbyak::Opmask& kDstMask, - const Xbyak::Reg64& rWorkRest); + void fillRestWorkMask(const Xbyak::Opmask& kDstMask, const Xbyak::Reg64& rWorkRest); - void fillRestWorkMask(const Xbyak::Xmm& ymmDstMask, - const Xbyak::Reg64& rWorkRest, - const uint64_t typeSize = 4); + void fillRestWorkMask(const Xbyak::Xmm& ymmDstMask, const Xbyak::Reg64& rWorkRest, const uint64_t typeSize = 4); - void fillRestWorkMask(const Xbyak::Ymm& ymmDstMask, - const Xbyak::Reg64& rWorkRest, - const uint64_t typeSize = 4); + void fillRestWorkMask(const Xbyak::Ymm& ymmDstMask, const Xbyak::Reg64& rWorkRest, const uint64_t typeSize = 4); - void load(const Xbyak::Xmm& vDst, + void load(const Xbyak::Xmm& vDst, const Xbyak::Address& srcAddr, - const Xbyak::Reg64& rLoadNum, - const size_t typeSize, + const Xbyak::Reg64& rLoadNum, + const size_t typeSize, const bool zeroFill = false); - void load(const Xbyak::Ymm& vDst, + void load(const Xbyak::Ymm& vDst, const Xbyak::Address& srcAddr, - const Xbyak::Reg64& rLoadNum, - const size_t typeSize, + const Xbyak::Reg64& rLoadNum, + const size_t typeSize, const bool zeroFill = false); void store(const Xbyak::Address& dstAddr, - const Xbyak::Xmm& vSrc, - const Xbyak::Reg64& rToStoreNum, - const size_t typeSize); + const Xbyak::Xmm& vSrc, + const Xbyak::Reg64& rToStoreNum, + const size_t typeSize); void store(const Xbyak::Address& dstAddr, - const Xbyak::Ymm& vSrc, - const Xbyak::Reg64& rToStoreNum, - const size_t typeSize); + const Xbyak::Ymm& vSrc, + const Xbyak::Reg64& rToStoreNum, + const size_t typeSize); // Makes gather from memory under the vReadMask and writes to the memory m128. void memMovDD(const Xbyak::Reg64& rDst, const Xbyak::Reg64& rSrc, - const Xbyak::Xmm& vReadMask, - const Xbyak::Xmm& vSrcShift, + const Xbyak::Xmm& vReadMask, + const Xbyak::Xmm& vSrcShift, const Xbyak::Reg64& rToStoreCounter, - const bool useMask = true, + const bool useMask = true, const bool zeroFill = false); // Makes gather from the memory under the vReadMask and writes to the memory m256. 
void memMovDD(const Xbyak::Reg64& rDst, const Xbyak::Reg64& rSrc, - const Xbyak::Ymm& vReadMask, - const Xbyak::Ymm& vSrcShift, + const Xbyak::Ymm& vReadMask, + const Xbyak::Ymm& vSrcShift, const Xbyak::Reg64& rToStoreCounter, - const bool useMask = true, + const bool useMask = true, const bool zeroFill = false); protected: @@ -181,32 +180,37 @@ class JitKernelBase: public dnnl::impl::cpu::x64::jit_generator { enum { // Comparison predicate operand (immediate byte) for single-precision floating-point values. - CMP_EQ_PS = 0, // Equal (ordered, non-signaling) - CMP_LT_PS, // Less-than (ordered, signaling) - CMP_LE_PS, // Less-than-or-equal (ordered, signaling) - CMP_UNORD_PS, // Unordered (non-signaling) - CMP_NEQ_PS, // Not-equal (unordered, non-signaling) - CMP_NLT_PS, // Not-less-than (unordered, signaling) - CMP_NLE_PS, // Not-less-than-or-equal (unordered, signaling) - CMP_ORD_PS // Ordered (non-signaling) + CMP_EQ_PS = 0, // Equal (ordered, non-signaling) + CMP_LT_PS, // Less-than (ordered, signaling) + CMP_LE_PS, // Less-than-or-equal (ordered, signaling) + CMP_UNORD_PS, // Unordered (non-signaling) + CMP_NEQ_PS, // Not-equal (unordered, non-signaling) + CMP_NLT_PS, // Not-less-than (unordered, signaling) + CMP_NLE_PS, // Not-less-than-or-equal (unordered, signaling) + CMP_ORD_PS // Ordered (non-signaling) }; }; -template +template class JitKernel : public JitKernelBase { public: - using KernelFunc = void (*)(const CallArgs *); + using KernelFunc = void (*)(const CallArgs*); explicit JitKernel(const char* name, const CompileParams& jcp, dnnl::impl::cpu::x64::cpu_isa_t max_cpu_isa) - : JitKernelBase{name, max_cpu_isa}, m_jcp{jcp}, m_func{nullptr} {} + : JitKernelBase{name, max_cpu_isa}, + m_jcp{jcp}, + m_func{nullptr} {} ~JitKernel() override = default; dnnl::impl::status_t create_kernel() override { const dnnl::impl::status_t code = jit_generator::create_kernel(); if (code != dnnl::impl::status::success) { - OPENVINO_THROW("Could not create kernel. Error code: ", std::to_string(code), ". ", - "Xbyak error code: ", Xbyak::ConvertErrorToString(Xbyak::GetError())); + OPENVINO_THROW("Could not create kernel. Error code: ", + std::to_string(code), + ". ", + "Xbyak error code: ", + Xbyak::ConvertErrorToString(Xbyak::GetError())); } m_func = (decltype(m_func))jit_ker(); return code; @@ -221,21 +225,21 @@ class JitKernel : public JitKernelBase { this->operator()(&args); } - template class KernelT> + template